RubyGems - schema_sherlock - Version diff - 0.1.0 → 0.1.1 - Mend

schema_sherlock 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

checksums.yaml +4 -4
data/CHANGELOG.md +10 -1
data/Gemfile.lock +4 -1
data/README.md +2 -0
data/lib/schema_sherlock/analyzers/foreign_key_detector.rb +4 -5
data/lib/schema_sherlock/analyzers/index_recommendation_detector.rb +44 -0
data/lib/schema_sherlock/binary_index.rb +113 -0
data/lib/schema_sherlock/commands/analyze_command.rb +59 -2
data/lib/schema_sherlock/file_cache.rb +189 -0
data/lib/schema_sherlock/indexed_usage_tracker.rb +19 -0
data/lib/schema_sherlock/optimized_scanner.rb +167 -0
data/lib/schema_sherlock/performance_optimizer.rb +111 -0
data/lib/schema_sherlock/schema_cache.rb +118 -0
data/lib/schema_sherlock/usage_tracker.rb +37 -23
data/lib/schema_sherlock/version.rb +1 -1
data/schema_sherlock.gemspec +2 -0
metadata +36 -1

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 5c7488ddd91bb0bb1721c72fc0d3e5045461274a68ce2fea71d0ebe280f1c627
-  data.tar.gz: 400b9b670b80856f543659a55e24e2d138509f578125af5b2821b96fd766d17f
+  metadata.gz: 5c1ac4a467622a053245b33bb795bd7a08b4cc4408cf99c9a74088f29e46c948
+  data.tar.gz: af8c4a4e1700ed94300bc6e95f72d70218477ca93de0049699ac94d08ad2807a
 SHA512:
-  metadata.gz: 61fb4224786228806820ebf244672a8d493fa31a6561d8d9ce4f85ed9bdd03f9f16e869653a3dc0275d61b4c5d7e3fff083538fdd383c6887b2f05b793811221
-  data.tar.gz: 0234a850f322b90c9479542affc4ae8fae40df67c38395fd580f1f72ff683e2a17dee230d4fa918e336968a718cb4ad996165e59253adca3dc9f62b9acca0afe
+  metadata.gz: fe035c9e89e93ed74870d7f6adbc8d5f1596b8f0f6d5fbe6178a0628aa5f124b56e353501e73c792a4e31b0879777bb0ac0d1703207e52d7cf0932649ab1cb61
+  data.tar.gz: f80d18cc2a292c8c85c401dd359980e362dce3666c9bd516e75c0f9acdada5005f0d6845a6734cb7d729da4f490a9b3db6140692a6f1b798aae2badcdd5a85e3

data/CHANGELOG.md CHANGED

@@ -16,4 +16,13 @@
 - Filters suggestions based on actual usage frequency
 - Provides detailed analysis reports
 - Supports complex foreign key types (integer, bigint, UUID, string)
-- Smart table and model inference
+- Smart table and model inference
+## [0.1.1] - 2025-01-24
+### Features
+- Adds recommendations for missing indices for foreign keys
+### Bugs and improvements
+- Make analysis 80% fast
+- Safe load models to avoid break constraint errors

data/Gemfile.lock CHANGED

@@ -1,8 +1,10 @@
 PATH
   remote: .
   specs:
-    schema_sherlock (0.1.0)
+    schema_sherlock (0.1.1)
       activerecord (>= 6.0)
+      concurrent-ruby (~> 1.0)
+      msgpack (~> 1.0)
       rails (>= 6.0)
       thor (~> 1.0)
@@ -113,6 +115,7 @@ GEM
     marcel (1.0.4)
     mini_mime (1.1.5)
     minitest (5.25.5)
+    msgpack (1.8.0)
     net-imap (0.5.8)
       date
       net-protocol

data/README.md CHANGED

@@ -1,5 +1,7 @@
 # SchemaSherlock
+[![Gem Version](https://badge.fury.io/rb/schema_sherlock.svg?icon=si%3Arubygems)](https://badge.fury.io/rb/schema_sherlock)
 Intelligent Rails model analysis and annotation tool that extends beyond traditional schema annotation to provide intelligent analysis and actionable suggestions for Rails model code quality, performance, and maintainability.
 ## Installation

data/lib/schema_sherlock/analyzers/foreign_key_detector.rb CHANGED

@@ -1,5 +1,6 @@
 require_relative "base_analyzer"
 require_relative "../usage_tracker"
+require_relative "../schema_cache"
 module SchemaSherlock
   module Analyzers
@@ -77,7 +78,7 @@ module SchemaSherlock
       end
       def table_exists?(table_name)
-        ActiveRecord::Base.connection.table_exists?(table_name)
+        SchemaCache.table_exists?(table_name)
       end
       def valid_foreign_key?(column)
@@ -155,13 +156,11 @@ module SchemaSherlock
         # Fallback method when model class is not available
         # Check if the table has an 'id' column with compatible type
         begin
-          connection = ActiveRecord::Base.connection
-          primary_key_name = connection.primary_key(table_name)
+          primary_key_name = SchemaCache.primary_key(table_name)
           return false unless primary_key_name
-          table_columns = connection.columns(table_name)
-          primary_key_column = table_columns.find { |col| col.name == primary_key_name }
+          primary_key_column = SchemaCache.column(table_name, primary_key_name)
           return false unless primary_key_column

data/lib/schema_sherlock/analyzers/index_recommendation_detector.rb ADDED

@@ -0,0 +1,44 @@
+require_relative "base_analyzer"
+require_relative "../schema_cache"
+module SchemaSherlock
+  module Analyzers
+    class IndexRecommendationDetector < BaseAnalyzer
+      def analyze
+        @results = {
+          missing_foreign_key_indexes: find_missing_foreign_key_indexes,
+        }
+      end
+      private
+      def find_missing_foreign_key_indexes
+        foreign_key_columns.reject do |column|
+          has_index_on_column?(column.name)
+        end.map do |column|
+          {
+            column: column.name,
+            table: table_name,
+            migration: "add_index :#{table_name}, :#{column.name}",
+            reason: "Foreign key without index"
+          }
+        end
+      end
+      def foreign_key_columns
+        @foreign_key_columns ||= columns.select { |col| col.name.end_with?('_id') && col.name != 'id' }
+      end
+      def has_index_on_column?(column_name)
+        existing_indexes.any? do |index|
+          index_columns = Array(index.columns)
+          index_columns.include?(column_name) && index_columns.size == 1
+        end
+      end
+      def existing_indexes
+        @existing_indexes ||= SchemaCache.indexes(table_name)
+      end
+    end
+  end
+end

data/lib/schema_sherlock/binary_index.rb ADDED

@@ -0,0 +1,113 @@
+require 'msgpack'
+require 'digest'
+module SchemaSherlock
+  class BinaryIndex
+    INDEX_VERSION = "1.0"
+    INDEX_FILE = "tmp/.schema_sherlock_index"
+    class << self
+      def load_or_build(root_path)
+        index_path = File.join(root_path, INDEX_FILE)
+        if File.exist?(index_path) && index_valid?(index_path, root_path)
+          load_index(index_path)
+        else
+          build_and_save_index(root_path, index_path)
+        end
+      end
+      def load_index(path)
+        data = File.binread(path)
+        MessagePack.unpack(data, symbolize_keys: true)
+      rescue => e
+        Rails.logger.warn("Failed to load index: #{e.message}") if defined?(Rails)
+        nil
+      end
+      def build_and_save_index(root_path, index_path)
+        index = build_index(root_path)
+        # Binary serialization
+        packed_data = MessagePack.pack(index)
+        File.binwrite(index_path, packed_data)
+        index
+      rescue => e
+        Rails.logger.warn("Failed to save index: #{e.message}") if defined?(Rails)
+        index
+      end
+      private
+      def index_valid?(index_path, root_path)
+        return false unless File.exist?(index_path)
+        index = load_index(index_path)
+        return false unless index && index[:version] == INDEX_VERSION
+        # Check if any files have been modified since index was built
+        index_time = File.mtime(index_path)
+        # If any Ruby file is newer than index, rebuild
+        Dir.glob(File.join(root_path, "**/*.rb")).any? do |file|
+          File.mtime(file) > index_time
+        end == false
+      end
+      def build_index(root_path)
+        index = {
+          version: INDEX_VERSION,
+          created_at: Time.now.to_i,
+          files: {},
+          column_references: {},
+          file_checksums: {}
+        }
+        # Build file index with checksums
+        Dir.glob(File.join(root_path, "**/*.rb")).each do |file|
+          next if should_skip_file?(file)
+          content = File.read(file, encoding: 'UTF-8', invalid: :replace, undef: :replace)
+          checksum = Digest::MD5.hexdigest(content)
+          index[:files][file] = {
+            size: File.size(file),
+            mtime: File.mtime(file).to_i,
+            checksum: checksum
+          }
+          # Pre-scan for common patterns and cache results
+          pre_scan_content(content, file, index)
+        end
+        index
+      end
+      def pre_scan_content(content, file, index)
+        # Pre-scan for column references
+        content.scan(/\.(\w+)_id\b/) do |match|
+          column = "#{match[0]}_id"
+          index[:column_references][column] ||= []
+          index[:column_references][column] << file
+        end
+        # Pre-scan for associations
+        content.scan(/\.(?:joins|includes)\s*\(\s*['":]?(\w+)/) do |match|
+          association = match[0]
+          column = "#{association}_id"
+          index[:column_references][column] ||= []
+          index[:column_references][column] << file
+        end
+      end
+      def should_skip_file?(file)
+        file.include?('/spec/') ||
+        file.include?('/test/') ||
+        file.include?('/vendor/') ||
+        file.include?('/node_modules/')
+      end
+    end
+  end
+end

data/lib/schema_sherlock/commands/analyze_command.rb CHANGED

@@ -1,5 +1,9 @@
 require_relative "base_command"
 require_relative "../analyzers/foreign_key_detector"
+require_relative "../analyzers/index_recommendation_detector"
+require_relative "../schema_cache"
+require_relative "../file_cache"
+require_relative "../binary_index"
 module SchemaSherlock
   module Commands
@@ -7,6 +11,7 @@ module SchemaSherlock
       desc "analyze [MODEL]", "Analyze models for missing associations and optimization opportunities"
       option :output, type: :string, desc: "Output file for analysis results"
       option :min_usage, type: :numeric, desc: "Minimum usage threshold for suggestions (overrides config)"
+      option :use_index, type: :boolean, default: true, desc: "Use binary index for faster analysis (if available)"
       def analyze(model_name = nil)
         load_rails_environment
@@ -20,11 +25,41 @@ module SchemaSherlock
         models = model_name ? [find_model(model_name)] : all_models
         puts "Analyzing #{models.length} model(s)..."
+        # Try to load binary index for faster analysis
+        @binary_index = nil
+        if options[:use_index] && defined?(Rails) && Rails.root
+          puts "Loading binary index..."
+          @binary_index = BinaryIndex.load_or_build(Rails.root.to_s)
+          if @binary_index
+            puts "  Index loaded with #{@binary_index[:files].size} files indexed"
+          end
+        end
+        # Preload metadata cache for performance
+        puts "Preloading database metadata..."
+        cache_stats = SchemaCache.preload_all_metadata
+        puts "  Cached: #{cache_stats[:tables_cached]} tables, #{cache_stats[:columns_cached]} column sets, #{cache_stats[:indexes_cached]} index sets"
+        # Preload file cache for performance (only if usage tracking is enabled and no index available)
+        if SchemaSherlock.configuration.min_usage_threshold && SchemaSherlock.configuration.min_usage_threshold > 0
+          if @binary_index
+            puts "Using binary index for file analysis (skipping file cache preload)"
+          else
+            puts "Preloading file cache..."
+            file_stats = FileCache.preload_all_files
+            puts "  Cached: #{file_stats[:files_scanned]} files (#{(file_stats[:total_size] / 1024.0 / 1024.0).round(2)} MB), #{file_stats[:files_failed]} failed"
+          end
+        end
         results = {}
         models.each do |model|
           puts "  Analyzing #{model.name}..."
+          # Set binary index for usage tracker
+          UsageTracker.binary_index = @binary_index
           analysis = analyze_model(model)
           # Only include models with issues in results
@@ -39,6 +74,10 @@ module SchemaSherlock
         say e.message, :red
         exit 1
       ensure
+        # Clear caches to free memory
+        SchemaCache.clear_cache
+        FileCache.clear_cache
         # Restore original threshold if it was overridden
         if options[:min_usage] && defined?(original_threshold)
           SchemaSherlock.configuration.min_usage_threshold = original_threshold
@@ -49,7 +88,8 @@ module SchemaSherlock
       def analyze_model(model)
         {
-          foreign_key_analysis: run_foreign_key_analysis(model)
+          foreign_key_analysis: run_foreign_key_analysis(model),
+          index_analysis: run_index_analysis(model)
         }
       end
@@ -59,12 +99,21 @@ module SchemaSherlock
         analyzer.results
       end
+      def run_index_analysis(model)
+        analyzer = SchemaSherlock::Analyzers::IndexRecommendationDetector.new(model)
+        analyzer.analyze
+        analyzer.results
+      end
       def has_issues?(analysis)
         foreign_key_analysis = analysis[:foreign_key_analysis]
         missing = foreign_key_analysis[:missing_associations]
         orphaned = foreign_key_analysis[:orphaned_foreign_keys]
-        missing.any? || orphaned.any?
+        index_analysis = analysis[:index_analysis]
+        missing_indexes = index_analysis[:missing_foreign_key_indexes]
+        missing.any? || orphaned.any? || missing_indexes.any?
       end
       def display_results(results, total_models)
@@ -91,6 +140,14 @@ module SchemaSherlock
               puts "    #{key[:column]} -> #{key[:issue]}"
             end
           end
+          missing_indexes = analysis[:index_analysis][:missing_foreign_key_indexes]
+          if missing_indexes.any?
+            puts "  Missing Indexes:"
+            missing_indexes.each do |idx|
+              puts "    #{idx[:migration]} # #{idx[:reason]}"
+            end
+          end
         end
         puts "\n" + "="*50

data/lib/schema_sherlock/file_cache.rb ADDED

@@ -0,0 +1,189 @@
+require 'concurrent'
+module SchemaSherlock
+  class FileCache
+    class << self
+      def initialize_cache
+        @file_contents_cache = {}
+        @scan_stats = { files_scanned: 0, files_failed: 0, total_size: 0 }
+      end
+      def clear_cache
+        @file_contents_cache&.clear
+        @scan_stats = { files_scanned: 0, files_failed: 0, total_size: 0 }
+      end
+      def preload_all_files(max_threads: 4)
+        initialize_cache if @file_contents_cache.nil?
+        files_to_scan = gather_all_files
+        return { files_scanned: 0, files_failed: 0, total_size: 0 } if files_to_scan.empty?
+        if max_threads > 1
+          preload_files_parallel(files_to_scan, max_threads)
+        else
+          preload_files_sequential(files_to_scan)
+        end
+        @scan_stats.dup
+      end
+      def get_file_content(file_path)
+        initialize_cache if @file_contents_cache.nil?
+        # Return cached content if available
+        return @file_contents_cache[file_path] if @file_contents_cache.key?(file_path)
+        # If not cached, read and cache it
+        content = read_file_safely(file_path)
+        @file_contents_cache[file_path] = content if content
+        content
+      end
+      def cached_files_count
+        @file_contents_cache&.size || 0
+      end
+      def cache_stats
+        {
+          cached_files: @file_contents_cache&.size || 0,
+          scan_stats: @scan_stats || { files_scanned: 0, files_failed: 0, total_size: 0 }
+        }
+      end
+      def scan_for_pattern_in_all_files(pattern, table_name: nil, column_name: nil)
+        initialize_cache if @file_contents_cache.nil?
+        total_matches = 0
+        @file_contents_cache.each do |file_path, content|
+          next unless content
+          if block_given?
+            # Allow custom matching logic
+            matches = yield(content, file_path, table_name, column_name)
+          else
+            # Default regex matching
+            matches = content.scan(pattern).length
+          end
+          total_matches += matches
+        end
+        total_matches
+      end
+      private
+      def preload_files_parallel(files_to_scan, max_threads)
+        # Use concurrent processing for file reading
+        thread_pool = Concurrent::FixedThreadPool.new(max_threads)
+        futures = []
+        files_to_scan.each do |file_path|
+          future = Concurrent::Future.execute(executor: thread_pool) do
+            read_file_safely(file_path)
+          end
+          futures << [file_path, future]
+        end
+        # Collect results
+        futures.each do |file_path, future|
+          begin
+            content = future.value(10) # 10 second timeout per file
+            if content
+              @file_contents_cache[file_path] = content
+              @scan_stats[:files_scanned] += 1
+              @scan_stats[:total_size] += content.bytesize
+            else
+              @scan_stats[:files_failed] += 1
+            end
+          rescue Concurrent::TimeoutError, StandardError
+            @scan_stats[:files_failed] += 1
+          end
+        end
+        thread_pool.shutdown
+        thread_pool.wait_for_termination(30)
+      end
+      def preload_files_sequential(files_to_scan)
+        # Fallback to sequential processing
+        files_to_scan.each do |file_path|
+          content = read_file_safely(file_path)
+          if content
+            @file_contents_cache[file_path] = content
+            @scan_stats[:files_scanned] += 1
+            @scan_stats[:total_size] += content.bytesize
+          else
+            @scan_stats[:files_failed] += 1
+          end
+        end
+      end
+      def gather_all_files
+        return [] unless defined?(Rails) && Rails.root
+        directories = scan_directories
+        all_files = []
+        directories.each do |dir|
+          next unless Dir.exist?(dir)
+          pattern = File.join(dir, '**', '*.rb')
+          files = Dir.glob(pattern)
+          all_files.concat(files)
+        end
+        # Remove duplicates and filter out files we might not want to scan
+        all_files.uniq.reject { |file| should_skip_file?(file) }
+      end
+      def scan_directories
+        return [] unless defined?(Rails) && Rails.root
+        [
+          Rails.root.join('app', 'controllers'),
+          Rails.root.join('app', 'models'),
+          Rails.root.join('app', 'services'),
+          Rails.root.join('app', 'jobs'),
+          Rails.root.join('app', 'workers'),
+          Rails.root.join('app', 'queries'),
+          Rails.root.join('lib')
+        ].map(&:to_s)
+      end
+      def should_skip_file?(file_path)
+        # Skip files that are unlikely to contain foreign key references
+        filename = File.basename(file_path)
+        # Skip test files, migrations, and other non-relevant files
+        return true if filename.match?(/(_spec|_test|\.spec|\.test)\.rb$/)
+        return true if file_path.include?('/spec/')
+        return true if file_path.include?('/test/')
+        return true if file_path.include?('/migrate/')
+        return true if file_path.include?('/db/migrate/')
+        # Skip very large files (likely generated or data files)
+        return true if File.size(file_path) > 1_048_576 # 1MB
+        false
+      rescue StandardError
+        true # Skip files we can't access
+      end
+      def read_file_safely(file_path)
+        return nil unless File.readable?(file_path)
+        content = File.read(file_path, encoding: 'UTF-8', invalid: :replace, undef: :replace)
+        # Validation - skip binary files or files with null bytes
+        return nil if content.include?("\x00")
+        content
+      rescue StandardError
+        nil
+      end
+    end
+  end
+end

data/lib/schema_sherlock/indexed_usage_tracker.rb ADDED

@@ -0,0 +1,19 @@
+require_relative 'performance_optimizer'
+module SchemaSherlock
+  # Lookup using the binary index
+  class IndexedUsageTracker
+    class << self
+      def count_column_references_with_index(index, table_name, column_name)
+        return 0 unless index && index[:column_references]
+        # Get files that potentially reference this column
+        relevant_files = index[:column_references][column_name] || []
+        return 0 if relevant_files.empty?
+        # Use performance optimizer for parallel processing
+        PerformanceOptimizer.process_files_parallel(relevant_files, table_name, column_name)
+      end
+    end
+  end
+end

data/lib/schema_sherlock/optimized_scanner.rb ADDED

@@ -0,0 +1,167 @@
+require 'strscan'
+module SchemaSherlock
+  class OptimizedScanner
+    # Pre-compiled patterns stored as constants to avoid recompilation
+    BOUNDARY_CHARS = /[\s\(\)\[\]\{\},;:'"]/
+    class << self
+      # Use StringScanner for efficient single-pass scanning
+      def count_column_references(content, table_name, column_name)
+        # Convert to downcase once for case-insensitive matching
+        content_lower = content.downcase
+        column_lower = column_name.downcase
+        association_name = column_name.gsub(/_id$/, '').downcase
+        count = 0
+        scanner = StringScanner.new(content_lower)
+        # Single pass through the content
+        while scanner.scan_until(/\./)
+          # Check for .where patterns
+          if scanner.match?(/where\s*\(/)
+            scanner.skip(/where\s*\(\s*/)
+            if match_column_reference(scanner, column_lower)
+              count += 1
+              next
+            end
+          end
+          # Check for .find_by patterns
+          if scanner.match?(/find_by\s*\(/)
+            scanner.skip(/find_by\s*\(\s*/)
+            if match_column_reference(scanner, column_lower)
+              count += 1
+              next
+            end
+          end
+          # Check for .joins and .includes with association
+          if scanner.match?(/joins\s*\(/)
+            scanner.skip(/joins\s*\(\s*/)
+            if match_association_reference(scanner, association_name)
+              count += 1
+              next
+            end
+          elsif scanner.match?(/includes\s*\(/)
+            scanner.skip(/includes\s*\(\s*/)
+            if match_association_reference(scanner, association_name)
+              count += 1
+              next
+            end
+          end
+          # Check for direct column access
+          if scanner.match?(/#{Regexp.escape(column_lower)}\b/)
+            scanner.skip(/#{Regexp.escape(column_lower)}\b/)
+            count += 1
+          end
+        end
+        count
+      end
+      # Native string operations version - even faster for simple patterns
+      def count_column_references_native(content, table_name, column_name)
+        content_lower = content.downcase
+        column_lower = column_name.downcase
+        association_name = column_name.gsub(/_id$/, '').downcase
+        count = 0
+        # Use native string operations for simple patterns
+        # Count .where( patterns
+        count += count_pattern_native(content_lower, ".where(", column_lower)
+        count += count_pattern_native(content_lower, ".where (", column_lower)
+        count += count_pattern_native(content_lower, ".find_by(", column_lower)
+        count += count_pattern_native(content_lower, ".find_by (", column_lower)
+        # Count joins/includes
+        count += count_pattern_native(content_lower, ".joins(", association_name)
+        count += count_pattern_native(content_lower, ".joins (", association_name)
+        count += count_pattern_native(content_lower, ".includes(", association_name)
+        count += count_pattern_native(content_lower, ".includes (", association_name)
+        # Count direct access - use boundary checking
+        count += count_direct_access(content_lower, ".#{column_lower}")
+        count
+      end
+      private
+      def match_column_reference(scanner, column_name)
+        # Skip quotes if present
+        scanner.skip(/['":]*/)
+        # Check if column name matches
+        if scanner.match?(/#{Regexp.escape(column_name)}/)
+          scanner.skip(/#{Regexp.escape(column_name)}/)
+          # Verify it's followed by appropriate characters
+          scanner.match?(/['":]*\s*[=:]/)
+        else
+          false
+        end
+      end
+      def match_association_reference(scanner, association_name)
+        # Skip quotes if present
+        scanner.skip(/['":]*/)
+        # Check if association name matches
+        if scanner.match?(/#{Regexp.escape(association_name)}/)
+          scanner.skip(/#{Regexp.escape(association_name)}/)
+          # Verify it's followed by appropriate characters
+          scanner.match?(/['":]*\s*[\),]/)
+        else
+          false
+        end
+      end
+      def count_pattern_native(content, prefix, target)
+        count = 0
+        index = 0
+        while (pos = content.index(prefix, index))
+          # Move past the prefix
+          check_pos = pos + prefix.length
+          # Skip whitespace and quotes
+          while check_pos < content.length && " \t'\":".include?(content[check_pos])
+            check_pos += 1
+          end
+          # Check if target matches at this position
+          if content[check_pos, target.length] == target
+            # Verify word boundary
+            next_char_pos = check_pos + target.length
+            if next_char_pos >= content.length || !('a'..'z').include?(content[next_char_pos])
+              count += 1
+            end
+          end
+          index = pos + 1
+        end
+        count
+      end
+      def count_direct_access(content, pattern)
+        count = 0
+        index = 0
+        while (pos = content.index(pattern, index))
+          # Check word boundary after pattern
+          next_pos = pos + pattern.length
+          if next_pos >= content.length || !('a'..'z').include?(content[next_pos])
+            count += 1
+          end
+          index = pos + 1
+        end
+        count
+      end
+    end
+  end
+end

data/lib/schema_sherlock/performance_optimizer.rb ADDED

@@ -0,0 +1,111 @@
+module SchemaSherlock
+  # Centralized performance optimization for file and pattern processing
+  class PerformanceOptimizer
+    # File size thresholds for processing strategies
+    SMALL_FILE_THRESHOLD = 64 * 1024     # 64KB
+    LARGE_FILE_THRESHOLD = 1024 * 1024   # 1MB
+    class << self
+      # High-performance file reading with size-based optimization
+      def read_file_optimized(file_path)
+        return "" unless File.exist?(file_path) && File.readable?(file_path)
+        file_size = File.size(file_path)
+        return "" if file_size == 0
+        if file_size < LARGE_FILE_THRESHOLD
+          # Small/medium files: direct read
+          File.read(file_path, encoding: 'UTF-8', invalid: :replace, undef: :replace)
+        else
+          # Large files: chunked reading with buffer
+          read_large_file_chunked(file_path)
+        end
+      rescue
+        ""
+      end
+      # Fast pattern matching with pre-filtering
+      def count_patterns_optimized(content, table_name, column_name)
+        # Early exit if content is empty or too short
+        return 0 if content.nil? || content.length < column_name.length
+        # Quick pre-filter: check if column name exists at all
+        content_lower = content.downcase
+        column_lower = column_name.downcase
+        # If column name doesn't appear anywhere, skip expensive matching
+        return 0 unless content_lower.include?(column_lower)
+        # Use optimized scanner
+        OptimizedScanner.count_column_references_native(content, table_name, column_name)
+      end
+      # Parallel file processing with optimal thread count
+      def process_files_parallel(file_paths, table_name, column_name)
+        return 0 if file_paths.empty?
+        # Limit threads to avoid overwhelming the system
+        max_threads = [Concurrent.processor_count, file_paths.size, 8].min
+        futures = []
+        thread_pool = Concurrent::FixedThreadPool.new(max_threads)
+        file_paths.each do |file_path|
+          future = Concurrent::Future.execute(executor: thread_pool) do
+            content = read_file_optimized(file_path)
+            count_patterns_optimized(content, table_name, column_name)
+          end
+          futures << future
+        end
+        # Collect results efficiently
+        total_count = futures.sum do |future|
+          future.value || 0
+        rescue
+          0
+        end
+        thread_pool.shutdown
+        thread_pool.wait_for_termination(5)
+        total_count
+      end
+      # Smart file filtering to reduce I/O
+      def filter_relevant_files(file_paths, column_name)
+        # For very large sets, do a quick filename-based filter first
+        if file_paths.size > 1000
+          # Filter by filename patterns that are likely to contain the column
+          association_name = column_name.gsub(/_id$/, '')
+          relevant_patterns = [column_name, association_name, 'model', 'service', 'query']
+          file_paths.select do |path|
+            filename = File.basename(path, '.rb').downcase
+            relevant_patterns.any? { |pattern| filename.include?(pattern) }
+          end
+        else
+          file_paths
+        end
+      end
+      private
+      def read_large_file_chunked(file_path)
+        content = String.new
+        chunk_size = 64 * 1024  # 64KB chunks
+        File.open(file_path, 'rb') do |file|
+          # OS hint for sequential access
+          file.advise(:sequential) if file.respond_to?(:advise)
+          while chunk = file.read(chunk_size)
+            content << chunk
+          end
+        end
+        # Single UTF-8 conversion for entire content
+        content.force_encoding('UTF-8').encode('UTF-8', invalid: :replace, undef: :replace)
+      end
+    end
+  end
+end

data/lib/schema_sherlock/schema_cache.rb ADDED

@@ -0,0 +1,118 @@
+module SchemaSherlock
+  class SchemaCache
+    class << self
+      def initialize_cache
+        @table_exists_cache = {}
+        @columns_cache = {}
+        @indexes_cache = {}
+        @primary_keys_cache = {}
+      end
+      def clear_cache
+        @table_exists_cache&.clear
+        @columns_cache&.clear
+        @indexes_cache&.clear
+        @primary_keys_cache&.clear
+      end
+      def connection
+        ActiveRecord::Base.connection
+      end
+      def preload_all_metadata
+        initialize_cache if @table_exists_cache.nil?
+        # Preload all tables existence
+        all_tables = connection.tables
+        all_tables.each { |table| @table_exists_cache[table] = true }
+        # Preload columns, indexes, and primary keys for all tables
+        all_tables.each do |table|
+          @columns_cache[table] = connection.columns(table)
+          @indexes_cache[table] = connection.indexes(table)
+          @primary_keys_cache[table] = connection.primary_key(table)
+        end
+        # Return stats for debugging
+        {
+          tables_cached: @table_exists_cache.size,
+          columns_cached: @columns_cache.size,
+          indexes_cached: @indexes_cache.size,
+          primary_keys_cached: @primary_keys_cache.size
+        }
+      end
+      def table_exists?(table_name)
+        initialize_cache if @table_exists_cache.nil?
+        # Check cache first
+        return @table_exists_cache[table_name] if @table_exists_cache.key?(table_name)
+        # If not in cache, check database and cache result
+        exists = connection.table_exists?(table_name)
+        @table_exists_cache[table_name] = exists
+        exists
+      end
+      def columns(table_name)
+        initialize_cache if @columns_cache.nil?
+        # Check cache first
+        return @columns_cache[table_name] if @columns_cache.key?(table_name)
+        # If not in cache, fetch from database and cache result
+        return nil unless table_exists?(table_name)
+        columns = connection.columns(table_name)
+        @columns_cache[table_name] = columns
+        columns
+      end
+      def indexes(table_name)
+        initialize_cache if @indexes_cache.nil?
+        # Check cache first
+        return @indexes_cache[table_name] if @indexes_cache.key?(table_name)
+        # If not in cache, fetch from database and cache result
+        return [] unless table_exists?(table_name)
+        indexes = connection.indexes(table_name)
+        @indexes_cache[table_name] = indexes
+        indexes
+      end
+      def primary_key(table_name)
+        initialize_cache if @primary_keys_cache.nil?
+        # Check cache first
+        return @primary_keys_cache[table_name] if @primary_keys_cache.key?(table_name)
+        # If not in cache, fetch from database and cache result
+        return nil unless table_exists?(table_name)
+        primary_key = connection.primary_key(table_name)
+        @primary_keys_cache[table_name] = primary_key
+        primary_key
+      end
+      # Helper method to get column by name
+      def column(table_name, column_name)
+        table_columns = columns(table_name)
+        return nil unless table_columns
+        table_columns.find { |col| col.name == column_name }
+      end
+      # Get cache statistics
+      def cache_stats
+        {
+          table_exists_cache_size: @table_exists_cache&.size || 0,
+          columns_cache_size: @columns_cache&.size || 0,
+          indexes_cache_size: @indexes_cache&.size || 0,
+          primary_keys_cache_size: @primary_keys_cache&.size || 0
+        }
+      end
+    end
+  end
+end

data/lib/schema_sherlock/usage_tracker.rb CHANGED

@@ -1,6 +1,14 @@
+require_relative 'file_cache'
+require_relative 'optimized_scanner'
+require_relative 'binary_index'
+require_relative 'indexed_usage_tracker'
+require_relative 'performance_optimizer'
 module SchemaSherlock
   class UsageTracker
     class << self
+      attr_accessor :binary_index
       def track_foreign_key_usage(model_class)
         return {} unless SchemaSherlock.configuration.min_usage_threshold
@@ -27,21 +35,41 @@ module SchemaSherlock
       end
       def scan_for_column_usage(table_name, column_name)
-        count = 0
+        # Prefer the binary index when one has been assigned through the binary_index accessor
+        if binary_index
+          IndexedUsageTracker.count_column_references_with_index(binary_index, table_name, column_name)
+        else
+          # No index assigned: collect, filter and scan the source files instead
+          scan_with_performance_optimizer(table_name, column_name)
+        end
+      end
+      def scan_with_performance_optimizer(table_name, column_name) # File-scanning path used by scan_for_column_usage when no binary index is set.
+        all_files = get_relevant_files # non-skipped .rb files found under scan_directories
+        filtered_files = PerformanceOptimizer.filter_relevant_files(all_files, column_name) # presumably drops files that cannot mention column_name — confirm
+        PerformanceOptimizer.process_files_parallel(filtered_files, table_name, column_name) # returned as-is; presumably the total reference count — confirm
+      end
-        # Scan common Rails directories for usage patterns
+      def get_relevant_files # Collect paths of .rb files under each existing scan directory, minus skipped ones.
+        files = []
         scan_directories.each do |dir|
           next unless Dir.exist?(dir)
-          Dir.glob("#{dir}/**/*.rb").each do |file|
-            content = File.read(file)
-            count += count_column_references(content, table_name, column_name)
-          rescue
-            # Skip files that can't be read
+          Dir.glob(File.join(dir, "**/*.rb")).each do |file| # recursive match of every .rb file below dir
+            next if should_skip_file?(file) # spec/test/vendor/node_modules paths and oversized files are left out
+            files << file # only the path is kept; nothing is read here
           end
         end
+        files
+      end
-        count
+      def should_skip_file?(file) # True when the path contains one of the excluded directory segments or the file is too large.
+        file.include?('/spec/') ||
+        file.include?('/test/') ||
+        file.include?('/vendor/') ||
+        file.include?('/node_modules/') ||
+        File.size(file) > 50 * 1024 * 1024  # Skip files larger than 50 MiB; NOTE(review): File.size raises if the path vanished after globbing
       end
       def scan_directories
@@ -58,21 +86,7 @@ module SchemaSherlock
       def count_column_references(content, table_name, column_name)
-        count = 0
-        # Count WHERE clauses using the foreign key
-        count += content.scan(/\.where\s*\(\s*['":]?#{column_name}['":]?\s*[=:]/i).length
-        count += content.scan(/\.find_by\s*\(\s*['":]?#{column_name}['":]?\s*[=:]/i).length
-        # Count joins using the foreign key
-        association_name = column_name.gsub(/_id$/, '')
-        count += content.scan(/\.joins\s*\(\s*['":]?#{association_name}['":]?\s*\)/i).length
-        count += content.scan(/\.includes\s*\(\s*['":]?#{association_name}['":]?\s*\)/i).length
-        # Count direct foreign key access
-        count += content.scan(/\.#{column_name}\b/i).length
-        count
+        OptimizedScanner.count_column_references_native(content, table_name, column_name) # counting is delegated; presumably covers the same patterns as the removed regexes — confirm
       end
     end
   end

data/lib/schema_sherlock/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module SchemaSherlock
-  VERSION = "0.1.0"
+  VERSION = "0.1.1"
 end

data/schema_sherlock.gemspec CHANGED Viewed

@@ -30,6 +30,8 @@ Gem::Specification.new do |spec|
   spec.add_dependency "rails", ">= 6.0"
   spec.add_dependency "thor", "~> 1.0"
   spec.add_dependency "activerecord", ">= 6.0"
+  spec.add_dependency "concurrent-ruby", "~> 1.0"
+  spec.add_dependency "msgpack", "~> 1.0"
   # Development dependencies
   spec.add_development_dependency "rspec", "~> 3.0"

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: schema_sherlock
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - Prateek Choudhary
@@ -52,6 +52,34 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '6.0'
+- !ruby/object:Gem::Dependency
+  name: concurrent-ruby
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.0'
+- !ruby/object:Gem::Dependency
+  name: msgpack
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.0'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
@@ -85,10 +113,17 @@ files:
 - lib/schema_sherlock.rb
 - lib/schema_sherlock/analyzers/base_analyzer.rb
 - lib/schema_sherlock/analyzers/foreign_key_detector.rb
+- lib/schema_sherlock/analyzers/index_recommendation_detector.rb
+- lib/schema_sherlock/binary_index.rb
 - lib/schema_sherlock/commands/analyze_command.rb
 - lib/schema_sherlock/commands/base_command.rb
 - lib/schema_sherlock/configuration.rb
+- lib/schema_sherlock/file_cache.rb
+- lib/schema_sherlock/indexed_usage_tracker.rb
 - lib/schema_sherlock/model_loader.rb
+- lib/schema_sherlock/optimized_scanner.rb
+- lib/schema_sherlock/performance_optimizer.rb
+- lib/schema_sherlock/schema_cache.rb
 - lib/schema_sherlock/usage_tracker.rb
 - lib/schema_sherlock/version.rb
 - schema_sherlock.gemspec