schema_sherlock 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5c7488ddd91bb0bb1721c72fc0d3e5045461274a68ce2fea71d0ebe280f1c627
4
- data.tar.gz: 400b9b670b80856f543659a55e24e2d138509f578125af5b2821b96fd766d17f
3
+ metadata.gz: 5c1ac4a467622a053245b33bb795bd7a08b4cc4408cf99c9a74088f29e46c948
4
+ data.tar.gz: af8c4a4e1700ed94300bc6e95f72d70218477ca93de0049699ac94d08ad2807a
5
5
  SHA512:
6
- metadata.gz: 61fb4224786228806820ebf244672a8d493fa31a6561d8d9ce4f85ed9bdd03f9f16e869653a3dc0275d61b4c5d7e3fff083538fdd383c6887b2f05b793811221
7
- data.tar.gz: 0234a850f322b90c9479542affc4ae8fae40df67c38395fd580f1f72ff683e2a17dee230d4fa918e336968a718cb4ad996165e59253adca3dc9f62b9acca0afe
6
+ metadata.gz: fe035c9e89e93ed74870d7f6adbc8d5f1596b8f0f6d5fbe6178a0628aa5f124b56e353501e73c792a4e31b0879777bb0ac0d1703207e52d7cf0932649ab1cb61
7
+ data.tar.gz: f80d18cc2a292c8c85c401dd359980e362dce3666c9bd516e75c0f9acdada5005f0d6845a6734cb7d729da4f490a9b3db6140692a6f1b798aae2badcdd5a85e3
data/CHANGELOG.md CHANGED
@@ -16,4 +16,13 @@
16
16
  - Filters suggestions based on actual usage frequency
17
17
  - Provides detailed analysis reports
18
18
  - Supports complex foreign key types (integer, bigint, UUID, string)
19
- - Smart table and model inference
19
+ - Smart table and model inference
20
+
21
+ ## [0.1.1] - 2025-01-24
22
+
23
+ ### Features
24
+ - Adds recommendations for missing indices for foreign keys
25
+
26
+ ### Bugs and improvements
27
+ - Make analysis 80% faster
28
+ - Safely load models to avoid breaking on constraint errors
data/Gemfile.lock CHANGED
@@ -1,8 +1,10 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- schema_sherlock (0.1.0)
4
+ schema_sherlock (0.1.1)
5
5
  activerecord (>= 6.0)
6
+ concurrent-ruby (~> 1.0)
7
+ msgpack (~> 1.0)
6
8
  rails (>= 6.0)
7
9
  thor (~> 1.0)
8
10
 
@@ -113,6 +115,7 @@ GEM
113
115
  marcel (1.0.4)
114
116
  mini_mime (1.1.5)
115
117
  minitest (5.25.5)
118
+ msgpack (1.8.0)
116
119
  net-imap (0.5.8)
117
120
  date
118
121
  net-protocol
data/README.md CHANGED
@@ -1,5 +1,7 @@
1
1
  # SchemaSherlock
2
2
 
3
+ [![Gem Version](https://badge.fury.io/rb/schema_sherlock.svg?icon=si%3Arubygems)](https://badge.fury.io/rb/schema_sherlock)
4
+
3
5
  Intelligent Rails model analysis and annotation tool that extends beyond traditional schema annotation to provide intelligent analysis and actionable suggestions for Rails model code quality, performance, and maintainability.
4
6
 
5
7
  ## Installation
@@ -1,5 +1,6 @@
1
1
  require_relative "base_analyzer"
2
2
  require_relative "../usage_tracker"
3
+ require_relative "../schema_cache"
3
4
 
4
5
  module SchemaSherlock
5
6
  module Analyzers
@@ -77,7 +78,7 @@ module SchemaSherlock
77
78
  end
78
79
 
79
80
  def table_exists?(table_name)
80
- ActiveRecord::Base.connection.table_exists?(table_name)
81
+ SchemaCache.table_exists?(table_name)
81
82
  end
82
83
 
83
84
  def valid_foreign_key?(column)
@@ -155,13 +156,11 @@ module SchemaSherlock
155
156
  # Fallback method when model class is not available
156
157
  # Check if the table has an 'id' column with compatible type
157
158
  begin
158
- connection = ActiveRecord::Base.connection
159
- primary_key_name = connection.primary_key(table_name)
159
+ primary_key_name = SchemaCache.primary_key(table_name)
160
160
 
161
161
  return false unless primary_key_name
162
162
 
163
- table_columns = connection.columns(table_name)
164
- primary_key_column = table_columns.find { |col| col.name == primary_key_name }
163
+ primary_key_column = SchemaCache.column(table_name, primary_key_name)
165
164
 
166
165
  return false unless primary_key_column
167
166
 
@@ -0,0 +1,44 @@
1
+ require_relative "base_analyzer"
2
+ require_relative "../schema_cache"
3
+
4
module SchemaSherlock
  module Analyzers
    # Reports foreign-key-like columns (names ending in "_id") of the analyzed
    # table that no existing index can serve, together with a ready-to-paste
    # `add_index` migration line for each of them.
    #
    # NOTE(review): `columns`, `table_name` and the `results` reader are not
    # defined here; presumably they come from BaseAnalyzer - confirm.
    class IndexRecommendationDetector < BaseAnalyzer
      # Runs the analysis and stores the outcome in @results.
      #
      # @return [Hash] `{ missing_foreign_key_indexes: [Hash, ...] }`
      def analyze
        @results = {
          missing_foreign_key_indexes: find_missing_foreign_key_indexes,
        }
      end

      private

      # Builds one recommendation hash per foreign key column that lacks a
      # usable index.
      def find_missing_foreign_key_indexes
        unindexed = foreign_key_columns.reject { |column| has_index_on_column?(column.name) }

        unindexed.map do |column|
          {
            column: column.name,
            table: table_name,
            migration: "add_index :#{table_name}, :#{column.name}",
            reason: "Foreign key without index"
          }
        end
      end

      # Columns treated as foreign keys: every column whose name ends in "_id".
      # (A plain "id" column never matches that suffix, so no extra guard is needed.)
      def foreign_key_columns
        @foreign_key_columns ||= columns.select { |col| col.name.end_with?('_id') }
      end

      # True when some index has +column_name+ as its first (or only) column.
      #
      # A composite index whose leading column is the foreign key can serve
      # lookups on that key just like a dedicated single-column index, so it
      # must not trigger a "missing index" recommendation. Array() also copes
      # with adapters that report expression indexes as a plain String.
      def has_index_on_column?(column_name)
        existing_indexes.any? { |index| Array(index.columns).first == column_name }
      end

      # Index definitions of the analyzed table, fetched once through SchemaCache.
      def existing_indexes
        @existing_indexes ||= SchemaCache.indexes(table_name)
      end
    end
  end
end
@@ -0,0 +1,113 @@
1
+ require 'msgpack'
2
+ require 'digest'
3
+
4
module SchemaSherlock
  # Disk-backed index of the application's Ruby sources.
  #
  # The index records, per scanned file, its size/mtime/checksum and, per
  # "<something>_id" column name, the list of files that appear to reference
  # it. It is serialized with MessagePack to INDEX_FILE under the project root
  # and rebuilt whenever an indexed file is newer than the stored index.
  #
  # In-memory layout (identical for freshly built and reloaded indexes):
  #   { version:, created_at:, file_checksums:,
  #     files:             { "path" => { size:, mtime:, checksum: } },
  #     column_references: { "column_id" => ["path", ...] } }
  class BinaryIndex
    INDEX_VERSION = "1.0"
    INDEX_FILE = "tmp/.schema_sherlock_index"

    # Path fragments (relative to the project root) that exclude a file from indexing.
    SKIPPED_PATH_SEGMENTS = %w[/spec/ /test/ /vendor/ /node_modules/].freeze

    class << self
      # Returns the stored index when it is present and still fresh, otherwise
      # builds a new one and tries to persist it.
      #
      # @param root_path [String] project root directory
      # @return [Hash, nil] the index, or nil when building failed
      def load_or_build(root_path)
        index_path = File.join(root_path, INDEX_FILE)

        if File.exist?(index_path)
          # Load once and reuse the result for both validation and the return value.
          index = load_index(index_path)
          return index if usable_index?(index, index_path, root_path)
        end

        build_and_save_index(root_path, index_path)
      end

      # Reads and deserializes the index stored at +path+.
      #
      # @return [Hash, nil] normalized index, or nil when reading/decoding fails
      def load_index(path)
        normalize_index(MessagePack.unpack(File.binread(path)))
      rescue => e
        log_warning("Failed to load index: #{e.message}")
        nil
      end

      # Builds the index and writes it to +index_path+. A failed write is only
      # logged; the freshly built index is still returned.
      #
      # @return [Hash, nil] the index (nil when building itself raised)
      def build_and_save_index(root_path, index_path)
        index = build_index(root_path)

        # The target directory may be missing (e.g. a fresh checkout without tmp/).
        index_dir = File.dirname(index_path)
        Dir.mkdir(index_dir) unless Dir.exist?(index_dir)

        File.binwrite(index_path, MessagePack.pack(index))

        index
      rescue => e
        log_warning("Failed to save index: #{e.message}")
        index
      end

      private

      # True when the file at +index_path+ holds a current-version index that
      # is not older than any indexed source file.
      def index_valid?(index_path, root_path)
        return false unless File.exist?(index_path)

        usable_index?(load_index(index_path), index_path, root_path)
      end

      # Freshness check for an already loaded index.
      def usable_index?(index, index_path, root_path)
        return false unless index && index[:version] == INDEX_VERSION

        index_time = File.mtime(index_path)

        # Any indexed Ruby file newer than the index forces a rebuild.
        indexable_files(root_path).none? { |file| File.mtime(file) > index_time }
      end

      # Brings a deserialized index into the layout produced by build_index:
      # symbol keys for the structural hashes, String keys for file paths and
      # column names (callers look columns up by String), and no duplicate
      # file entries per column.
      def normalize_index(raw)
        return nil unless raw.is_a?(Hash)

        index = raw.transform_keys(&:to_sym)

        index[:files] = (index[:files] || {}).each_with_object({}) do |(path, meta), files|
          files[path.to_s] = meta.is_a?(Hash) ? meta.transform_keys(&:to_sym) : meta
        end

        index[:column_references] = (index[:column_references] || {}).each_with_object({}) do |(column, paths), refs|
          refs[column.to_s] = Array(paths).map(&:to_s).uniq
        end

        index
      end

      # Scans every indexable Ruby file below +root_path+ and collects file
      # metadata plus column references.
      def build_index(root_path)
        index = {
          version: INDEX_VERSION,
          created_at: Time.now.to_i,
          files: {},
          column_references: {},
          file_checksums: {}
        }

        indexable_files(root_path).each do |file|
          raw = File.read(file, encoding: 'UTF-8')
          checksum = Digest::MD5.hexdigest(raw)

          index[:files][file] = {
            size: File.size(file),
            mtime: File.mtime(file).to_i,
            checksum: checksum
          }

          # Replace invalid byte sequences so one badly encoded file cannot
          # abort the regexp scans (and with them the whole index build).
          pre_scan_content(raw.scrub, file, index)
        end

        index
      end

      # All "*.rb" files below +root_path+ that are not excluded by should_skip_file?.
      def indexable_files(root_path)
        Dir.glob(File.join(root_path, "**/*.rb")).reject { |file| should_skip_file?(file, root_path) }
      end

      # Registers +file+ under every column it seems to reference: direct
      # ".<name>_id" access and ".joins(...)"/".includes(...)" on an
      # association (mapped to "<association>_id"). Each column lists the file
      # at most once per call, however many matches the file contains.
      def pre_scan_content(content, file, index)
        columns = content.scan(/\.(\w+)_id\b/).map { |(name)| "#{name}_id" }
        columns.concat(
          content.scan(/\.(?:joins|includes)\s*\(\s*['":]?(\w+)/).map { |(association)| "#{association}_id" }
        )

        columns.uniq.each do |column|
          (index[:column_references][column] ||= []) << file
        end
      end

      # True for files inside spec/test/vendor/node_modules directories.
      # When +root_path+ is given only the part of the path below the root is
      # inspected, so a project that itself lives under e.g. "/home/test/" is
      # not skipped wholesale.
      def should_skip_file?(file, root_path = nil)
        relative = root_path ? "/#{file.delete_prefix(root_path.to_s).delete_prefix('/')}" : file

        SKIPPED_PATH_SEGMENTS.any? { |segment| relative.include?(segment) }
      end

      # Logs through Rails.logger when Rails and a logger are available; otherwise a no-op.
      def log_warning(message)
        return unless defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger

        Rails.logger.warn(message)
      end
    end
  end

end
@@ -1,5 +1,9 @@
1
1
  require_relative "base_command"
2
2
  require_relative "../analyzers/foreign_key_detector"
3
+ require_relative "../analyzers/index_recommendation_detector"
4
+ require_relative "../schema_cache"
5
+ require_relative "../file_cache"
6
+ require_relative "../binary_index"
3
7
 
4
8
  module SchemaSherlock
5
9
  module Commands
@@ -7,6 +11,7 @@ module SchemaSherlock
7
11
  desc "analyze [MODEL]", "Analyze models for missing associations and optimization opportunities"
8
12
  option :output, type: :string, desc: "Output file for analysis results"
9
13
  option :min_usage, type: :numeric, desc: "Minimum usage threshold for suggestions (overrides config)"
14
+ option :use_index, type: :boolean, default: true, desc: "Use binary index for faster analysis (if available)"
10
15
 
11
16
  def analyze(model_name = nil)
12
17
  load_rails_environment
@@ -20,11 +25,41 @@ module SchemaSherlock
20
25
  models = model_name ? [find_model(model_name)] : all_models
21
26
 
22
27
  puts "Analyzing #{models.length} model(s)..."
28
+
29
+ # Try to load binary index for faster analysis
30
+ @binary_index = nil
31
+ if options[:use_index] && defined?(Rails) && Rails.root
32
+ puts "Loading binary index..."
33
+ @binary_index = BinaryIndex.load_or_build(Rails.root.to_s)
34
+ if @binary_index
35
+ puts " Index loaded with #{@binary_index[:files].size} files indexed"
36
+ end
37
+ end
38
+
39
+ # Preload metadata cache for performance
40
+ puts "Preloading database metadata..."
41
+ cache_stats = SchemaCache.preload_all_metadata
42
+ puts " Cached: #{cache_stats[:tables_cached]} tables, #{cache_stats[:columns_cached]} column sets, #{cache_stats[:indexes_cached]} index sets"
43
+
44
+ # Preload file cache for performance (only if usage tracking is enabled and no index available)
45
+ if SchemaSherlock.configuration.min_usage_threshold && SchemaSherlock.configuration.min_usage_threshold > 0
46
+ if @binary_index
47
+ puts "Using binary index for file analysis (skipping file cache preload)"
48
+ else
49
+ puts "Preloading file cache..."
50
+ file_stats = FileCache.preload_all_files
51
+ puts " Cached: #{file_stats[:files_scanned]} files (#{(file_stats[:total_size] / 1024.0 / 1024.0).round(2)} MB), #{file_stats[:files_failed]} failed"
52
+ end
53
+ end
23
54
 
24
55
  results = {}
25
56
 
26
57
  models.each do |model|
27
58
  puts " Analyzing #{model.name}..."
59
+
60
+ # Set binary index for usage tracker
61
+ UsageTracker.binary_index = @binary_index
62
+
28
63
  analysis = analyze_model(model)
29
64
 
30
65
  # Only include models with issues in results
@@ -39,6 +74,10 @@ module SchemaSherlock
39
74
  say e.message, :red
40
75
  exit 1
41
76
  ensure
77
+ # Clear caches to free memory
78
+ SchemaCache.clear_cache
79
+ FileCache.clear_cache
80
+
42
81
  # Restore original threshold if it was overridden
43
82
  if options[:min_usage] && defined?(original_threshold)
44
83
  SchemaSherlock.configuration.min_usage_threshold = original_threshold
@@ -49,7 +88,8 @@ module SchemaSherlock
49
88
 
50
89
  def analyze_model(model)
51
90
  {
52
- foreign_key_analysis: run_foreign_key_analysis(model)
91
+ foreign_key_analysis: run_foreign_key_analysis(model),
92
+ index_analysis: run_index_analysis(model)
53
93
  }
54
94
  end
55
95
 
@@ -59,12 +99,21 @@ module SchemaSherlock
59
99
  analyzer.results
60
100
  end
61
101
 
102
+ def run_index_analysis(model)
103
+ analyzer = SchemaSherlock::Analyzers::IndexRecommendationDetector.new(model)
104
+ analyzer.analyze
105
+ analyzer.results
106
+ end
107
+
62
108
  def has_issues?(analysis)
63
109
  foreign_key_analysis = analysis[:foreign_key_analysis]
64
110
  missing = foreign_key_analysis[:missing_associations]
65
111
  orphaned = foreign_key_analysis[:orphaned_foreign_keys]
66
112
 
67
- missing.any? || orphaned.any?
113
+ index_analysis = analysis[:index_analysis]
114
+ missing_indexes = index_analysis[:missing_foreign_key_indexes]
115
+
116
+ missing.any? || orphaned.any? || missing_indexes.any?
68
117
  end
69
118
 
70
119
  def display_results(results, total_models)
@@ -91,6 +140,14 @@ module SchemaSherlock
91
140
  puts " #{key[:column]} -> #{key[:issue]}"
92
141
  end
93
142
  end
143
+
144
+ missing_indexes = analysis[:index_analysis][:missing_foreign_key_indexes]
145
+ if missing_indexes.any?
146
+ puts " Missing Indexes:"
147
+ missing_indexes.each do |idx|
148
+ puts " #{idx[:migration]} # #{idx[:reason]}"
149
+ end
150
+ end
94
151
  end
95
152
 
96
153
  puts "\n" + "="*50
@@ -0,0 +1,189 @@
1
+ require 'concurrent'
2
+
3
module SchemaSherlock
  # Process-wide, in-memory cache of Ruby source file contents used while
  # scanning the application for column usage. All state lives in class-level
  # instance variables; call clear_cache to release it.
  class FileCache
    class << self
      # (Re)creates an empty cache and zeroed scan statistics.
      def initialize_cache
        @file_contents_cache = {}
        @scan_stats = empty_stats
      end

      # Drops all cached contents and resets the scan statistics.
      def clear_cache
        @file_contents_cache&.clear
        @scan_stats = empty_stats
      end

      # Reads every relevant application file into the cache.
      #
      # @param max_threads [Integer] number of reader threads; values <= 1 read sequentially
      # @return [Hash] copy of the scan statistics (:files_scanned, :files_failed, :total_size)
      def preload_all_files(max_threads: 4)
        initialize_cache if @file_contents_cache.nil?

        files_to_scan = gather_all_files
        return empty_stats if files_to_scan.empty?

        if max_threads > 1
          preload_files_parallel(files_to_scan, max_threads)
        else
          preload_files_sequential(files_to_scan)
        end

        @scan_stats.dup
      end

      # Returns the content of +file_path+, reading and caching it on first access.
      #
      # @return [String, nil] file content, or nil when the file is unreadable or binary
      def get_file_content(file_path)
        initialize_cache if @file_contents_cache.nil?

        return @file_contents_cache[file_path] if @file_contents_cache.key?(file_path)

        content = read_file_safely(file_path)
        @file_contents_cache[file_path] = content if content
        content
      end

      # Number of files currently held in the cache.
      def cached_files_count
        @file_contents_cache&.size || 0
      end

      # Cache size together with the statistics of the last preload(s).
      def cache_stats
        {
          cached_files: @file_contents_cache&.size || 0,
          scan_stats: @scan_stats || empty_stats
        }
      end

      # Sums matches over all cached files.
      #
      # Without a block, counts occurrences of +pattern+ per file. With a
      # block, yields (content, file_path, table_name, column_name) and adds
      # whatever number the block returns.
      #
      # @return [Integer] total number of matches
      def scan_for_pattern_in_all_files(pattern, table_name: nil, column_name: nil)
        initialize_cache if @file_contents_cache.nil?

        total_matches = 0

        @file_contents_cache.each do |file_path, content|
          next unless content

          total_matches +=
            if block_given?
              yield(content, file_path, table_name, column_name)
            else
              content.scan(pattern).length
            end
        end

        total_matches
      end

      private

      # Fresh statistics hash; a new object each time so callers cannot share state.
      def empty_stats
        { files_scanned: 0, files_failed: 0, total_size: 0 }
      end

      # Reads the files on a fixed-size thread pool and records the results on
      # the calling thread. The pool is always shut down, even when collecting
      # results raises.
      def preload_files_parallel(files_to_scan, max_threads)
        thread_pool = Concurrent::FixedThreadPool.new(max_threads)

        begin
          futures = files_to_scan.map do |file_path|
            [file_path, Concurrent::Future.execute(executor: thread_pool) { read_file_safely(file_path) }]
          end

          futures.each do |file_path, future|
            content =
              begin
                future.value(10) # 10 second timeout per file; nil when it did not finish
              rescue StandardError
                nil
              end
            record_scan_result(file_path, content)
          end
        ensure
          thread_pool.shutdown
          thread_pool.wait_for_termination(30)
        end
      end

      # Single-threaded variant of the preload.
      def preload_files_sequential(files_to_scan)
        files_to_scan.each do |file_path|
          record_scan_result(file_path, read_file_safely(file_path))
        end
      end

      # Stores +content+ and updates the statistics; nil content counts as a failure.
      def record_scan_result(file_path, content)
        if content
          @file_contents_cache[file_path] = content
          @scan_stats[:files_scanned] += 1
          @scan_stats[:total_size] += content.bytesize
        else
          @scan_stats[:files_failed] += 1
        end
      end

      # All "*.rb" files below the scan directories, de-duplicated and filtered
      # through should_skip_file?. Empty when Rails is not loaded.
      def gather_all_files
        return [] unless defined?(Rails) && Rails.root

        candidates = scan_directories
                     .select { |dir| Dir.exist?(dir) }
                     .flat_map { |dir| Dir.glob(File.join(dir, '**', '*.rb')) }

        candidates.uniq.reject { |file| should_skip_file?(file) }
      end

      # Application directories that are scanned for usage.
      def scan_directories
        return [] unless defined?(Rails) && Rails.root

        [
          Rails.root.join('app', 'controllers'),
          Rails.root.join('app', 'models'),
          Rails.root.join('app', 'services'),
          Rails.root.join('app', 'jobs'),
          Rails.root.join('app', 'workers'),
          Rails.root.join('app', 'queries'),
          Rails.root.join('lib')
        ].map(&:to_s)
      end

      # True for test files, migrations, files larger than 1MB and files that
      # cannot be inspected at all.
      def should_skip_file?(file_path)
        filename = File.basename(file_path)

        return true if filename.match?(/(_spec|_test|\.spec|\.test)\.rb$/)
        return true if file_path.include?('/spec/')
        return true if file_path.include?('/test/')
        # Also covers "/db/migrate/", which contains this fragment.
        return true if file_path.include?('/migrate/')

        # Skip very large files (likely generated or data files)
        return true if File.size(file_path) > 1_048_576 # 1MB

        false
      rescue StandardError
        true # Skip files we can't access
      end

      # Reads +file_path+ as UTF-8 text.
      #
      # Invalid byte sequences are replaced via String#scrub: the
      # `invalid:`/`undef:` options of File.read only take effect when
      # transcoding, so they would leave broken bytes in place and later
      # regexp scans could raise.
      #
      # @return [String, nil] content, or nil for unreadable files and files containing NUL bytes
      def read_file_safely(file_path)
        return nil unless File.readable?(file_path)

        content = File.read(file_path, encoding: 'UTF-8').scrub

        # Treat anything containing a NUL byte as binary.
        return nil if content.include?("\x00")

        content
      rescue StandardError
        nil
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
+ require_relative 'performance_optimizer'
2
+
3
module SchemaSherlock
  # Counts column references using the pre-built binary index: the index
  # narrows the search down to candidate files, which are then scanned by
  # PerformanceOptimizer.
  class IndexedUsageTracker
    class << self
      # @param index [Hash, nil] binary index; its :column_references entry maps
      #   column names to lists of file paths
      # @param table_name [String] passed through to the scanner
      # @param column_name [String, Symbol] column to look up
      # @return [Integer] 0 when there is no index or no candidate file,
      #   otherwise the result of PerformanceOptimizer.process_files_parallel
      def count_column_references_with_index(index, table_name, column_name)
        references = index && index[:column_references]
        return 0 unless references

        # The column may be keyed by String (freshly built index) or Symbol
        # (index deserialized with symbolized keys), so accept both. Each file
        # is scanned once even if the index lists it repeatedly; otherwise its
        # matches would be counted several times.
        relevant_files = (
          Array(references[column_name.to_s]) + Array(references[column_name.to_sym])
        ).map(&:to_s).uniq
        return 0 if relevant_files.empty?

        PerformanceOptimizer.process_files_parallel(relevant_files, table_name, column_name)
      end
    end
  end
end
@@ -0,0 +1,167 @@
1
+ require 'strscan'
2
+
3
module SchemaSherlock
  # Counts how often a foreign key column (or the association derived from
  # it by dropping the "_id" suffix) is referenced in a piece of Ruby source.
  # Matching is case-insensitive: content and names are downcased first.
  class OptimizedScanner
    # Pre-compiled patterns stored as constants to avoid recompilation
    BOUNDARY_CHARS = /[\s\(\)\[\]\{\},;:'"]/
    # Characters that continue an identifier; used for word-boundary checks.
    WORD_CHAR = /\w/

    class << self
      # StringScanner based single pass over +content+.
      #
      # After every "." it recognizes, in this order: `where(<column> =|:`,
      # `find_by(<column> =|:`, `joins(<association>` / `includes(<association>`
      # followed by ")" or ",", and direct `.<column>` access.
      #
      # @param content [String] source text
      # @param table_name [String] currently unused
      # @param column_name [String] foreign key column, e.g. "user_id"
      # @return [Integer] number of references found
      def count_column_references(content, table_name, column_name)
        content_lower = content.downcase
        column_lower = column_name.downcase
        association_name = column_name.sub(/_id\z/, '').downcase

        # Built once per call instead of once per loop iteration.
        direct_access = /#{Regexp.escape(column_lower)}\b/

        count = 0
        scanner = StringScanner.new(content_lower)

        while scanner.scan_until(/\./)
          if scanner.skip(/where\s*\(\s*/) && match_column_reference(scanner, column_lower)
            count += 1
            next
          end

          if scanner.skip(/find_by\s*\(\s*/) && match_column_reference(scanner, column_lower)
            count += 1
            next
          end

          # joins and includes are mutually exclusive at a given position.
          if scanner.skip(/joins\s*\(\s*/)
            if match_association_reference(scanner, association_name)
              count += 1
              next
            end
          elsif scanner.skip(/includes\s*\(\s*/)
            if match_association_reference(scanner, association_name)
              count += 1
              next
            end
          end

          count += 1 if scanner.skip(direct_access)
        end

        count
      end

      # Variant built on plain String#index lookups.
      #
      # Counts `.where(`/`.find_by(` calls whose first argument is the column,
      # `.joins(`/`.includes(` calls whose first argument is the association,
      # and direct `.<column>` access. Each prefix is tried with and without a
      # single space before the parenthesis.
      #
      # @param content [String] source text
      # @param table_name [String] currently unused
      # @param column_name [String] foreign key column, e.g. "user_id"
      # @return [Integer] number of references found
      def count_column_references_native(content, table_name, column_name)
        content_lower = content.downcase
        column_lower = column_name.downcase
        association_name = column_name.sub(/_id\z/, '').downcase

        column_prefixes = [".where(", ".where (", ".find_by(", ".find_by ("]
        association_prefixes = [".joins(", ".joins (", ".includes(", ".includes ("]

        count = column_prefixes.sum { |prefix| count_pattern_native(content_lower, prefix, column_lower) }
        count += association_prefixes.sum { |prefix| count_pattern_native(content_lower, prefix, association_name) }
        count + count_direct_access(content_lower, ".#{column_lower}")
      end

      private

      # Consumes an optional quote/colon and the column name at the scanner
      # position; truthy when the name is followed by "=" or ":" (optionally
      # after closing quotes and whitespace).
      def match_column_reference(scanner, column_name)
        scanner.skip(/['":]*/)
        return false unless scanner.skip(/#{Regexp.escape(column_name)}/)

        scanner.match?(/['":]*\s*[=:]/)
      end

      # Consumes an optional quote/colon and the association name at the
      # scanner position; truthy when the name is followed by ")" or ",".
      def match_association_reference(scanner, association_name)
        scanner.skip(/['":]*/)
        return false unless scanner.skip(/#{Regexp.escape(association_name)}/)

        scanner.match?(/['":]*\s*[\),]/)
      end

      # Counts occurrences of +prefix+ that are followed (after optional
      # spaces, tabs, quotes or colons) by +target+ as a complete identifier.
      def count_pattern_native(content, prefix, target)
        count = 0
        index = 0

        while (pos = content.index(prefix, index))
          check_pos = pos + prefix.length

          # Skip whitespace and quotes
          while check_pos < content.length && " \t'\":".include?(content[check_pos])
            check_pos += 1
          end

          if content[check_pos, target.length] == target && identifier_ends_at?(content, check_pos + target.length)
            count += 1
          end

          index = pos + 1
        end

        count
      end

      # Counts occurrences of +pattern+ that end at an identifier boundary.
      def count_direct_access(content, pattern)
        count = 0
        index = 0

        while (pos = content.index(pattern, index))
          count += 1 if identifier_ends_at?(content, pos + pattern.length)
          index = pos + 1
        end

        count
      end

      # True when no identifier character (letter, digit or underscore)
      # follows at +pos+. Digits and underscores count as identifier
      # characters so that e.g. ".user_id_was" or ":user_groups" are not
      # mistaken for "user_id" / "user".
      def identifier_ends_at?(content, pos)
        pos >= content.length || !content[pos].match?(WORD_CHAR)
      end
    end
  end
end
@@ -0,0 +1,111 @@
1
module SchemaSherlock
  # Centralized performance optimization for file and pattern processing
  class PerformanceOptimizer
    # File size thresholds for processing strategies
    SMALL_FILE_THRESHOLD = 64 * 1024 # 64KB
    LARGE_FILE_THRESHOLD = 1024 * 1024 # 1MB

    class << self
      # Reads +file_path+ as UTF-8 text, replacing invalid byte sequences.
      #
      # Files below LARGE_FILE_THRESHOLD are read in one call, larger ones in
      # chunks. Invalid bytes are replaced with String#scrub because the
      # `invalid:`/`undef:` options of File.read only apply when transcoding.
      #
      # @return [String] the content; "" for missing, unreadable or empty
      #   files and whenever reading raises
      def read_file_optimized(file_path)
        return "" unless File.exist?(file_path) && File.readable?(file_path)

        file_size = File.size(file_path)
        return "" if file_size.zero?

        if file_size < LARGE_FILE_THRESHOLD
          File.read(file_path, encoding: 'UTF-8').scrub
        else
          read_large_file_chunked(file_path)
        end
      rescue
        ""
      end

      # Counts references to +column_name+ in +content+.
      #
      # Returns 0 right away when the content is nil, shorter than the column
      # name, or does not contain the column name at all (case-insensitive);
      # otherwise delegates to OptimizedScanner.count_column_references_native.
      #
      # @return [Integer]
      def count_patterns_optimized(content, table_name, column_name)
        return 0 if content.nil? || content.length < column_name.length

        # Cheap substring pre-filter before the more expensive matching.
        return 0 unless content.downcase.include?(column_name.downcase)

        OptimizedScanner.count_column_references_native(content, table_name, column_name)
      end

      # Reads and scans the given files on a thread pool and sums the counts.
      #
      # The pool size is the smallest of: processor count, number of files, 8.
      # A file whose future fails contributes 0. The pool is shut down even
      # when collecting the results raises.
      #
      # @return [Integer] total number of references over all files
      def process_files_parallel(file_paths, table_name, column_name)
        return 0 if file_paths.empty?

        max_threads = [Concurrent.processor_count, file_paths.size, 8].min
        thread_pool = Concurrent::FixedThreadPool.new(max_threads)

        begin
          futures = file_paths.map do |file_path|
            Concurrent::Future.execute(executor: thread_pool) do
              content = read_file_optimized(file_path)
              count_patterns_optimized(content, table_name, column_name)
            end
          end

          futures.sum do |future|
            future.value || 0
          rescue
            0
          end
        ensure
          thread_pool.shutdown
          thread_pool.wait_for_termination(5)
        end
      end

      # For more than 1000 paths, keeps only files whose basename contains the
      # column name, the association name (column without "_id"), or one of
      # "model", "service", "query". Smaller lists are returned unchanged.
      #
      # @return [Array<String>]
      def filter_relevant_files(file_paths, column_name)
        return file_paths unless file_paths.size > 1000

        association_name = column_name.sub(/_id\z/, '')
        relevant_patterns = [column_name, association_name, 'model', 'service', 'query']

        file_paths.select do |path|
          filename = File.basename(path, '.rb').downcase
          relevant_patterns.any? { |pattern| filename.include?(pattern) }
        end
      end

      private

      # Reads a file in 64KB binary chunks and returns it as scrubbed UTF-8.
      def read_large_file_chunked(file_path)
        content = String.new
        chunk_size = 64 * 1024 # 64KB chunks

        File.open(file_path, 'rb') do |file|
          # OS hint for sequential access
          file.advise(:sequential) if file.respond_to?(:advise)

          while (chunk = file.read(chunk_size))
            content << chunk
          end
        end

        # Tag the bytes as UTF-8 once, then replace whatever is not valid UTF-8.
        content.force_encoding('UTF-8').scrub
      end
    end
  end
end
@@ -0,0 +1,118 @@
1
module SchemaSherlock
  # Class-level memoization of database schema metadata (table existence,
  # columns, indexes, primary keys) so that repeated lookups during an
  # analysis run do not hit the database connection again.
  class SchemaCache
    class << self
      # Creates four empty lookup tables, replacing any existing ones.
      def initialize_cache
        @table_exists_cache = {}
        @columns_cache = {}
        @indexes_cache = {}
        @primary_keys_cache = {}
      end

      # Empties every lookup table that has been created so far.
      def clear_cache
        [@table_exists_cache, @columns_cache, @indexes_cache, @primary_keys_cache].each do |store|
          store&.clear
        end
      end

      # Connection all metadata is read from.
      def connection
        ActiveRecord::Base.connection
      end

      # Loads existence, columns, indexes and primary key for every table the
      # connection lists.
      #
      # @return [Hash] number of entries per lookup table after loading
      def preload_all_metadata
        initialize_cache unless @table_exists_cache

        table_names = connection.tables
        table_names.each { |name| @table_exists_cache[name] = true }

        table_names.each do |name|
          @columns_cache[name] = connection.columns(name)
          @indexes_cache[name] = connection.indexes(name)
          @primary_keys_cache[name] = connection.primary_key(name)
        end

        {
          tables_cached: @table_exists_cache.size,
          columns_cached: @columns_cache.size,
          indexes_cached: @indexes_cache.size,
          primary_keys_cached: @primary_keys_cache.size
        }
      end

      # Whether +table_name+ exists; the answer (true or false) is remembered.
      def table_exists?(table_name)
        initialize_cache unless @table_exists_cache

        @table_exists_cache.fetch(table_name) do
          @table_exists_cache[table_name] = connection.table_exists?(table_name)
        end
      end

      # Column objects of +table_name+, or nil (not remembered) when the table
      # does not exist.
      def columns(table_name)
        initialize_cache unless @columns_cache

        @columns_cache.fetch(table_name) do
          next nil unless table_exists?(table_name)

          @columns_cache[table_name] = connection.columns(table_name)
        end
      end

      # Index definitions of +table_name+, or [] (not remembered) when the
      # table does not exist.
      def indexes(table_name)
        initialize_cache unless @indexes_cache

        @indexes_cache.fetch(table_name) do
          next [] unless table_exists?(table_name)

          @indexes_cache[table_name] = connection.indexes(table_name)
        end
      end

      # Primary key of +table_name+ as reported by the connection, or nil (not
      # remembered) when the table does not exist.
      def primary_key(table_name)
        initialize_cache unless @primary_keys_cache

        @primary_keys_cache.fetch(table_name) do
          next nil unless table_exists?(table_name)

          @primary_keys_cache[table_name] = connection.primary_key(table_name)
        end
      end

      # The column of +table_name+ named +column_name+, or nil when the table
      # or the column is unknown.
      def column(table_name, column_name)
        known_columns = columns(table_name)
        known_columns && known_columns.find { |col| col.name == column_name }
      end

      # Current number of entries in each lookup table (0 before first use).
      def cache_stats
        {
          table_exists_cache_size: @table_exists_cache&.size || 0,
          columns_cache_size: @columns_cache&.size || 0,
          indexes_cache_size: @indexes_cache&.size || 0,
          primary_keys_cache_size: @primary_keys_cache&.size || 0
        }
      end
    end
  end
end
@@ -1,6 +1,14 @@
1
+ require_relative 'file_cache'
2
+ require_relative 'optimized_scanner'
3
+ require_relative 'binary_index'
4
+ require_relative 'indexed_usage_tracker'
5
+ require_relative 'performance_optimizer'
6
+
1
7
  module SchemaSherlock
2
8
  class UsageTracker
3
9
  class << self
10
+ attr_accessor :binary_index
11
+
4
12
  def track_foreign_key_usage(model_class)
5
13
  return {} unless SchemaSherlock.configuration.min_usage_threshold
6
14
 
@@ -27,21 +35,41 @@ module SchemaSherlock
27
35
  end
28
36
 
29
37
  def scan_for_column_usage(table_name, column_name)
30
- count = 0
38
+ # Use binary index if available for fastest lookup
39
+ if binary_index
40
+ IndexedUsageTracker.count_column_references_with_index(binary_index, table_name, column_name)
41
+ else
42
+ # Use performance-optimized scanning
43
+ scan_with_performance_optimizer(table_name, column_name)
44
+ end
45
+ end
46
+
47
+ def scan_with_performance_optimizer(table_name, column_name)
48
+ all_files = get_relevant_files
49
+ filtered_files = PerformanceOptimizer.filter_relevant_files(all_files, column_name)
50
+
51
+ PerformanceOptimizer.process_files_parallel(filtered_files, table_name, column_name)
52
+ end
31
53
 
32
- # Scan common Rails directories for usage patterns
54
+ def get_relevant_files
55
+ files = []
33
56
  scan_directories.each do |dir|
34
57
  next unless Dir.exist?(dir)
35
58
 
36
- Dir.glob("#{dir}/**/*.rb").each do |file|
37
- content = File.read(file)
38
- count += count_column_references(content, table_name, column_name)
39
- rescue
40
- # Skip files that can't be read
59
+ Dir.glob(File.join(dir, "**/*.rb")).each do |file|
60
+ next if should_skip_file?(file)
61
+ files << file
41
62
  end
42
63
  end
64
+ files
65
+ end
43
66
 
44
- count
67
+ def should_skip_file?(file)
68
+ file.include?('/spec/') ||
69
+ file.include?('/test/') ||
70
+ file.include?('/vendor/') ||
71
+ file.include?('/node_modules/') ||
72
+ File.size(file) > 50 * 1024 * 1024 # Skip files larger than 50MB
45
73
  end
46
74
 
47
75
  def scan_directories
@@ -58,21 +86,7 @@ module SchemaSherlock
58
86
 
59
87
 
60
88
  def count_column_references(content, table_name, column_name)
61
- count = 0
62
-
63
- # Count WHERE clauses using the foreign key
64
- count += content.scan(/\.where\s*\(\s*['":]?#{column_name}['":]?\s*[=:]/i).length
65
- count += content.scan(/\.find_by\s*\(\s*['":]?#{column_name}['":]?\s*[=:]/i).length
66
-
67
- # Count joins using the foreign key
68
- association_name = column_name.gsub(/_id$/, '')
69
- count += content.scan(/\.joins\s*\(\s*['":]?#{association_name}['":]?\s*\)/i).length
70
- count += content.scan(/\.includes\s*\(\s*['":]?#{association_name}['":]?\s*\)/i).length
71
-
72
- # Count direct foreign key access
73
- count += content.scan(/\.#{column_name}\b/i).length
74
-
75
- count
89
+ OptimizedScanner.count_column_references_native(content, table_name, column_name)
76
90
  end
77
91
  end
78
92
  end
@@ -1,3 +1,3 @@
1
1
  module SchemaSherlock
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.1"
3
3
  end
@@ -30,6 +30,8 @@ Gem::Specification.new do |spec|
30
30
  spec.add_dependency "rails", ">= 6.0"
31
31
  spec.add_dependency "thor", "~> 1.0"
32
32
  spec.add_dependency "activerecord", ">= 6.0"
33
+ spec.add_dependency "concurrent-ruby", "~> 1.0"
34
+ spec.add_dependency "msgpack", "~> 1.0"
33
35
 
34
36
  # Development dependencies
35
37
  spec.add_development_dependency "rspec", "~> 3.0"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: schema_sherlock
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Prateek Choudhary
@@ -52,6 +52,34 @@ dependencies:
52
52
  - - ">="
53
53
  - !ruby/object:Gem::Version
54
54
  version: '6.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: concurrent-ruby
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: msgpack
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1.0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1.0'
55
83
  - !ruby/object:Gem::Dependency
56
84
  name: rspec
57
85
  requirement: !ruby/object:Gem::Requirement
@@ -85,10 +113,17 @@ files:
85
113
  - lib/schema_sherlock.rb
86
114
  - lib/schema_sherlock/analyzers/base_analyzer.rb
87
115
  - lib/schema_sherlock/analyzers/foreign_key_detector.rb
116
+ - lib/schema_sherlock/analyzers/index_recommendation_detector.rb
117
+ - lib/schema_sherlock/binary_index.rb
88
118
  - lib/schema_sherlock/commands/analyze_command.rb
89
119
  - lib/schema_sherlock/commands/base_command.rb
90
120
  - lib/schema_sherlock/configuration.rb
121
+ - lib/schema_sherlock/file_cache.rb
122
+ - lib/schema_sherlock/indexed_usage_tracker.rb
91
123
  - lib/schema_sherlock/model_loader.rb
124
+ - lib/schema_sherlock/optimized_scanner.rb
125
+ - lib/schema_sherlock/performance_optimizer.rb
126
+ - lib/schema_sherlock/schema_cache.rb
92
127
  - lib/schema_sherlock/usage_tracker.rb
93
128
  - lib/schema_sherlock/version.rb
94
129
  - schema_sherlock.gemspec