codebase_index 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171):
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +29 -0
  3. data/CODE_OF_CONDUCT.md +83 -0
  4. data/CONTRIBUTING.md +65 -0
  5. data/LICENSE.txt +21 -0
  6. data/README.md +481 -0
  7. data/exe/codebase-console-mcp +22 -0
  8. data/exe/codebase-index-mcp +61 -0
  9. data/exe/codebase-index-mcp-http +64 -0
  10. data/exe/codebase-index-mcp-start +58 -0
  11. data/lib/codebase_index/ast/call_site_extractor.rb +106 -0
  12. data/lib/codebase_index/ast/method_extractor.rb +76 -0
  13. data/lib/codebase_index/ast/node.rb +88 -0
  14. data/lib/codebase_index/ast/parser.rb +653 -0
  15. data/lib/codebase_index/ast.rb +6 -0
  16. data/lib/codebase_index/builder.rb +137 -0
  17. data/lib/codebase_index/chunking/chunk.rb +84 -0
  18. data/lib/codebase_index/chunking/semantic_chunker.rb +290 -0
  19. data/lib/codebase_index/console/adapters/cache_adapter.rb +58 -0
  20. data/lib/codebase_index/console/adapters/good_job_adapter.rb +66 -0
  21. data/lib/codebase_index/console/adapters/sidekiq_adapter.rb +66 -0
  22. data/lib/codebase_index/console/adapters/solid_queue_adapter.rb +66 -0
  23. data/lib/codebase_index/console/audit_logger.rb +75 -0
  24. data/lib/codebase_index/console/bridge.rb +170 -0
  25. data/lib/codebase_index/console/confirmation.rb +90 -0
  26. data/lib/codebase_index/console/connection_manager.rb +173 -0
  27. data/lib/codebase_index/console/console_response_renderer.rb +78 -0
  28. data/lib/codebase_index/console/model_validator.rb +81 -0
  29. data/lib/codebase_index/console/safe_context.rb +82 -0
  30. data/lib/codebase_index/console/server.rb +557 -0
  31. data/lib/codebase_index/console/sql_validator.rb +172 -0
  32. data/lib/codebase_index/console/tools/tier1.rb +118 -0
  33. data/lib/codebase_index/console/tools/tier2.rb +117 -0
  34. data/lib/codebase_index/console/tools/tier3.rb +110 -0
  35. data/lib/codebase_index/console/tools/tier4.rb +79 -0
  36. data/lib/codebase_index/coordination/pipeline_lock.rb +109 -0
  37. data/lib/codebase_index/cost_model/embedding_cost.rb +88 -0
  38. data/lib/codebase_index/cost_model/estimator.rb +128 -0
  39. data/lib/codebase_index/cost_model/provider_pricing.rb +67 -0
  40. data/lib/codebase_index/cost_model/storage_cost.rb +52 -0
  41. data/lib/codebase_index/cost_model.rb +22 -0
  42. data/lib/codebase_index/db/migrations/001_create_units.rb +38 -0
  43. data/lib/codebase_index/db/migrations/002_create_edges.rb +35 -0
  44. data/lib/codebase_index/db/migrations/003_create_embeddings.rb +37 -0
  45. data/lib/codebase_index/db/migrations/004_create_snapshots.rb +45 -0
  46. data/lib/codebase_index/db/migrations/005_create_snapshot_units.rb +40 -0
  47. data/lib/codebase_index/db/migrator.rb +71 -0
  48. data/lib/codebase_index/db/schema_version.rb +73 -0
  49. data/lib/codebase_index/dependency_graph.rb +227 -0
  50. data/lib/codebase_index/embedding/indexer.rb +130 -0
  51. data/lib/codebase_index/embedding/openai.rb +105 -0
  52. data/lib/codebase_index/embedding/provider.rb +135 -0
  53. data/lib/codebase_index/embedding/text_preparer.rb +112 -0
  54. data/lib/codebase_index/evaluation/baseline_runner.rb +115 -0
  55. data/lib/codebase_index/evaluation/evaluator.rb +146 -0
  56. data/lib/codebase_index/evaluation/metrics.rb +79 -0
  57. data/lib/codebase_index/evaluation/query_set.rb +148 -0
  58. data/lib/codebase_index/evaluation/report_generator.rb +90 -0
  59. data/lib/codebase_index/extracted_unit.rb +145 -0
  60. data/lib/codebase_index/extractor.rb +956 -0
  61. data/lib/codebase_index/extractors/action_cable_extractor.rb +228 -0
  62. data/lib/codebase_index/extractors/ast_source_extraction.rb +46 -0
  63. data/lib/codebase_index/extractors/behavioral_profile.rb +309 -0
  64. data/lib/codebase_index/extractors/caching_extractor.rb +261 -0
  65. data/lib/codebase_index/extractors/callback_analyzer.rb +232 -0
  66. data/lib/codebase_index/extractors/concern_extractor.rb +253 -0
  67. data/lib/codebase_index/extractors/configuration_extractor.rb +219 -0
  68. data/lib/codebase_index/extractors/controller_extractor.rb +494 -0
  69. data/lib/codebase_index/extractors/database_view_extractor.rb +278 -0
  70. data/lib/codebase_index/extractors/decorator_extractor.rb +260 -0
  71. data/lib/codebase_index/extractors/engine_extractor.rb +204 -0
  72. data/lib/codebase_index/extractors/event_extractor.rb +211 -0
  73. data/lib/codebase_index/extractors/factory_extractor.rb +289 -0
  74. data/lib/codebase_index/extractors/graphql_extractor.rb +917 -0
  75. data/lib/codebase_index/extractors/i18n_extractor.rb +117 -0
  76. data/lib/codebase_index/extractors/job_extractor.rb +369 -0
  77. data/lib/codebase_index/extractors/lib_extractor.rb +249 -0
  78. data/lib/codebase_index/extractors/mailer_extractor.rb +339 -0
  79. data/lib/codebase_index/extractors/manager_extractor.rb +202 -0
  80. data/lib/codebase_index/extractors/middleware_extractor.rb +133 -0
  81. data/lib/codebase_index/extractors/migration_extractor.rb +469 -0
  82. data/lib/codebase_index/extractors/model_extractor.rb +960 -0
  83. data/lib/codebase_index/extractors/phlex_extractor.rb +252 -0
  84. data/lib/codebase_index/extractors/policy_extractor.rb +214 -0
  85. data/lib/codebase_index/extractors/poro_extractor.rb +246 -0
  86. data/lib/codebase_index/extractors/pundit_extractor.rb +223 -0
  87. data/lib/codebase_index/extractors/rails_source_extractor.rb +473 -0
  88. data/lib/codebase_index/extractors/rake_task_extractor.rb +343 -0
  89. data/lib/codebase_index/extractors/route_extractor.rb +181 -0
  90. data/lib/codebase_index/extractors/scheduled_job_extractor.rb +331 -0
  91. data/lib/codebase_index/extractors/serializer_extractor.rb +334 -0
  92. data/lib/codebase_index/extractors/service_extractor.rb +254 -0
  93. data/lib/codebase_index/extractors/shared_dependency_scanner.rb +91 -0
  94. data/lib/codebase_index/extractors/shared_utility_methods.rb +99 -0
  95. data/lib/codebase_index/extractors/state_machine_extractor.rb +398 -0
  96. data/lib/codebase_index/extractors/test_mapping_extractor.rb +225 -0
  97. data/lib/codebase_index/extractors/validator_extractor.rb +225 -0
  98. data/lib/codebase_index/extractors/view_component_extractor.rb +310 -0
  99. data/lib/codebase_index/extractors/view_template_extractor.rb +261 -0
  100. data/lib/codebase_index/feedback/gap_detector.rb +89 -0
  101. data/lib/codebase_index/feedback/store.rb +119 -0
  102. data/lib/codebase_index/flow_analysis/operation_extractor.rb +209 -0
  103. data/lib/codebase_index/flow_analysis/response_code_mapper.rb +154 -0
  104. data/lib/codebase_index/flow_assembler.rb +290 -0
  105. data/lib/codebase_index/flow_document.rb +191 -0
  106. data/lib/codebase_index/flow_precomputer.rb +102 -0
  107. data/lib/codebase_index/formatting/base.rb +40 -0
  108. data/lib/codebase_index/formatting/claude_adapter.rb +98 -0
  109. data/lib/codebase_index/formatting/generic_adapter.rb +56 -0
  110. data/lib/codebase_index/formatting/gpt_adapter.rb +64 -0
  111. data/lib/codebase_index/formatting/human_adapter.rb +78 -0
  112. data/lib/codebase_index/graph_analyzer.rb +374 -0
  113. data/lib/codebase_index/mcp/index_reader.rb +394 -0
  114. data/lib/codebase_index/mcp/renderers/claude_renderer.rb +81 -0
  115. data/lib/codebase_index/mcp/renderers/json_renderer.rb +17 -0
  116. data/lib/codebase_index/mcp/renderers/markdown_renderer.rb +352 -0
  117. data/lib/codebase_index/mcp/renderers/plain_renderer.rb +240 -0
  118. data/lib/codebase_index/mcp/server.rb +935 -0
  119. data/lib/codebase_index/mcp/tool_response_renderer.rb +62 -0
  120. data/lib/codebase_index/model_name_cache.rb +51 -0
  121. data/lib/codebase_index/notion/client.rb +217 -0
  122. data/lib/codebase_index/notion/exporter.rb +219 -0
  123. data/lib/codebase_index/notion/mapper.rb +39 -0
  124. data/lib/codebase_index/notion/mappers/column_mapper.rb +65 -0
  125. data/lib/codebase_index/notion/mappers/migration_mapper.rb +39 -0
  126. data/lib/codebase_index/notion/mappers/model_mapper.rb +164 -0
  127. data/lib/codebase_index/notion/rate_limiter.rb +68 -0
  128. data/lib/codebase_index/observability/health_check.rb +81 -0
  129. data/lib/codebase_index/observability/instrumentation.rb +34 -0
  130. data/lib/codebase_index/observability/structured_logger.rb +75 -0
  131. data/lib/codebase_index/operator/error_escalator.rb +81 -0
  132. data/lib/codebase_index/operator/pipeline_guard.rb +99 -0
  133. data/lib/codebase_index/operator/status_reporter.rb +80 -0
  134. data/lib/codebase_index/railtie.rb +26 -0
  135. data/lib/codebase_index/resilience/circuit_breaker.rb +99 -0
  136. data/lib/codebase_index/resilience/index_validator.rb +185 -0
  137. data/lib/codebase_index/resilience/retryable_provider.rb +108 -0
  138. data/lib/codebase_index/retrieval/context_assembler.rb +249 -0
  139. data/lib/codebase_index/retrieval/query_classifier.rb +131 -0
  140. data/lib/codebase_index/retrieval/ranker.rb +273 -0
  141. data/lib/codebase_index/retrieval/search_executor.rb +327 -0
  142. data/lib/codebase_index/retriever.rb +160 -0
  143. data/lib/codebase_index/ruby_analyzer/class_analyzer.rb +190 -0
  144. data/lib/codebase_index/ruby_analyzer/dataflow_analyzer.rb +78 -0
  145. data/lib/codebase_index/ruby_analyzer/fqn_builder.rb +18 -0
  146. data/lib/codebase_index/ruby_analyzer/mermaid_renderer.rb +275 -0
  147. data/lib/codebase_index/ruby_analyzer/method_analyzer.rb +143 -0
  148. data/lib/codebase_index/ruby_analyzer/trace_enricher.rb +139 -0
  149. data/lib/codebase_index/ruby_analyzer.rb +87 -0
  150. data/lib/codebase_index/session_tracer/file_store.rb +111 -0
  151. data/lib/codebase_index/session_tracer/middleware.rb +143 -0
  152. data/lib/codebase_index/session_tracer/redis_store.rb +112 -0
  153. data/lib/codebase_index/session_tracer/session_flow_assembler.rb +263 -0
  154. data/lib/codebase_index/session_tracer/session_flow_document.rb +223 -0
  155. data/lib/codebase_index/session_tracer/solid_cache_store.rb +145 -0
  156. data/lib/codebase_index/session_tracer/store.rb +67 -0
  157. data/lib/codebase_index/storage/graph_store.rb +120 -0
  158. data/lib/codebase_index/storage/metadata_store.rb +169 -0
  159. data/lib/codebase_index/storage/pgvector.rb +163 -0
  160. data/lib/codebase_index/storage/qdrant.rb +172 -0
  161. data/lib/codebase_index/storage/vector_store.rb +156 -0
  162. data/lib/codebase_index/temporal/snapshot_store.rb +341 -0
  163. data/lib/codebase_index/version.rb +5 -0
  164. data/lib/codebase_index.rb +223 -0
  165. data/lib/generators/codebase_index/install_generator.rb +32 -0
  166. data/lib/generators/codebase_index/pgvector_generator.rb +37 -0
  167. data/lib/generators/codebase_index/templates/add_pgvector_to_codebase_index.rb.erb +15 -0
  168. data/lib/generators/codebase_index/templates/create_codebase_index_tables.rb.erb +43 -0
  169. data/lib/tasks/codebase_index.rake +583 -0
  170. data/lib/tasks/codebase_index_evaluation.rake +115 -0
  171. metadata +252 -0
@@ -0,0 +1,583 @@
1
+ # frozen_string_literal: true
2
+
3
+ # lib/tasks/codebase_index.rake
4
+ #
5
+ # Rake tasks for codebase indexing.
6
+ # These can be run manually or integrated into CI pipelines.
7
+ #
8
+ # Usage:
9
+ # bundle exec rake codebase_index:extract # Full extraction
10
+ # bundle exec rake codebase_index:incremental # Changed files only
11
+ # bundle exec rake codebase_index:extract_framework # Rails/gem sources only
12
+ # bundle exec rake codebase_index:validate # Validate index integrity
13
+ # bundle exec rake codebase_index:stats # Show index statistics
14
+ # bundle exec rake codebase_index:clean # Remove index
15
+ # bundle exec rake codebase_index:self_analyze # Analyze gem's own source
16
+ # bundle exec rake codebase_index:flow[EntryPoint] # Generate execution flow
17
+
18
+ namespace :codebase_index do
19
desc 'Full extraction of codebase for indexing'
task extract: :environment do
  require 'codebase_index/extractor'

  # CODEBASE_INDEX_OUTPUT overrides the default location under Rails.root/tmp.
  target_dir = ENV.fetch('CODEBASE_INDEX_OUTPUT', Rails.root.join('tmp/codebase_index'))
  rule = '=' * 50

  puts 'Starting full codebase extraction...'
  puts "Output directory: #{target_dir}"
  puts

  units_by_type = CodebaseIndex::Extractor.new(output_dir: target_dir).extract_all

  # Summary: one row per unit type, then the grand total.
  puts
  puts 'Extraction complete!'
  puts rule
  units_by_type.each { |type, units| puts " #{type.to_s.ljust(15)}: #{units.size} units" }
  puts rule
  puts " Total: #{units_by_type.values.sum(&:size)} units"
  puts
  puts "Output written to: #{target_dir}"
end
43
+
44
desc 'Incremental extraction based on git changes'
task incremental: :environment do
  require 'open3'
  require 'codebase_index/extractor'

  output_dir = ENV.fetch('CODEBASE_INDEX_OUTPUT', Rails.root.join('tmp/codebase_index'))

  # Runs `git diff --name-only <revision>` and returns the listed paths.
  # A failing git command is reported on stderr instead of being silently
  # mistaken for "nothing changed"; whatever git printed is still returned.
  git_changed_files = lambda do |revision|
    stdout, status = Open3.capture2('git', 'diff', '--name-only', revision)
    unless status.success?
      warn "WARNING: `git diff --name-only #{revision}` failed (exit status #{status.exitstatus}); " \
           'the list of changed files may be incomplete.'
    end
    stdout.lines.map(&:strip)
  end

  # GitLab sets CI_COMMIT_BEFORE_SHA to all zeros when there is no previous
  # commit to compare against (e.g. the first pipeline of a branch). That ref
  # cannot be diffed, so treat it as absent and fall through to the next strategy.
  gitlab_before_sha = ENV['CI_COMMIT_BEFORE_SHA']
  gitlab_before_sha = nil if gitlab_before_sha&.match?(/\A0+\z/)

  # Determine changed files from CI environment or git
  changed_files = if ENV['CHANGED_FILES']
                    # Explicit list from CI
                    ENV['CHANGED_FILES'].split(',').map(&:strip)
                  elsif gitlab_before_sha
                    # GitLab CI
                    git_changed_files.call("#{gitlab_before_sha}..#{ENV.fetch('CI_COMMIT_SHA', nil)}")
                  elsif ENV['GITHUB_BASE_REF']
                    # GitHub Actions PR
                    git_changed_files.call("origin/#{ENV['GITHUB_BASE_REF']}...HEAD")
                  else
                    # Default: changes since last commit
                    git_changed_files.call('HEAD~1')
                  end

  # Filter to relevant files
  relevant_patterns = [
    %r{^app/models/},
    %r{^app/controllers/},
    %r{^app/services/},
    %r{^app/components/},
    %r{^app/views/components/},
    %r{^app/views/.*\.rb$}, # Phlex views
    %r{^app/interactors/},
    %r{^app/operations/},
    %r{^app/commands/},
    %r{^app/use_cases/},
    %r{^app/jobs/},
    %r{^app/workers/}, # Sidekiq workers
    %r{^app/mailers/},
    %r{^app/graphql/}, # GraphQL types/mutations/resolvers
    %r{^app/serializers/},
    %r{^app/decorators/},
    %r{^app/blueprinters/},
    %r{^db/migrate/},
    %r{^db/schema\.rb$}, # Schema changes affect model metadata
    %r{^config/routes\.rb$},
    /^Gemfile\.lock$/ # Dependency changes trigger framework re-index
  ]

  changed_files = changed_files.select do |path|
    relevant_patterns.any? { |pattern| path.match?(pattern) }
  end

  if changed_files.empty?
    puts 'No relevant files changed. Skipping extraction.'
    # Leave only this task (as self_analyze does) so that any tasks queued
    # after it in the same rake invocation still run.
    next
  end

  puts "Incremental extraction for #{changed_files.size} changed files..."
  changed_files.each { |path| puts " - #{path}" }
  puts

  extractor = CodebaseIndex::Extractor.new(output_dir: output_dir)
  affected = extractor.extract_changed(changed_files)

  puts
  puts "Re-extracted #{affected.size} affected units."
end
116
+
117
desc 'Extract only Rails/gem framework sources (run when dependencies change)'
task extract_framework: :environment do
  require 'codebase_index/extractors/rails_source_extractor'

  index_root = ENV.fetch('CODEBASE_INDEX_OUTPUT', Rails.root.join('tmp/codebase_index'))

  puts 'Extracting Rails and gem framework sources...'
  puts "Rails version: #{Rails.version}"
  puts

  framework_units = CodebaseIndex::Extractors::RailsSourceExtractor.new.extract_all

  # Each unit becomes one JSON file under <output>/rails_source.
  destination = Pathname.new(index_root).join('rails_source')
  FileUtils.mkdir_p(destination)

  framework_units.each do |unit|
    # Path separators and namespace separators both collapse to a double underscore.
    basename = unit.identifier.gsub(%r{/|::}, '__')
    File.write(destination.join("#{basename}.json"), JSON.pretty_generate(unit.to_h))
  end

  puts "Extracted #{framework_units.size} framework source units."
  puts "Output: #{destination}"
end
145
+
146
desc 'Validate extracted index integrity'
task validate: :environment do
  output_dir = Pathname.new(ENV.fetch('CODEBASE_INDEX_OUTPUT', Rails.root.join('tmp/codebase_index')))

  unless output_dir.exist?
    puts "ERROR: Index directory does not exist: #{output_dir}"
    exit 1
  end

  manifest_path = output_dir.join('manifest.json')
  unless manifest_path.exist?
    puts 'ERROR: Manifest not found. Run extraction first.'
    exit 1
  end

  # A corrupt manifest is itself a validation failure: report it instead of
  # letting the parser exception escape as a stack trace.
  manifest = begin
    JSON.parse(File.read(manifest_path))
  rescue JSON::ParserError => e
    puts "ERROR: manifest.json: invalid JSON - #{e.message}"
    exit 1
  end

  unless manifest.is_a?(Hash)
    puts 'ERROR: manifest.json: expected a JSON object'
    exit 1
  end

  puts 'Validating index...'
  puts " Extracted at: #{manifest['extracted_at']}"
  puts " Git SHA: #{manifest['git_sha']}"
  puts

  errors = []
  warnings = []

  # A manifest without a usable "counts" table is recorded as an error; the
  # remaining checks (dependency graph) still run.
  counts = manifest['counts']
  unless counts.is_a?(Hash)
    errors << 'manifest.json: missing counts'
    counts = {}
  end

  # Check each type directory
  counts.each do |type, expected_count|
    type_dir = output_dir.join(type)
    unless type_dir.exist?
      errors << "Missing directory: #{type}"
      next
    end

    # Unit files are every *.json in the directory except the per-type index.
    unit_files = Dir[type_dir.join('*.json').to_s].reject { |f| f.end_with?('_index.json') }
    actual_count = unit_files.size

    warnings << "#{type}: expected #{expected_count}, found #{actual_count}" if actual_count != expected_count

    # Validate each unit file is valid JSON with the required keys
    unit_files.each do |file|
      data = JSON.parse(File.read(file))
      unless data.is_a?(Hash)
        errors << "#{file}: expected a JSON object"
        next
      end

      errors << "#{file}: missing identifier" unless data['identifier']
      errors << "#{file}: missing source_code" unless data['source_code']
    rescue JSON::ParserError => e
      errors << "#{file}: invalid JSON - #{e.message}"
    end
  end

  # Check dependency graph
  graph_path = output_dir.join('dependency_graph.json')
  if graph_path.exist?
    begin
      JSON.parse(File.read(graph_path))
    rescue JSON::ParserError
      errors << 'dependency_graph.json: invalid JSON'
    end
  else
    errors << 'Missing dependency_graph.json'
  end

  # Report
  if errors.any?
    puts 'ERRORS:'
    errors.each { |e| puts " ✗ #{e}" }
  end

  if warnings.any?
    puts 'WARNINGS:'
    warnings.each { |w| puts " ⚠ #{w}" }
  end

  if errors.empty? && warnings.empty?
    puts '✓ Index is valid.'
  elsif errors.empty?
    puts "\n✓ Index is valid with #{warnings.size} warning(s)."
  else
    puts "\n✗ Index has #{errors.size} error(s)."
    # Non-zero exit so CI pipelines fail on a broken index.
    exit 1
  end
end
229
+
230
desc 'Show index statistics'
task stats: :environment do
  index_root = Pathname.new(ENV.fetch('CODEBASE_INDEX_OUTPUT', Rails.root.join('tmp/codebase_index')))

  unless index_root.exist?
    puts 'Index directory does not exist. Run extraction first.'
    exit 1
  end

  # The manifest is optional here; every field falls back to "unknown".
  manifest_file = index_root.join('manifest.json')
  manifest = {}
  manifest = JSON.parse(File.read(manifest_file)) if manifest_file.exist?

  heavy_rule = '=' * 50
  light_rule = '-' * 50
  count_cell = ->(number) { number.to_s.rjust(4) }
  kb_cell = ->(bytes) { (bytes / 1024.0).round(1).to_s.rjust(8) }
  row = lambda do |label, units, chunks, bytes|
    " #{label.ljust(15)}: #{count_cell.call(units)} units, #{count_cell.call(chunks)} chunks, #{kb_cell.call(bytes)} KB"
  end

  puts 'Codebase Index Statistics'
  puts heavy_rule
  {
    'Extracted at' => 'extracted_at',
    'Rails version' => 'rails_version',
    'Ruby version' => 'ruby_version',
    'Git SHA' => 'git_sha',
    'Git branch' => 'git_branch'
  }.each do |label, key|
    puts " #{label}: #{manifest[key] || 'unknown'}"
  end
  puts

  puts 'Units by Type'
  puts light_rule

  bytes_total = 0
  units_total = 0
  chunks_total = 0

  (manifest['counts'] || {}).each do |type, unit_count|
    type_dir = index_root.join(type)
    next unless type_dir.exist?

    # On-disk size of every JSON file in the type directory.
    type_bytes = Dir[type_dir.join('*.json')].sum { |path| File.size(path) }
    bytes_total += type_bytes
    units_total += unit_count

    # Chunk counts come from the per-type _index.json, when present.
    type_chunks = 0
    type_index = type_dir.join('_index.json')
    if type_index.exist?
      entries = JSON.parse(File.read(type_index))
      type_chunks = entries.sum { |entry| entry['chunk_count'] || 0 }
      chunks_total += type_chunks
    end

    puts row.call(type, unit_count, type_chunks, type_bytes)
  end

  puts light_rule
  puts row.call('Total', units_total, chunks_total, bytes_total)
  puts

  # Dependency graph stats
  graph_file = index_root.join('dependency_graph.json')
  if graph_file.exist?
    graph_stats = JSON.parse(File.read(graph_file))['stats'] || {}
    puts 'Dependency Graph'
    puts light_rule
    puts " Nodes: #{graph_stats['node_count'] || 'unknown'}"
    puts " Edges: #{graph_stats['edge_count'] || 'unknown'}"
  end
end
293
+
294
desc 'Clean extracted index'
task clean: :environment do
  index_root = Pathname.new(ENV.fetch('CODEBASE_INDEX_OUTPUT', Rails.root.join('tmp/codebase_index')))

  # Nothing to remove: say so and finish the task.
  unless index_root.exist?
    puts 'Index directory does not exist.'
    next
  end

  puts "Removing #{index_root}..."
  FileUtils.rm_rf(index_root)
  puts 'Done.'
end
306
+
307
desc 'Retrieve context for a query (for testing)'
task :retrieve, [:query] => :environment do |_task, args|
  raise('Usage: rake codebase_index:retrieve[query]') unless args[:query]

  search_text = args[:query]

  %w[
    codebase_index
    codebase_index/retriever
    codebase_index/embedding/provider
    codebase_index/storage/vector_store
    codebase_index/storage/metadata_store
    codebase_index/storage/graph_store
    codebase_index/formatting/human_adapter
  ].each { |feature| require feature }

  settings = CodebaseIndex.configuration

  # Collaborators are built in the same order as before: provider first, then the stores.
  components = {
    embedding_provider: CodebaseIndex::Embedding::Provider::Ollama.new,
    vector_store: CodebaseIndex::Storage::VectorStore::InMemory.new,
    metadata_store: CodebaseIndex::Storage::MetadataStore::SQLite.new,
    graph_store: CodebaseIndex::Storage::GraphStore::Memory.new
  }

  retrieval = CodebaseIndex::Retriever.new(**components)
                                      .retrieve(search_text, budget: settings.max_context_tokens)

  puts CodebaseIndex::Formatting::HumanAdapter.new.format(retrieval)
end
338
+
339
desc 'Embed all extracted units'
task embed: :environment do
  %w[
    codebase_index
    codebase_index/embedding/indexer
    codebase_index/embedding/text_preparer
    codebase_index/embedding/provider
    codebase_index/storage/vector_store
  ].each { |feature| require feature }

  settings = CodebaseIndex.configuration
  # CODEBASE_INDEX_OUTPUT wins over the configured output directory.
  index_root = ENV.fetch('CODEBASE_INDEX_OUTPUT', settings.output_dir)

  embedder = CodebaseIndex::Embedding::Indexer.new(
    provider: CodebaseIndex::Embedding::Provider::Ollama.new,
    text_preparer: CodebaseIndex::Embedding::TextPreparer.new,
    vector_store: CodebaseIndex::Storage::VectorStore::InMemory.new,
    output_dir: index_root
  )

  puts 'Embedding all extracted units...'
  summary = embedder.index_all

  puts
  puts 'Embedding complete!'
  puts " Processed: #{summary[:processed]}"
  puts " Skipped: #{summary[:skipped]}"
  puts " Errors: #{summary[:errors]}"
end
370
+
371
desc 'Embed changed units only (incremental)'
task embed_incremental: :environment do
  require 'codebase_index'
  require 'codebase_index/embedding/indexer'
  require 'codebase_index/embedding/text_preparer'
  require 'codebase_index/embedding/provider'
  require 'codebase_index/storage/vector_store'

  # Output location: environment override first, configured value otherwise.
  index_root = ENV.fetch('CODEBASE_INDEX_OUTPUT', CodebaseIndex.configuration.output_dir)

  embedding_provider = CodebaseIndex::Embedding::Provider::Ollama.new
  preparer = CodebaseIndex::Embedding::TextPreparer.new
  store = CodebaseIndex::Storage::VectorStore::InMemory.new

  embedder = CodebaseIndex::Embedding::Indexer.new(
    provider: embedding_provider,
    text_preparer: preparer,
    vector_store: store,
    output_dir: index_root
  )

  puts 'Embedding changed units (incremental)...'
  outcome = embedder.index_incremental

  puts
  puts 'Incremental embedding complete!'
  [[' Processed', :processed], [' Skipped', :skipped], [' Errors', :errors]].each do |label, key|
    puts "#{label}: #{outcome[key]}"
  end
end
402
+
403
desc "Analyze the gem's own source code and generate self-analysis output"
task :self_analyze do
  require 'digest'
  require 'json'
  require 'fileutils'
  # Time#iso8601 comes from the `time` library on Rubies before 3.4, and this
  # task does not load the Rails environment that would otherwise provide it.
  require 'time'
  require 'codebase_index/ruby_analyzer'
  require 'codebase_index/dependency_graph'
  require 'codebase_index/graph_analyzer'
  require 'codebase_index/ruby_analyzer/mermaid_renderer'

  gem_root = File.expand_path('../..', __dir__)
  json_dir = File.join(gem_root, 'tmp', 'codebase_index_self')
  docs_dir = File.join(gem_root, 'docs', 'self-analysis')
  manifest_path = File.join(json_dir, 'manifest.json')

  # 1. Check staleness via source_checksum
  # Sort explicitly: the checksum is taken over the concatenated contents, so
  # it is only stable when the files are always read in the same order.
  lib_files = Dir.glob(File.join(gem_root, 'lib', '**', '*.rb')).sort
  source_content = lib_files.map { |f| File.read(f) }.join
  source_checksum = Digest::SHA256.hexdigest(source_content)

  if File.exist?(manifest_path)
    existing = JSON.parse(File.read(manifest_path))
    if existing['source_checksum'] == source_checksum
      puts 'Source unchanged — skipping self-analysis.'
      next
    end
  end

  puts 'Running self-analysis on gem source...'

  # 2. Run RubyAnalyzer
  units = CodebaseIndex::RubyAnalyzer.analyze(paths: [File.join(gem_root, 'lib', 'codebase_index')])
  puts " Analyzed #{units.size} units"

  # 3. Build DependencyGraph + GraphAnalyzer
  graph = CodebaseIndex::DependencyGraph.new
  units.each { |unit| graph.register(unit) }
  analyzer = CodebaseIndex::GraphAnalyzer.new(graph)
  analysis = analyzer.analyze
  graph_data = graph.to_h

  # 4. Write JSON to tmp/codebase_index_self/
  FileUtils.mkdir_p(json_dir)

  units.each do |unit|
    # Anything outside [a-zA-Z0-9_] in the identifier becomes an underscore.
    file_name = "#{unit.identifier.gsub(/[^a-zA-Z0-9_]/, '_')}.json"
    File.write(
      File.join(json_dir, file_name),
      JSON.pretty_generate(unit.to_h)
    )
  end

  File.write(
    File.join(json_dir, 'dependency_graph.json'),
    JSON.pretty_generate(graph_data)
  )

  File.write(
    File.join(json_dir, 'analysis.json'),
    JSON.pretty_generate(analysis)
  )

  # The manifest stores the checksum that the staleness check above compares against.
  manifest = {
    'source_checksum' => source_checksum,
    'generated_at' => Time.now.iso8601,
    'unit_count' => units.size,
    'node_count' => graph_data[:stats][:node_count],
    'edge_count' => graph_data[:stats][:edge_count]
  }
  File.write(manifest_path, JSON.pretty_generate(manifest))

  # 5. Render Mermaid to docs/self-analysis/
  FileUtils.mkdir_p(docs_dir)
  renderer = CodebaseIndex::RubyAnalyzer::MermaidRenderer.new

  File.write(
    File.join(docs_dir, 'architecture.md'),
    renderer.render_architecture(units, graph_data, analysis)
  )

  File.write(
    File.join(docs_dir, 'call-graph.md'),
    "# Call Graph\n\n```mermaid\n#{renderer.render_call_graph(units)}\n```\n"
  )

  File.write(
    File.join(docs_dir, 'dependency-map.md'),
    "# Dependency Map\n\n```mermaid\n#{renderer.render_dependency_map(graph_data)}\n```\n"
  )

  File.write(
    File.join(docs_dir, 'dataflow.md'),
    "# Data Flow\n\n```mermaid\n#{renderer.render_dataflow(units)}\n```\n"
  )

  puts " JSON output: #{json_dir}"
  puts " Mermaid docs: #{docs_dir}"
  puts 'Self-analysis complete.'
end
502
+
503
desc 'Generate execution flow document for a Rails entry point'
task :flow, [:entry_point] => :environment do |_task, args|
  require 'json'
  require 'codebase_index/flow_assembler'
  require 'codebase_index/dependency_graph'

  entry_point = args[:entry_point]
  unless entry_point
    puts 'Usage: rake codebase_index:flow[EntryPoint#method]'
    exit 1
  end

  index_root = ENV.fetch('CODEBASE_INDEX_OUTPUT', Rails.root.join('tmp/codebase_index'))
  graph_file = File.join(index_root, 'dependency_graph.json')

  unless File.exist?(graph_file)
    puts "ERROR: Dependency graph not found at #{graph_file}"
    puts 'Run codebase_index:extract first.'
    exit 1
  end

  # Rebuild the dependency graph from its serialized form.
  dependency_graph = CodebaseIndex::DependencyGraph.from_h(JSON.parse(File.read(graph_file)))

  # MAX_DEPTH bounds the traversal (default 5).
  depth_limit = ENV.fetch('MAX_DEPTH', 5).to_i
  flow_document = CodebaseIndex::FlowAssembler
                  .new(graph: dependency_graph, extracted_dir: index_root)
                  .assemble(entry_point, max_depth: depth_limit)

  # FORMAT=json selects JSON; any other value prints markdown.
  output_format = ENV.fetch('FORMAT', 'markdown').downcase
  if output_format == 'json'
    puts JSON.pretty_generate(flow_document.to_h)
  else
    puts flow_document.to_markdown
  end
end
540
+
541
desc 'Sync extraction data to Notion databases (Data Models + Columns)'
task notion_sync: :environment do
  require 'codebase_index/notion/exporter'

  settings = CodebaseIndex.configuration
  # Env var takes precedence over configured value
  settings.notion_api_token = ENV['NOTION_API_TOKEN'] || settings.notion_api_token

  unless settings.notion_api_token
    puts 'ERROR: Notion API token not configured.'
    puts 'Set NOTION_API_TOKEN env var or configure notion_api_token in CodebaseIndex.configure.'
    exit 1
  end

  index_root = ENV.fetch('CODEBASE_INDEX_OUTPUT', settings.output_dir)

  database_ids = settings.notion_database_ids || {}
  if database_ids.empty?
    puts 'ERROR: No Notion database IDs configured.'
    puts 'Set notion_database_ids in CodebaseIndex.configure:'
    puts ' config.notion_database_ids = { data_models: "db-uuid", columns: "db-uuid" }'
    exit 1
  end

  puts 'Syncing extraction data to Notion...'
  puts " Output dir: #{index_root}"
  puts " Databases: #{database_ids.keys.join(', ')}"
  puts

  summary = CodebaseIndex::Notion::Exporter.new(index_dir: index_root).sync_all

  puts 'Sync complete!'
  puts " Data Models: #{summary[:data_models]} synced"
  puts " Columns: #{summary[:columns]} synced"

  # Show at most five errors, then a count of the remainder.
  failures = summary[:errors]
  if failures.any?
    puts " Errors: #{failures.size}"
    failures.first(5).each { |failure| puts " - #{failure}" }
    puts " ... and #{failures.size - 5} more" if failures.size > 5
  end
end
583
+ end
@@ -0,0 +1,115 @@
1
+ # frozen_string_literal: true
2
+
3
+ # lib/tasks/codebase_index_evaluation.rake
4
+ #
5
+ # Rake tasks for evaluating retrieval quality.
6
+ #
7
+ # Usage:
8
+ # bundle exec rake codebase_index:evaluate # Run evaluation
9
+ # bundle exec rake codebase_index:evaluate:baseline[grep] # Run baseline comparison
10
+
11
namespace :codebase_index do
  desc 'Run evaluation queries against the retrieval pipeline'
  task evaluate: :environment do
    # Load the retrieval pipeline and evaluation components, then run the evaluation.
    %w[
      codebase_index/retriever
      codebase_index/evaluation/query_set
      codebase_index/evaluation/evaluator
      codebase_index/evaluation/report_generator
    ].each { |feature| require feature }

    run_evaluation
  end

  namespace :evaluate do
    desc 'Run baseline comparison'
    task :baseline, [:strategy] => :environment do |_task, task_args|
      # The baseline run needs the query set, the runner and the metrics.
      %w[
        codebase_index/evaluation/query_set
        codebase_index/evaluation/baseline_runner
        codebase_index/evaluation/metrics
      ].each { |feature| require feature }

      run_baseline(task_args)
    end
  end
end
33
+
34
# Runs the evaluation query set through the retrieval pipeline, saves the
# report and prints a summary.
#
# Reads EVAL_QUERY_SET (query set path), EVAL_OUTPUT (report path) and
# EVAL_BUDGET (budget passed to the evaluator) from the environment.
def run_evaluation
  queries_file = ENV.fetch('EVAL_QUERY_SET', 'config/eval_queries.json')
  report_file = ENV.fetch('EVAL_OUTPUT', 'tmp/eval_report.json')
  token_budget = ENV.fetch('EVAL_BUDGET', '8000').to_i

  puts "Loading query set from: #{queries_file}"
  queries = CodebaseIndex::Evaluation::QuerySet.load(queries_file)
  puts "Loaded #{queries.size} queries — building retriever..."

  results = CodebaseIndex::Evaluation::Evaluator.new(
    retriever: build_eval_retriever,
    query_set: queries,
    budget: token_budget
  ).evaluate

  report_writer = CodebaseIndex::Evaluation::ReportGenerator.new
  report_writer.save(results, report_file, metadata: { 'query_set' => queries_file })

  print_eval_report(results, report_file)
end
53
+
54
# Runs a baseline retrieval strategy over the query set and prints mean metrics.
#
# The strategy comes from the task argument, falling back to
# EVAL_BASELINE_STRATEGY and then 'grep'. EVAL_QUERY_SET and
# EVAL_BASELINE_LIMIT are also read from the environment.
#
# @param args [#[]] rake task arguments; :strategy is optional
def run_baseline(args)
  chosen = args[:strategy] || ENV.fetch('EVAL_BASELINE_STRATEGY', 'grep')
  strategy = chosen.to_sym
  queries_file = ENV.fetch('EVAL_QUERY_SET', 'config/eval_queries.json')
  result_limit = ENV.fetch('EVAL_BASELINE_LIMIT', '10').to_i

  puts "Loading query set from: #{queries_file}"
  queries = CodebaseIndex::Evaluation::QuerySet.load(queries_file)
  puts "Running #{strategy} baseline (limit: #{result_limit})..."

  baseline = CodebaseIndex::Evaluation::BaselineRunner.new(metadata_store: CodebaseIndex.metadata_store)

  sums = compute_baseline_totals(queries, baseline, strategy, result_limit)
  print_baseline_report(strategy, queries.size, sums)
end
70
+
71
# Sums MRR and recall over every query in the set for one baseline strategy.
#
# @param query_set [#queries] collection whose entries respond to #query and #expected_units
# @param runner [#run] baseline runner, called as run(text, strategy:, limit:)
# @param strategy [Symbol] baseline strategy name
# @param limit [Integer] result limit passed to the runner
# @return [Hash] { mrr: Float, recall: Float } — sums, not means
def compute_baseline_totals(query_set, runner, strategy, limit)
  query_set.queries.each_with_object({ mrr: 0.0, recall: 0.0 }) do |entry, sums|
    hits = runner.run(entry.query, strategy: strategy, limit: limit)
    sums[:mrr] += CodebaseIndex::Evaluation::Metrics.mrr(hits, entry.expected_units)
    sums[:recall] += CodebaseIndex::Evaluation::Metrics.recall(hits, entry.expected_units)
  end
end
83
+
84
# Prints the aggregate metrics of an evaluation report to stdout.
#
# @param report [#aggregates] report whose aggregates yield key/value pairs
# @param output_path [String] where the report was saved (echoed to the user)
def print_eval_report(report, output_path)
  rule = '=' * 50

  puts
  puts 'Evaluation complete!'
  puts rule
  report.aggregates.each do |name, metric|
    # Floats are shown with four decimals; everything else via #to_s.
    rendered = if metric.is_a?(Float)
                 format('%.4f', metric)
               else
                 metric.to_s
               end
    puts " #{name.to_s.ljust(25)}: #{rendered}"
  end
  puts rule
  puts "Report saved to: #{output_path}"
end
95
+
96
# Prints mean MRR and mean recall for a baseline run to stdout.
#
# @param strategy [Symbol, String] baseline strategy name
# @param count [Integer] number of queries the totals were summed over
# @param totals [Hash] summed metrics under :mrr and :recall
def print_baseline_report(strategy, count, totals)
  # Mean of a summed metric; 0.0 when there were no queries.
  mean_of = ->(metric) { count.positive? ? totals[metric] / count : 0.0 }
  rule = '=' * 50

  puts
  puts "Baseline: #{strategy}"
  puts rule
  puts format(' Mean MRR: %.4f', mean_of.call(:mrr))
  puts format(' Mean Recall: %.4f', mean_of.call(:recall))
  puts rule
end
104
+
105
# Assemble the retriever used by the evaluation task from the stores and
# embedding provider exposed on the CodebaseIndex module.
#
# @return [CodebaseIndex::Retriever]
def build_eval_retriever
  dependencies = {
    vector_store: CodebaseIndex.vector_store,
    metadata_store: CodebaseIndex.metadata_store,
    graph_store: CodebaseIndex.graph_store,
    embedding_provider: CodebaseIndex.embedding_provider
  }

  CodebaseIndex::Retriever.new(**dependencies)
end