codebase_index 0.2.1 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +60 -0
  3. data/README.md +95 -300
  4. data/exe/codebase-index-mcp +3 -31
  5. data/exe/codebase-index-mcp-http +3 -31
  6. data/lib/codebase_index/ast/method_extractor.rb +3 -8
  7. data/lib/codebase_index/ast/node.rb +28 -0
  8. data/lib/codebase_index/ast/parser.rb +53 -92
  9. data/lib/codebase_index/builder.rb +67 -4
  10. data/lib/codebase_index/cache/cache_middleware.rb +199 -0
  11. data/lib/codebase_index/cache/cache_store.rb +264 -0
  12. data/lib/codebase_index/cache/redis_cache_store.rb +116 -0
  13. data/lib/codebase_index/cache/solid_cache_store.rb +111 -0
  14. data/lib/codebase_index/chunking/semantic_chunker.rb +29 -24
  15. data/lib/codebase_index/console/adapters/good_job_adapter.rb +7 -40
  16. data/lib/codebase_index/console/adapters/job_adapter.rb +68 -0
  17. data/lib/codebase_index/console/adapters/sidekiq_adapter.rb +7 -40
  18. data/lib/codebase_index/console/adapters/solid_queue_adapter.rb +7 -40
  19. data/lib/codebase_index/console/bridge.rb +7 -0
  20. data/lib/codebase_index/console/console_response_renderer.rb +3 -7
  21. data/lib/codebase_index/console/embedded_executor.rb +2 -1
  22. data/lib/codebase_index/console/server.rb +1 -4
  23. data/lib/codebase_index/dependency_graph.rb +28 -19
  24. data/lib/codebase_index/embedding/indexer.rb +18 -8
  25. data/lib/codebase_index/embedding/openai.rb +27 -6
  26. data/lib/codebase_index/embedding/provider.rb +29 -2
  27. data/lib/codebase_index/evaluation/evaluator.rb +5 -12
  28. data/lib/codebase_index/extractor.rb +40 -44
  29. data/lib/codebase_index/extractors/action_cable_extractor.rb +9 -36
  30. data/lib/codebase_index/extractors/callback_analyzer.rb +22 -8
  31. data/lib/codebase_index/extractors/controller_extractor.rb +3 -93
  32. data/lib/codebase_index/extractors/decorator_extractor.rb +7 -14
  33. data/lib/codebase_index/extractors/engine_extractor.rb +20 -1
  34. data/lib/codebase_index/extractors/graphql_extractor.rb +4 -29
  35. data/lib/codebase_index/extractors/job_extractor.rb +11 -6
  36. data/lib/codebase_index/extractors/lib_extractor.rb +0 -31
  37. data/lib/codebase_index/extractors/mailer_extractor.rb +15 -85
  38. data/lib/codebase_index/extractors/manager_extractor.rb +1 -15
  39. data/lib/codebase_index/extractors/model_extractor.rb +20 -53
  40. data/lib/codebase_index/extractors/phlex_extractor.rb +8 -8
  41. data/lib/codebase_index/extractors/policy_extractor.rb +1 -24
  42. data/lib/codebase_index/extractors/poro_extractor.rb +0 -17
  43. data/lib/codebase_index/extractors/serializer_extractor.rb +12 -7
  44. data/lib/codebase_index/extractors/service_extractor.rb +1 -38
  45. data/lib/codebase_index/extractors/shared_utility_methods.rb +183 -1
  46. data/lib/codebase_index/extractors/validator_extractor.rb +3 -17
  47. data/lib/codebase_index/extractors/view_component_extractor.rb +10 -9
  48. data/lib/codebase_index/filename_utils.rb +32 -0
  49. data/lib/codebase_index/flow_analysis/operation_extractor.rb +1 -4
  50. data/lib/codebase_index/formatting/base.rb +0 -10
  51. data/lib/codebase_index/graph_analyzer.rb +1 -1
  52. data/lib/codebase_index/mcp/bootstrapper.rb +58 -0
  53. data/lib/codebase_index/mcp/renderers/markdown_renderer.rb +35 -34
  54. data/lib/codebase_index/mcp/renderers/plain_renderer.rb +29 -29
  55. data/lib/codebase_index/mcp/server.rb +59 -68
  56. data/lib/codebase_index/mcp/tool_response_renderer.rb +23 -0
  57. data/lib/codebase_index/notion/client.rb +2 -2
  58. data/lib/codebase_index/notion/mapper.rb +1 -0
  59. data/lib/codebase_index/notion/mappers/column_mapper.rb +3 -11
  60. data/lib/codebase_index/notion/mappers/model_mapper.rb +20 -23
  61. data/lib/codebase_index/notion/mappers/shared.rb +22 -0
  62. data/lib/codebase_index/observability/health_check.rb +0 -2
  63. data/lib/codebase_index/observability/structured_logger.rb +12 -30
  64. data/lib/codebase_index/operator/pipeline_guard.rb +0 -7
  65. data/lib/codebase_index/resilience/index_validator.rb +3 -21
  66. data/lib/codebase_index/retrieval/context_assembler.rb +19 -7
  67. data/lib/codebase_index/retrieval/query_classifier.rb +14 -12
  68. data/lib/codebase_index/retrieval/ranker.rb +6 -2
  69. data/lib/codebase_index/retrieval/search_executor.rb +8 -19
  70. data/lib/codebase_index/retriever.rb +1 -9
  71. data/lib/codebase_index/ruby_analyzer/class_analyzer.rb +5 -25
  72. data/lib/codebase_index/ruby_analyzer/dataflow_analyzer.rb +6 -7
  73. data/lib/codebase_index/ruby_analyzer/mermaid_renderer.rb +58 -53
  74. data/lib/codebase_index/ruby_analyzer/trace_enricher.rb +11 -7
  75. data/lib/codebase_index/session_tracer/file_store.rb +1 -8
  76. data/lib/codebase_index/session_tracer/redis_store.rb +1 -7
  77. data/lib/codebase_index/session_tracer/session_flow_assembler.rb +4 -13
  78. data/lib/codebase_index/session_tracer/solid_cache_store.rb +1 -7
  79. data/lib/codebase_index/session_tracer/store.rb +14 -0
  80. data/lib/codebase_index/storage/metadata_store.rb +37 -10
  81. data/lib/codebase_index/storage/pgvector.rb +37 -5
  82. data/lib/codebase_index/storage/qdrant.rb +39 -6
  83. data/lib/codebase_index/storage/vector_store.rb +11 -0
  84. data/lib/codebase_index/temporal/snapshot_store.rb +14 -10
  85. data/lib/codebase_index/token_utils.rb +19 -0
  86. data/lib/codebase_index/version.rb +1 -1
  87. data/lib/codebase_index.rb +25 -6
  88. data/lib/tasks/codebase_index.rake +2 -2
  89. metadata +11 -2
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'json'
4
- require 'digest'
4
+ require_relative '../filename_utils'
5
5
 
6
6
  module CodebaseIndex
7
7
  module Resilience
@@ -18,6 +18,8 @@ module CodebaseIndex
18
18
  # report = validator.validate
19
19
  # puts report.errors if !report.valid?
20
20
  class IndexValidator
21
+ include CodebaseIndex::FilenameUtils
22
+
21
23
  # Report produced by {#validate}.
22
24
  #
23
25
  # @!attribute [r] valid?
@@ -160,26 +162,6 @@ module CodebaseIndex
160
162
  warnings << "Stale file not in index: #{type_name}/#{basename}"
161
163
  end
162
164
  end
163
-
164
- # Convert an identifier to a safe filename (legacy format, mirrors Extractor#safe_filename).
165
- #
166
- # @param identifier [String] The unit identifier (e.g., "Admin::UsersController")
167
- # @return [String] A filesystem-safe filename (e.g., "Admin__UsersController.json")
168
- def safe_filename(identifier)
169
- "#{identifier.gsub('::', '__').gsub(/[^a-zA-Z0-9_-]/, '_')}.json"
170
- end
171
-
172
- # Convert an identifier to a collision-safe filename (current format).
173
- # Mirrors {Extractor#collision_safe_filename} — appends a short SHA256 digest
174
- # to disambiguate identifiers that normalize to the same safe_filename.
175
- #
176
- # @param identifier [String] The unit identifier
177
- # @return [String] Collision-safe filename (e.g., "Admin__UsersController_a1b2c3d4.json")
178
- def collision_safe_filename(identifier)
179
- base = identifier.gsub('::', '__').gsub(/[^a-zA-Z0-9_-]/, '_')
180
- digest = Digest::SHA256.hexdigest(identifier)[0, 8]
181
- "#{base}_#{digest}.json"
182
- end
183
165
  end
184
166
  end
185
167
  end
@@ -54,6 +54,9 @@ module CodebaseIndex
54
54
  sources = []
55
55
  tokens_used = 0
56
56
 
57
+ # Pre-fetch all candidate metadata in one batch query
58
+ @unit_cache = @metadata_store.find_batch(candidates.map(&:identifier))
59
+
57
60
  # 1. Structural context (always first if provided)
58
61
  tokens_used = add_structural_section(sections, structural_context, tokens_used, effective_budget)
59
62
 
@@ -141,7 +144,7 @@ module CodebaseIndex
141
144
 
142
145
  # Append a single candidate to the section. Returns updated tokens_used, or nil to stop.
143
146
  def append_candidate(parts, sources, candidate, budget, tokens_used)
144
- unit = @metadata_store.find(candidate.identifier)
147
+ unit = @unit_cache[candidate.identifier]
145
148
  return tokens_used unless unit
146
149
 
147
150
  text = format_unit(unit, candidate)
@@ -165,10 +168,10 @@ module CodebaseIndex
165
168
  # @param candidate [Candidate] The search candidate
166
169
  # @return [String]
167
170
  def format_unit(unit, _candidate)
168
- identifier = unit[:identifier] || unit['identifier']
169
- type = unit[:type] || unit['type']
170
- file_path = unit[:file_path] || unit['file_path']
171
- source = unit[:source_code] || unit['source_code'] || ''
171
+ identifier = unit_field(unit, :identifier)
172
+ type = unit_field(unit, :type)
173
+ file_path = unit_field(unit, :file_path)
174
+ source = unit_field(unit, :source_code) || ''
172
175
 
173
176
  <<~UNIT.strip
174
177
  ## #{identifier} (#{type})
@@ -184,14 +187,23 @@ module CodebaseIndex
184
187
  def build_source_attribution(candidate, unit, truncated: false)
185
188
  attribution = {
186
189
  identifier: candidate.identifier,
187
- type: unit[:type] || unit['type'],
190
+ type: unit_field(unit, :type),
188
191
  score: candidate.score,
189
- file_path: unit[:file_path] || unit['file_path']
192
+ file_path: unit_field(unit, :file_path)
190
193
  }
191
194
  attribution[:truncated] = true if truncated
192
195
  attribution
193
196
  end
194
197
 
198
+ # Read a field from a unit hash, accepting either symbol or string keys.
199
+ #
200
+ # @param unit [Hash]
201
+ # @param key [Symbol]
202
+ # @return [Object, nil]
203
+ def unit_field(unit, key)
204
+ unit[key] || unit[key.to_s]
205
+ end
206
+
195
207
  # Check if a candidate is framework source.
196
208
  #
197
209
  # @param candidate [Candidate]
@@ -87,28 +87,30 @@ module CodebaseIndex
87
87
  # @param query [String]
88
88
  # @return [Symbol]
89
89
  def detect_intent(query)
90
- INTENT_PATTERNS.each do |intent, pattern|
91
- return intent if query.match?(pattern)
92
- end
93
- :understand # default
90
+ match_first(INTENT_PATTERNS, query, default: :understand)
94
91
  end
95
92
 
96
93
  # @param query [String]
97
94
  # @return [Symbol]
98
95
  def detect_scope(query)
99
- SCOPE_PATTERNS.each do |scope, pattern|
100
- return scope if query.match?(pattern)
101
- end
102
- :focused # default
96
+ match_first(SCOPE_PATTERNS, query, default: :focused)
103
97
  end
104
98
 
105
99
  # @param query [String]
106
100
  # @return [Symbol, nil]
107
101
  def detect_target_type(query)
108
- TARGET_PATTERNS.each do |type, pattern|
109
- return type if query.match?(pattern)
110
- end
111
- nil # no specific type detected
102
+ match_first(TARGET_PATTERNS, query, default: nil)
103
+ end
104
+
105
+ # Match query against a hash of {key => pattern}, returning the first matching key.
106
+ #
107
+ # @param patterns [Hash{Symbol => Regexp}]
108
+ # @param query [String]
109
+ # @param default [Object] value if no pattern matches
110
+ # @return [Object]
111
+ def match_first(patterns, query, default:)
112
+ patterns.each { |key, pattern| return key if query.match?(pattern) }
113
+ default
112
114
  end
113
115
 
114
116
  # @param query [String]
@@ -102,8 +102,9 @@ module CodebaseIndex
102
102
  #
103
103
  # @return [Array<Candidate>]
104
104
  def rebuild_rrf_candidates(candidates, rrf_scores, metadata_map)
105
+ original_by_id = candidates.index_by(&:identifier)
105
106
  rrf_scores.sort_by { |_id, score| -score }.map do |identifier, score|
106
- original = candidates.find { |c| c.identifier == identifier }
107
+ original = original_by_id[identifier]
107
108
  build_candidate(
108
109
  identifier: identifier,
109
110
  score: score,
@@ -119,8 +120,11 @@ module CodebaseIndex
119
120
  # @param classification [QueryClassifier::Classification]
120
121
  # @return [Array<Hash>]
121
122
  def score_candidates(candidates, classification)
123
+ # Batch-fetch all metadata in one query instead of per-candidate lookups
124
+ unit_map = @metadata_store.find_batch(candidates.map(&:identifier))
125
+
122
126
  candidates.map do |candidate|
123
- unit = @metadata_store.find(candidate.identifier)
127
+ unit = unit_map[candidate.identifier]
124
128
 
125
129
  {
126
130
  candidate: candidate,
@@ -33,26 +33,11 @@ module CodebaseIndex
33
33
 
34
34
  # Strategy mapping from (intent, scope) → strategy.
35
35
  #
36
- # Pinpoint scope always uses :direct for locate/reference.
37
- # Comprehensive/exploratory scopes use :hybrid.
38
- # Framework intent always uses :keyword against framework sources.
36
+ # Covers pinpoint overrides for locate/reference (:direct).
37
+ # Trace and framework intents are handled before this map is consulted.
39
38
  STRATEGY_MAP = {
40
- # [intent, scope] => strategy
41
- # Pinpoint
42
39
  %i[locate pinpoint] => :direct,
43
- %i[reference pinpoint] => :direct,
44
-
45
- # Trace always uses graph
46
- %i[trace pinpoint] => :graph,
47
- %i[trace focused] => :graph,
48
- %i[trace exploratory] => :graph,
49
- %i[trace comprehensive] => :graph,
50
-
51
- # Framework always keyword
52
- %i[framework pinpoint] => :keyword,
53
- %i[framework focused] => :keyword,
54
- %i[framework exploratory] => :keyword,
55
- %i[framework comprehensive] => :keyword
40
+ %i[reference pinpoint] => :direct
56
41
  }.freeze
57
42
 
58
43
  # @param vector_store [Storage::VectorStore::Interface] Vector store adapter
@@ -93,7 +78,11 @@ module CodebaseIndex
93
78
  intent = classification.intent
94
79
  scope = classification.scope
95
80
 
96
- # Check explicit mapping first
81
+ # Intent-level overrides (apply regardless of scope)
82
+ return :graph if intent == :trace
83
+ return :keyword if intent == :framework
84
+
85
+ # Pinpoint overrides for locate/reference
97
86
  mapped = STRATEGY_MAP[[intent, scope]]
98
87
  return mapped if mapped
99
88
 
@@ -140,7 +140,7 @@ module CodebaseIndex
140
140
  return nil if total.zero?
141
141
 
142
142
  type_counts = STRUCTURAL_TYPES.filter_map do |type|
143
- count = count_by_type(type)
143
+ count = @metadata_store.find_by_type(type).size
144
144
  "#{count} #{type}s" if count.positive?
145
145
  end
146
146
 
@@ -148,13 +148,5 @@ module CodebaseIndex
148
148
  rescue StandardError
149
149
  nil
150
150
  end
151
-
152
- # Count units of a given type in the metadata store.
153
- #
154
- # @param type [String] The unit type to count
155
- # @return [Integer] Number of units of this type
156
- def count_by_type(type)
157
- @metadata_store.find_by_type(type).size
158
- end
159
151
  end
160
152
  end
@@ -18,6 +18,7 @@ module CodebaseIndex
18
18
  #
19
19
  class ClassAnalyzer
20
20
  include FqnBuilder
21
+ include Ast::SourceSpan
21
22
 
22
23
  # @param parser [Ast::Parser, nil] Parser instance (creates default if nil)
23
24
  def initialize(parser: nil)
@@ -43,9 +44,9 @@ module CodebaseIndex
43
44
 
44
45
  case node.type
45
46
  when :class
46
- process_class(node, source, file_path, namespace_stack, units)
47
+ process_definition(node, :ruby_class, source, file_path, namespace_stack, units)
47
48
  when :module
48
- process_module(node, source, file_path, namespace_stack, units)
49
+ process_definition(node, :ruby_module, source, file_path, namespace_stack, units)
49
50
  else
50
51
  (node.children || []).each do |child|
51
52
  extract_definitions(child, source, file_path, namespace_stack, units)
@@ -53,14 +54,6 @@ module CodebaseIndex
53
54
  end
54
55
  end
55
56
 
56
- def process_class(node, source, file_path, namespace_stack, units)
57
- process_definition(node, :ruby_class, source, file_path, namespace_stack, units)
58
- end
59
-
60
- def process_module(node, source, file_path, namespace_stack, units)
61
- process_definition(node, :ruby_module, source, file_path, namespace_stack, units)
62
- end
63
-
64
57
  def process_definition(node, type, source, file_path, namespace_stack, units)
65
58
  name = node.method_name
66
59
  fqn = build_fqn(name, namespace_stack)
@@ -144,13 +137,7 @@ module CodebaseIndex
144
137
 
145
138
  # Count def and defs nodes in body children (non-recursive — only direct methods).
146
139
  def count_methods(body_children)
147
- count = 0
148
- body_children.each do |child|
149
- next unless child.is_a?(Ast::Node)
150
-
151
- count += 1 if %i[def defs].include?(child.type)
152
- end
153
- count
140
+ body_children.count { |child| child.is_a?(Ast::Node) && %i[def defs].include?(child.type) }
154
141
  end
155
142
 
156
143
  # Build the constant name from a :const node (may have receiver for namespaced).
@@ -163,14 +150,7 @@ module CodebaseIndex
163
150
 
164
151
  # Extract source text for a node using line range.
165
152
  def extract_source(node, source)
166
- return nil unless node.line && node.end_line
167
-
168
- lines = source.lines
169
- start_idx = node.line - 1
170
- end_idx = node.end_line - 1
171
- return nil if start_idx.negative? || end_idx >= lines.length
172
-
173
- lines[start_idx..end_idx].join
153
+ extract_source_span(source, node.line, node.end_line)
174
154
  end
175
155
 
176
156
  # Build dependency list from superclass, includes, and extends.
@@ -21,6 +21,11 @@ module CodebaseIndex
21
21
  CONSTRUCTION_METHODS = %w[new].freeze
22
22
  SERIALIZATION_METHODS = %w[to_h to_json to_a serialize as_json].freeze
23
23
  DESERIALIZATION_METHODS = %w[from_json parse].freeze
24
+ CATEGORY_BY_METHOD = [
25
+ *CONSTRUCTION_METHODS.map { |m| [m, :construction] },
26
+ *SERIALIZATION_METHODS.map { |m| [m, :serialization] },
27
+ *DESERIALIZATION_METHODS.map { |m| [m, :deserialization] }
28
+ ].to_h.freeze
24
29
 
25
30
  # @param parser [Ast::Parser, nil] Parser instance (creates default if nil)
26
31
  def initialize(parser: nil)
@@ -65,13 +70,7 @@ module CodebaseIndex
65
70
  end
66
71
 
67
72
  def categorize(method_name)
68
- if CONSTRUCTION_METHODS.include?(method_name)
69
- :construction
70
- elsif SERIALIZATION_METHODS.include?(method_name)
71
- :serialization
72
- elsif DESERIALIZATION_METHODS.include?(method_name)
73
- :deserialization
74
- end
73
+ CATEGORY_BY_METHOD[method_name]
75
74
  end
76
75
  end
77
76
  end
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'set'
4
+
3
5
  module CodebaseIndex
4
6
  module RubyAnalyzer
5
7
  # Renders Mermaid-format diagrams from extracted units, dependency graphs,
@@ -25,31 +27,24 @@ module CodebaseIndex
25
27
  lines = ['graph TD']
26
28
  return lines.join("\n") if units.nil? || units.empty?
27
29
 
28
- seen_nodes = {}
29
- edges = []
30
+ seen_nodes = Set.new
31
+ seen_edges = Set.new
30
32
 
31
33
  units.each do |unit|
32
34
  node_id = sanitize_id(unit.identifier)
33
- unless seen_nodes[node_id]
34
- seen_nodes[node_id] = true
35
- lines << " #{node_id}[\"#{escape_label(unit.identifier)}\"]"
36
- end
35
+ lines << " #{node_id}[\"#{escape_label(unit.identifier)}\"]" if seen_nodes.add?(node_id)
37
36
 
38
37
  (unit.dependencies || []).each do |dep|
39
38
  target = dep[:target] || dep['target']
40
39
  next unless target
41
40
 
42
41
  target_id = sanitize_id(target)
43
- unless seen_nodes[target_id]
44
- seen_nodes[target_id] = true
45
- lines << " #{target_id}[\"#{escape_label(target)}\"]"
46
- end
42
+ lines << " #{target_id}[\"#{escape_label(target)}\"]" if seen_nodes.add?(target_id)
47
43
 
48
44
  via = dep[:via] || dep['via']
49
45
  edge_key = "#{node_id}->#{target_id}"
50
- next if edges.include?(edge_key)
46
+ next unless seen_edges.add?(edge_key)
51
47
 
52
- edges << edge_key
53
48
  lines << if via
54
49
  " #{node_id} -->|#{via}| #{target_id}"
55
50
  else
@@ -95,16 +90,14 @@ module CodebaseIndex
95
90
  end
96
91
 
97
92
  # Render edges
98
- seen_edges = []
93
+ seen_edges = Set.new
99
94
  edges.each do |source, targets|
100
- targets = Array(targets)
101
- targets.each do |target|
95
+ Array(targets).each do |target|
102
96
  next unless nodes.key?(target)
103
97
 
104
98
  edge_key = "#{sanitize_id(source)}->#{sanitize_id(target)}"
105
- next if seen_edges.include?(edge_key)
99
+ next unless seen_edges.add?(edge_key)
106
100
 
107
- seen_edges << edge_key
108
101
  lines << " #{sanitize_id(source)} --> #{sanitize_id(target)}"
109
102
  end
110
103
  end
@@ -123,15 +116,14 @@ module CodebaseIndex
123
116
  lines = ['flowchart TD']
124
117
  return lines.join("\n") if units.nil? || units.empty?
125
118
 
126
- seen_nodes = {}
119
+ seen_nodes = Set.new
127
120
 
128
121
  units.each do |unit|
129
122
  transformations = unit.metadata[:data_transformations] || unit.metadata['data_transformations']
130
123
  next unless transformations.is_a?(Array) && transformations.any?
131
124
 
132
125
  node_id = sanitize_id(unit.identifier)
133
- unless seen_nodes[node_id]
134
- seen_nodes[node_id] = true
126
+ if seen_nodes.add?(node_id)
135
127
  shape = dataflow_shape(transformations)
136
128
  lines << " #{node_id}#{shape}"
137
129
  end
@@ -144,10 +136,7 @@ module CodebaseIndex
144
136
  category = (t[:category] || t['category'])&.to_s
145
137
  method_name = t[:method] || t['method']
146
138
 
147
- unless seen_nodes[receiver_id]
148
- seen_nodes[receiver_id] = true
149
- lines << " #{receiver_id}[\"#{escape_label(receiver)}\"]"
150
- end
139
+ lines << " #{receiver_id}[\"#{escape_label(receiver)}\"]" if seen_nodes.add?(receiver_id)
151
140
 
152
141
  label = [category, method_name].compact.join(': ')
153
142
  lines << " #{node_id} -->|#{label}| #{receiver_id}"
@@ -200,41 +189,57 @@ module CodebaseIndex
200
189
  # Analysis summary
201
190
  sections << '## Analysis Summary'
202
191
  sections << ''
203
- if analysis
204
- stats = analysis[:stats] || analysis['stats'] || {}
205
- sections << "- **Orphans:** #{stats[:orphan_count] || stats['orphan_count'] || 0}"
206
- sections << "- **Dead ends:** #{stats[:dead_end_count] || stats['dead_end_count'] || 0}"
207
- sections << "- **Hubs:** #{stats[:hub_count] || stats['hub_count'] || 0}"
208
- sections << "- **Cycles:** #{stats[:cycle_count] || stats['cycle_count'] || 0}"
209
-
210
- hubs = analysis[:hubs] || analysis['hubs'] || []
211
- if hubs.any?
212
- sections << ''
213
- sections << '### Top Hubs'
214
- sections << ''
215
- hubs.first(5).each do |hub|
216
- id = hub[:identifier] || hub['identifier']
217
- count = hub[:dependent_count] || hub['dependent_count']
218
- sections << "- #{id} (#{count} dependents)"
219
- end
220
- end
221
-
222
- cycles = analysis[:cycles] || analysis['cycles'] || []
223
- if cycles.any?
224
- sections << ''
225
- sections << '### Cycles'
226
- sections << ''
227
- cycles.each do |cycle|
228
- sections << "- #{cycle.join(' -> ')}"
229
- end
230
- end
231
- end
192
+ sections.concat(render_stats_section(analysis))
232
193
 
233
194
  sections.join("\n")
234
195
  end
235
196
 
236
197
  private
237
198
 
199
+ # Render the Analysis Summary section lines for a given analysis hash.
200
+ #
201
+ # @param analysis [Hash, nil] Graph analysis report from GraphAnalyzer#analyze
202
+ # @return [Array<String>] Lines to append to the architecture document
203
+ def render_stats_section(analysis)
204
+ lines = []
205
+ return lines unless analysis
206
+
207
+ stats = analysis[:stats] || analysis['stats'] || {}
208
+ lines << "- **Orphans:** #{stats[:orphan_count] || stats['orphan_count'] || 0}"
209
+ lines << "- **Dead ends:** #{stats[:dead_end_count] || stats['dead_end_count'] || 0}"
210
+ lines << "- **Hubs:** #{stats[:hub_count] || stats['hub_count'] || 0}"
211
+ lines << "- **Cycles:** #{stats[:cycle_count] || stats['cycle_count'] || 0}"
212
+
213
+ hubs = analysis[:hubs] || analysis['hubs'] || []
214
+ lines.concat(render_hubs_section(hubs))
215
+
216
+ cycles = analysis[:cycles] || analysis['cycles'] || []
217
+ if cycles.any?
218
+ lines << ''
219
+ lines << '### Cycles'
220
+ lines << ''
221
+ cycles.each { |cycle| lines << "- #{cycle.join(' -> ')}" }
222
+ end
223
+
224
+ lines
225
+ end
226
+
227
+ # Render the Top Hubs subsection lines.
228
+ #
229
+ # @param hubs [Array<Hash>] Hub entries with :identifier and :dependent_count keys
230
+ # @return [Array<String>] Lines to append, or empty array if no hubs
231
+ def render_hubs_section(hubs)
232
+ return [] unless hubs.any?
233
+
234
+ lines = ['', '### Top Hubs', '']
235
+ hubs.first(5).each do |hub|
236
+ id = hub[:identifier] || hub['identifier']
237
+ count = hub[:dependent_count] || hub['dependent_count']
238
+ lines << "- #{id} (#{count} dependents)"
239
+ end
240
+ lines
241
+ end
242
+
238
243
  # Sanitize an identifier for use as a Mermaid node ID.
239
244
  #
240
245
  # Replaces characters that Mermaid cannot use in node IDs with underscores.
@@ -64,19 +64,19 @@ module CodebaseIndex
64
64
 
65
65
  traces = grouped[key]
66
66
 
67
- calls = traces.select { |t| t['event'] == 'call' || t[:event] == 'call' }
68
- returns = traces.select { |t| t['event'] == 'return' || t[:event] == 'return' }
67
+ calls = traces.select { |t| fetch_key(t, :event) == 'call' }
68
+ returns = traces.select { |t| fetch_key(t, :event) == 'return' }
69
69
 
70
70
  callers = calls.filter_map do |t|
71
- caller_class = t['caller_class'] || t[:caller_class]
72
- caller_method = t['caller_method'] || t[:caller_method]
71
+ caller_class = fetch_key(t, :caller_class)
72
+ caller_method = fetch_key(t, :caller_method)
73
73
  next unless caller_class
74
74
 
75
75
  { 'caller_class' => caller_class, 'caller_method' => caller_method }
76
76
  end
77
77
 
78
78
  return_types = returns.filter_map do |t|
79
- t['return_class'] || t[:return_class]
79
+ fetch_key(t, :return_class)
80
80
  end.uniq
81
81
 
82
82
  unit.metadata[:trace] = {
@@ -90,11 +90,15 @@ module CodebaseIndex
90
90
  class << self
91
91
  private
92
92
 
93
+ def fetch_key(hash, key)
94
+ hash[key.to_s] || hash[key.to_sym]
95
+ end
96
+
93
97
  def group_traces(trace_data)
94
98
  grouped = Hash.new { |h, k| h[k] = [] }
95
99
  trace_data.each do |trace|
96
- class_name = trace['class_name'] || trace[:class_name]
97
- method_name = trace['method_name'] || trace[:method_name]
100
+ class_name = fetch_key(trace, :class_name)
101
+ method_name = fetch_key(trace, :method_name)
98
102
  next unless class_name && method_name
99
103
 
100
104
  key = "#{class_name}##{method_name}"
@@ -71,14 +71,7 @@ module CodebaseIndex
71
71
 
72
72
  files.first(limit).map do |file|
73
73
  session_id = File.basename(file, '.jsonl')
74
- requests = read(session_id)
75
-
76
- {
77
- 'session_id' => session_id,
78
- 'request_count' => requests.size,
79
- 'first_request' => requests.first&.fetch('timestamp', nil),
80
- 'last_request' => requests.last&.fetch('timestamp', nil)
81
- }
74
+ session_summary(session_id, read(session_id))
82
75
  end
83
76
  end
84
77
 
@@ -72,13 +72,7 @@ module CodebaseIndex
72
72
  expired.each { |id| @redis.srem(SESSIONS_KEY, id) } if expired.any?
73
73
 
74
74
  active.first(limit).map do |session_id|
75
- requests = read(session_id)
76
- {
77
- 'session_id' => session_id,
78
- 'request_count' => requests.size,
79
- 'first_request' => requests.first&.fetch('timestamp', nil),
80
- 'last_request' => requests.last&.fetch('timestamp', nil)
81
- }
75
+ session_summary(session_id, read(session_id))
82
76
  end
83
77
  end
84
78
 
@@ -2,6 +2,7 @@
2
2
 
3
3
  require 'json'
4
4
  require 'set'
5
+ require_relative '../token_utils'
5
6
  require_relative 'session_flow_document'
6
7
 
7
8
  module CodebaseIndex
@@ -220,10 +221,10 @@ module CodebaseIndex
220
221
  source = unit[:source_code]
221
222
  next unless source
222
223
 
223
- source_tokens = estimate_token_count(source)
224
+ source_tokens = TokenUtils.estimate_tokens(source)
224
225
  unit[:source_code] = "# source truncated (#{source_tokens} tokens)"
225
226
  total -= source_tokens
226
- total += estimate_token_count(unit[:source_code])
227
+ total += TokenUtils.estimate_tokens(unit[:source_code])
227
228
  end
228
229
 
229
230
  [total, 0].max
@@ -236,20 +237,10 @@ module CodebaseIndex
236
237
  def estimate_tokens(context_pool)
237
238
  context_pool.values.sum do |unit|
238
239
  source = unit[:source_code] || ''
239
- estimate_token_count(source) + 20 # overhead for tags/metadata
240
+ TokenUtils.estimate_tokens(source) + 20 # overhead for tags/metadata
240
241
  end
241
242
  end
242
243
 
243
- # Estimate token count for a string.
244
- # Uses project convention: (string.length / 4.0).ceil
245
- # See docs/TOKEN_BENCHMARK.md — conservative floor (~10.6% overestimate).
246
- #
247
- # @param text [String] Text to estimate
248
- # @return [Integer] Estimated token count
249
- def estimate_token_count(text)
250
- (text.length / 4.0).ceil
251
- end
252
-
253
244
  # Build an empty document for sessions with no requests.
254
245
  #
255
246
  # @param session_id [String]
@@ -73,13 +73,7 @@ module CodebaseIndex
73
73
  write_index(active) if active.size != index.size
74
74
 
75
75
  active.first(limit).map do |session_id|
76
- requests = read(session_id)
77
- {
78
- 'session_id' => session_id,
79
- 'request_count' => requests.size,
80
- 'first_request' => requests.first&.fetch('timestamp', nil),
81
- 'last_request' => requests.last&.fetch('timestamp', nil)
82
- }
76
+ session_summary(session_id, read(session_id))
83
77
  end
84
78
  end
85
79
 
@@ -62,6 +62,20 @@ module CodebaseIndex
62
62
  def sanitize_session_id(session_id)
63
63
  session_id.to_s.gsub(/[^a-zA-Z0-9_-]/, '_')
64
64
  end
65
+
66
+ # Build a session summary hash from a session ID and its requests.
67
+ #
68
+ # @param session_id [String]
69
+ # @param requests [Array<Hash>]
70
+ # @return [Hash]
71
+ def session_summary(session_id, requests)
72
+ {
73
+ 'session_id' => session_id,
74
+ 'request_count' => requests.size,
75
+ 'first_request' => requests.first&.fetch('timestamp', nil),
76
+ 'last_request' => requests.last&.fetch('timestamp', nil)
77
+ }
78
+ end
65
79
  end
66
80
  end
67
81
  end