codebase_index 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +29 -0
  3. data/CODE_OF_CONDUCT.md +83 -0
  4. data/CONTRIBUTING.md +65 -0
  5. data/LICENSE.txt +21 -0
  6. data/README.md +481 -0
  7. data/exe/codebase-console-mcp +22 -0
  8. data/exe/codebase-index-mcp +61 -0
  9. data/exe/codebase-index-mcp-http +64 -0
  10. data/exe/codebase-index-mcp-start +58 -0
  11. data/lib/codebase_index/ast/call_site_extractor.rb +106 -0
  12. data/lib/codebase_index/ast/method_extractor.rb +76 -0
  13. data/lib/codebase_index/ast/node.rb +88 -0
  14. data/lib/codebase_index/ast/parser.rb +653 -0
  15. data/lib/codebase_index/ast.rb +6 -0
  16. data/lib/codebase_index/builder.rb +137 -0
  17. data/lib/codebase_index/chunking/chunk.rb +84 -0
  18. data/lib/codebase_index/chunking/semantic_chunker.rb +290 -0
  19. data/lib/codebase_index/console/adapters/cache_adapter.rb +58 -0
  20. data/lib/codebase_index/console/adapters/good_job_adapter.rb +66 -0
  21. data/lib/codebase_index/console/adapters/sidekiq_adapter.rb +66 -0
  22. data/lib/codebase_index/console/adapters/solid_queue_adapter.rb +66 -0
  23. data/lib/codebase_index/console/audit_logger.rb +75 -0
  24. data/lib/codebase_index/console/bridge.rb +170 -0
  25. data/lib/codebase_index/console/confirmation.rb +90 -0
  26. data/lib/codebase_index/console/connection_manager.rb +173 -0
  27. data/lib/codebase_index/console/console_response_renderer.rb +78 -0
  28. data/lib/codebase_index/console/model_validator.rb +81 -0
  29. data/lib/codebase_index/console/safe_context.rb +82 -0
  30. data/lib/codebase_index/console/server.rb +557 -0
  31. data/lib/codebase_index/console/sql_validator.rb +172 -0
  32. data/lib/codebase_index/console/tools/tier1.rb +118 -0
  33. data/lib/codebase_index/console/tools/tier2.rb +117 -0
  34. data/lib/codebase_index/console/tools/tier3.rb +110 -0
  35. data/lib/codebase_index/console/tools/tier4.rb +79 -0
  36. data/lib/codebase_index/coordination/pipeline_lock.rb +109 -0
  37. data/lib/codebase_index/cost_model/embedding_cost.rb +88 -0
  38. data/lib/codebase_index/cost_model/estimator.rb +128 -0
  39. data/lib/codebase_index/cost_model/provider_pricing.rb +67 -0
  40. data/lib/codebase_index/cost_model/storage_cost.rb +52 -0
  41. data/lib/codebase_index/cost_model.rb +22 -0
  42. data/lib/codebase_index/db/migrations/001_create_units.rb +38 -0
  43. data/lib/codebase_index/db/migrations/002_create_edges.rb +35 -0
  44. data/lib/codebase_index/db/migrations/003_create_embeddings.rb +37 -0
  45. data/lib/codebase_index/db/migrations/004_create_snapshots.rb +45 -0
  46. data/lib/codebase_index/db/migrations/005_create_snapshot_units.rb +40 -0
  47. data/lib/codebase_index/db/migrator.rb +71 -0
  48. data/lib/codebase_index/db/schema_version.rb +73 -0
  49. data/lib/codebase_index/dependency_graph.rb +227 -0
  50. data/lib/codebase_index/embedding/indexer.rb +130 -0
  51. data/lib/codebase_index/embedding/openai.rb +105 -0
  52. data/lib/codebase_index/embedding/provider.rb +135 -0
  53. data/lib/codebase_index/embedding/text_preparer.rb +112 -0
  54. data/lib/codebase_index/evaluation/baseline_runner.rb +115 -0
  55. data/lib/codebase_index/evaluation/evaluator.rb +146 -0
  56. data/lib/codebase_index/evaluation/metrics.rb +79 -0
  57. data/lib/codebase_index/evaluation/query_set.rb +148 -0
  58. data/lib/codebase_index/evaluation/report_generator.rb +90 -0
  59. data/lib/codebase_index/extracted_unit.rb +145 -0
  60. data/lib/codebase_index/extractor.rb +956 -0
  61. data/lib/codebase_index/extractors/action_cable_extractor.rb +228 -0
  62. data/lib/codebase_index/extractors/ast_source_extraction.rb +46 -0
  63. data/lib/codebase_index/extractors/behavioral_profile.rb +309 -0
  64. data/lib/codebase_index/extractors/caching_extractor.rb +261 -0
  65. data/lib/codebase_index/extractors/callback_analyzer.rb +232 -0
  66. data/lib/codebase_index/extractors/concern_extractor.rb +253 -0
  67. data/lib/codebase_index/extractors/configuration_extractor.rb +219 -0
  68. data/lib/codebase_index/extractors/controller_extractor.rb +494 -0
  69. data/lib/codebase_index/extractors/database_view_extractor.rb +278 -0
  70. data/lib/codebase_index/extractors/decorator_extractor.rb +260 -0
  71. data/lib/codebase_index/extractors/engine_extractor.rb +204 -0
  72. data/lib/codebase_index/extractors/event_extractor.rb +211 -0
  73. data/lib/codebase_index/extractors/factory_extractor.rb +289 -0
  74. data/lib/codebase_index/extractors/graphql_extractor.rb +917 -0
  75. data/lib/codebase_index/extractors/i18n_extractor.rb +117 -0
  76. data/lib/codebase_index/extractors/job_extractor.rb +369 -0
  77. data/lib/codebase_index/extractors/lib_extractor.rb +249 -0
  78. data/lib/codebase_index/extractors/mailer_extractor.rb +339 -0
  79. data/lib/codebase_index/extractors/manager_extractor.rb +202 -0
  80. data/lib/codebase_index/extractors/middleware_extractor.rb +133 -0
  81. data/lib/codebase_index/extractors/migration_extractor.rb +469 -0
  82. data/lib/codebase_index/extractors/model_extractor.rb +960 -0
  83. data/lib/codebase_index/extractors/phlex_extractor.rb +252 -0
  84. data/lib/codebase_index/extractors/policy_extractor.rb +214 -0
  85. data/lib/codebase_index/extractors/poro_extractor.rb +246 -0
  86. data/lib/codebase_index/extractors/pundit_extractor.rb +223 -0
  87. data/lib/codebase_index/extractors/rails_source_extractor.rb +473 -0
  88. data/lib/codebase_index/extractors/rake_task_extractor.rb +343 -0
  89. data/lib/codebase_index/extractors/route_extractor.rb +181 -0
  90. data/lib/codebase_index/extractors/scheduled_job_extractor.rb +331 -0
  91. data/lib/codebase_index/extractors/serializer_extractor.rb +334 -0
  92. data/lib/codebase_index/extractors/service_extractor.rb +254 -0
  93. data/lib/codebase_index/extractors/shared_dependency_scanner.rb +91 -0
  94. data/lib/codebase_index/extractors/shared_utility_methods.rb +99 -0
  95. data/lib/codebase_index/extractors/state_machine_extractor.rb +398 -0
  96. data/lib/codebase_index/extractors/test_mapping_extractor.rb +225 -0
  97. data/lib/codebase_index/extractors/validator_extractor.rb +225 -0
  98. data/lib/codebase_index/extractors/view_component_extractor.rb +310 -0
  99. data/lib/codebase_index/extractors/view_template_extractor.rb +261 -0
  100. data/lib/codebase_index/feedback/gap_detector.rb +89 -0
  101. data/lib/codebase_index/feedback/store.rb +119 -0
  102. data/lib/codebase_index/flow_analysis/operation_extractor.rb +209 -0
  103. data/lib/codebase_index/flow_analysis/response_code_mapper.rb +154 -0
  104. data/lib/codebase_index/flow_assembler.rb +290 -0
  105. data/lib/codebase_index/flow_document.rb +191 -0
  106. data/lib/codebase_index/flow_precomputer.rb +102 -0
  107. data/lib/codebase_index/formatting/base.rb +40 -0
  108. data/lib/codebase_index/formatting/claude_adapter.rb +98 -0
  109. data/lib/codebase_index/formatting/generic_adapter.rb +56 -0
  110. data/lib/codebase_index/formatting/gpt_adapter.rb +64 -0
  111. data/lib/codebase_index/formatting/human_adapter.rb +78 -0
  112. data/lib/codebase_index/graph_analyzer.rb +374 -0
  113. data/lib/codebase_index/mcp/index_reader.rb +394 -0
  114. data/lib/codebase_index/mcp/renderers/claude_renderer.rb +81 -0
  115. data/lib/codebase_index/mcp/renderers/json_renderer.rb +17 -0
  116. data/lib/codebase_index/mcp/renderers/markdown_renderer.rb +352 -0
  117. data/lib/codebase_index/mcp/renderers/plain_renderer.rb +240 -0
  118. data/lib/codebase_index/mcp/server.rb +935 -0
  119. data/lib/codebase_index/mcp/tool_response_renderer.rb +62 -0
  120. data/lib/codebase_index/model_name_cache.rb +51 -0
  121. data/lib/codebase_index/notion/client.rb +217 -0
  122. data/lib/codebase_index/notion/exporter.rb +219 -0
  123. data/lib/codebase_index/notion/mapper.rb +39 -0
  124. data/lib/codebase_index/notion/mappers/column_mapper.rb +65 -0
  125. data/lib/codebase_index/notion/mappers/migration_mapper.rb +39 -0
  126. data/lib/codebase_index/notion/mappers/model_mapper.rb +164 -0
  127. data/lib/codebase_index/notion/rate_limiter.rb +68 -0
  128. data/lib/codebase_index/observability/health_check.rb +81 -0
  129. data/lib/codebase_index/observability/instrumentation.rb +34 -0
  130. data/lib/codebase_index/observability/structured_logger.rb +75 -0
  131. data/lib/codebase_index/operator/error_escalator.rb +81 -0
  132. data/lib/codebase_index/operator/pipeline_guard.rb +99 -0
  133. data/lib/codebase_index/operator/status_reporter.rb +80 -0
  134. data/lib/codebase_index/railtie.rb +26 -0
  135. data/lib/codebase_index/resilience/circuit_breaker.rb +99 -0
  136. data/lib/codebase_index/resilience/index_validator.rb +185 -0
  137. data/lib/codebase_index/resilience/retryable_provider.rb +108 -0
  138. data/lib/codebase_index/retrieval/context_assembler.rb +249 -0
  139. data/lib/codebase_index/retrieval/query_classifier.rb +131 -0
  140. data/lib/codebase_index/retrieval/ranker.rb +273 -0
  141. data/lib/codebase_index/retrieval/search_executor.rb +327 -0
  142. data/lib/codebase_index/retriever.rb +160 -0
  143. data/lib/codebase_index/ruby_analyzer/class_analyzer.rb +190 -0
  144. data/lib/codebase_index/ruby_analyzer/dataflow_analyzer.rb +78 -0
  145. data/lib/codebase_index/ruby_analyzer/fqn_builder.rb +18 -0
  146. data/lib/codebase_index/ruby_analyzer/mermaid_renderer.rb +275 -0
  147. data/lib/codebase_index/ruby_analyzer/method_analyzer.rb +143 -0
  148. data/lib/codebase_index/ruby_analyzer/trace_enricher.rb +139 -0
  149. data/lib/codebase_index/ruby_analyzer.rb +87 -0
  150. data/lib/codebase_index/session_tracer/file_store.rb +111 -0
  151. data/lib/codebase_index/session_tracer/middleware.rb +143 -0
  152. data/lib/codebase_index/session_tracer/redis_store.rb +112 -0
  153. data/lib/codebase_index/session_tracer/session_flow_assembler.rb +263 -0
  154. data/lib/codebase_index/session_tracer/session_flow_document.rb +223 -0
  155. data/lib/codebase_index/session_tracer/solid_cache_store.rb +145 -0
  156. data/lib/codebase_index/session_tracer/store.rb +67 -0
  157. data/lib/codebase_index/storage/graph_store.rb +120 -0
  158. data/lib/codebase_index/storage/metadata_store.rb +169 -0
  159. data/lib/codebase_index/storage/pgvector.rb +163 -0
  160. data/lib/codebase_index/storage/qdrant.rb +172 -0
  161. data/lib/codebase_index/storage/vector_store.rb +156 -0
  162. data/lib/codebase_index/temporal/snapshot_store.rb +341 -0
  163. data/lib/codebase_index/version.rb +5 -0
  164. data/lib/codebase_index.rb +223 -0
  165. data/lib/generators/codebase_index/install_generator.rb +32 -0
  166. data/lib/generators/codebase_index/pgvector_generator.rb +37 -0
  167. data/lib/generators/codebase_index/templates/add_pgvector_to_codebase_index.rb.erb +15 -0
  168. data/lib/generators/codebase_index/templates/create_codebase_index_tables.rb.erb +43 -0
  169. data/lib/tasks/codebase_index.rake +583 -0
  170. data/lib/tasks/codebase_index_evaluation.rake +115 -0
  171. metadata +252 -0
@@ -0,0 +1,148 @@
1
# frozen_string_literal: true

require 'json'

module CodebaseIndex
  module Evaluation
    # Manages a set of evaluation queries with expected results.
    #
    # Each query has a natural language question, a list of expected unit
    # identifiers (ground truth), an intent classification, scope, and tags
    # for filtering. QuerySets can be loaded from and saved to JSON files.
    #
    # @example
    #   qs = QuerySet.load("spec/fixtures/eval_queries.json")
    #   qs.queries.each { |q| puts q.query }
    #   qs.filter(intent: :lookup).size
    #
    class QuerySet
      # A single evaluation query with ground-truth annotations.
      #
      # @!attribute [r] query
      #   @return [String] Natural language query
      # @!attribute [r] expected_units
      #   @return [Array<String>] Expected unit identifiers (ground truth)
      # @!attribute [r] intent
      #   @return [Symbol] Query intent (:lookup, :trace, :explain, :compare)
      # @!attribute [r] scope
      #   @return [Symbol] Query scope (:specific, :bounded, :broad)
      # @!attribute [r] tags
      #   @return [Array<String>] Tags for filtering queries
      Query = Struct.new(:query, :expected_units, :intent, :scope, :tags, keyword_init: true)

      VALID_INTENTS = %i[lookup trace explain compare].freeze
      VALID_SCOPES = %i[specific bounded broad].freeze

      # @return [Array<Query>] The queries in this set
      attr_reader :queries

      # Load a QuerySet from a JSON file.
      #
      # The file must contain a JSON object; its optional "queries" key holds
      # an array of query objects. Every failure to read or interpret the file
      # is reported as CodebaseIndex::Error so callers have one class to rescue.
      #
      # @param path [String] Path to JSON file
      # @return [QuerySet] Loaded query set
      # @raise [CodebaseIndex::Error] if the file cannot be read or parsed
      def self.load(path)
        data = JSON.parse(File.read(path))
        raise CodebaseIndex::Error, 'Invalid query set: top-level JSON value must be an object' unless data.is_a?(Hash)

        # A missing or null "queries" key means an empty set.
        entries = data['queries'] || []
        raise CodebaseIndex::Error, 'Invalid query set: "queries" must be an array' unless entries.is_a?(Array)

        new(queries: entries.map { |entry| parse_query(entry) })
      rescue JSON::ParserError => e
        raise CodebaseIndex::Error, "Invalid JSON in query set: #{e.message}"
      rescue Errno::ENOENT => e
        raise CodebaseIndex::Error, "Query set file not found: #{e.message}"
      rescue SystemCallError => e
        # Any other OS-level read failure (permissions, path is a directory, ...).
        raise CodebaseIndex::Error, "Cannot read query set file: #{e.message}"
      rescue KeyError => e
        # Raised by parse_query when the mandatory "query" key is absent.
        raise CodebaseIndex::Error, "Invalid query entry in query set: #{e.message}"
      end

      # Parse a query hash from JSON into a Query struct.
      #
      # "query" is mandatory. Missing or null optional fields fall back to
      # defaults: no expected units, intent :lookup, scope :specific, no tags.
      #
      # @param hash [Hash] Raw query data
      # @return [Query]
      # @raise [KeyError] if the "query" key is missing
      # @raise [CodebaseIndex::Error] if the entry is not a JSON object
      def self.parse_query(hash)
        unless hash.is_a?(Hash)
          raise CodebaseIndex::Error, "Invalid query entry in query set: expected an object, got #{hash.class}"
        end

        Query.new(
          query: hash.fetch('query'),
          expected_units: hash['expected_units'] || [],
          intent: (hash['intent'] || 'lookup').to_s.to_sym,
          scope: (hash['scope'] || 'specific').to_s.to_sym,
          tags: hash['tags'] || []
        )
      end

      private_class_method :parse_query

      # Initialize a QuerySet with an array of queries.
      #
      # @param queries [Array<Query>] Evaluation queries
      def initialize(queries: [])
        @queries = queries
      end

      # Save this QuerySet to a JSON file.
      #
      # @param path [String] Path to write JSON file
      # @return [void]
      def save(path)
        data = {
          'queries' => queries.map { |q| serialize_query(q) }
        }
        File.write(path, JSON.pretty_generate(data))
      end

      # Filter queries by intent, scope, or tags.
      #
      # Filters that are given are combined with AND; a filter left as nil is
      # not applied. Queries without tags never match a tag filter.
      #
      # @param intent [Symbol, nil] Filter by intent
      # @param scope [Symbol, nil] Filter by scope
      # @param tags [Array<String>, nil] Filter by tags (any match)
      # @return [Array<Query>] Matching queries
      def filter(intent: nil, scope: nil, tags: nil)
        result = queries
        result = result.select { |q| q.intent == intent } if intent
        result = result.select { |q| q.scope == scope } if scope
        if tags
          wanted = Array(tags)
          # Array() guards against queries built without tags (nil).
          result = result.select { |q| (Array(q.tags) & wanted).any? }
        end
        result
      end

      # Add a query to this set.
      #
      # @param query [Query] Query to add
      # @return [void]
      # @raise [ArgumentError] if intent or scope is invalid
      def add(query)
        validate_query!(query)
        @queries << query
      end

      # Number of queries in this set.
      #
      # @return [Integer]
      def size
        @queries.size
      end

      private

      # Serialize a Query to a hash for JSON output.
      #
      # @param query [Query] Query to serialize
      # @return [Hash]
      def serialize_query(query)
        {
          'query' => query.query,
          'expected_units' => query.expected_units,
          'intent' => query.intent.to_s,
          'scope' => query.scope.to_s,
          'tags' => query.tags
        }
      end

      # Validate intent and scope values.
      #
      # @param query [Query] Query to validate
      # @raise [ArgumentError] if intent or scope is invalid
      def validate_query!(query)
        unless VALID_INTENTS.include?(query.intent)
          raise ArgumentError, "Invalid intent: #{query.intent}. Must be one of #{VALID_INTENTS.join(', ')}"
        end

        return if VALID_SCOPES.include?(query.scope)

        raise ArgumentError, "Invalid scope: #{query.scope}. Must be one of #{VALID_SCOPES.join(', ')}"
      end
    end
  end
end
@@ -0,0 +1,90 @@
1
# frozen_string_literal: true

require 'fileutils'
require 'json'
require 'time'

module CodebaseIndex
  module Evaluation
    # Generates JSON reports from evaluation results.
    #
    # Takes an EvaluationReport and produces a structured JSON document
    # with per-query scores, aggregate metrics, and metadata.
    #
    # @example
    #   generator = ReportGenerator.new
    #   json = generator.generate(report)
    #   generator.save(report, "tmp/eval_report.json")
    #
    class ReportGenerator
      # Number of decimal places kept for floating-point metrics.
      METRIC_PRECISION = 4
      private_constant :METRIC_PRECISION

      # Generate a JSON string from an evaluation report.
      #
      # @param report [Evaluator::EvaluationReport] Evaluation report
      # @param metadata [Hash] Optional metadata to include
      # @return [String] Pretty-printed JSON
      def generate(report, metadata: {})
        JSON.pretty_generate(build_report_hash(report, metadata))
      end

      # Save an evaluation report to a JSON file.
      #
      # Creates the parent directory of +path+ if it does not exist yet.
      #
      # @param report [Evaluator::EvaluationReport] Evaluation report
      # @param path [String] Output file path
      # @param metadata [Hash] Optional metadata to include
      # @return [void]
      def save(report, path, metadata: {})
        FileUtils.mkdir_p(File.dirname(path))
        File.write(path, generate(report, metadata: metadata))
      end

      private

      # Build the complete report hash.
      #
      # @param report [Evaluator::EvaluationReport] Evaluation report
      # @param metadata [Hash] Additional metadata
      # @return [Hash]
      def build_report_hash(report, metadata)
        {
          'metadata' => build_metadata(metadata),
          'aggregates' => serialize_aggregates(report.aggregates),
          'results' => report.results.map { |r| serialize_result(r) }
        }
      end

      # Build the metadata section.
      #
      # Caller-supplied entries are merged last, so they override the
      # generated "generated_at" and "version" values on key collision.
      #
      # @param extra [Hash] Additional metadata
      # @return [Hash]
      def build_metadata(extra)
        {
          'generated_at' => Time.now.iso8601,
          'version' => defined?(CodebaseIndex::VERSION) ? CodebaseIndex::VERSION : 'unknown'
        }.merge(extra.transform_keys(&:to_s))
      end

      # Serialize aggregate metrics.
      #
      # @param aggregates [Hash] Aggregate metrics with symbol keys
      # @return [Hash] String-keyed hash
      def serialize_aggregates(aggregates)
        serialize_metrics(aggregates)
      end

      # Serialize a single query result.
      #
      # @param result [Evaluator::QueryResult] Query result
      # @return [Hash]
      def serialize_result(result)
        {
          'query' => result.query,
          'expected_units' => result.expected_units,
          'retrieved_units' => result.retrieved_units,
          'scores' => serialize_metrics(result.scores || {}),
          'tokens_used' => result.tokens_used
        }
      end

      # Convert a metrics hash to string keys, rounding Float values.
      #
      # Non-Float values (integers, nil for an undefined metric, ...) pass
      # through untouched instead of raising on a missing #round.
      #
      # @param metrics [Hash] Metric name => value
      # @return [Hash] String-keyed hash
      def serialize_metrics(metrics)
        metrics.transform_keys(&:to_s).transform_values do |value|
          value.is_a?(Float) ? value.round(METRIC_PRECISION) : value
        end
      end
    end
  end
end
@@ -0,0 +1,145 @@
1
# frozen_string_literal: true

require 'digest'
require 'json'
require 'time'

module CodebaseIndex
  # ExtractedUnit represents a single meaningful unit of code from the codebase.
  #
  # This could be a model, controller, service, component, or framework source.
  # Each unit is self-contained with its source code, metadata, and relationship
  # information. Units are serialized to JSON for consumption by the indexing pipeline.
  #
  # @example Creating a model unit
  #   unit = ExtractedUnit.new(
  #     type: :model,
  #     identifier: "User",
  #     file_path: "app/models/user.rb"
  #   )
  #   unit.source_code = File.read(unit.file_path)
  #   unit.metadata = { associations: [...], callbacks: [...] }
  #   unit.dependencies = [{ type: :service, target: "UserService" }]
  #
  class ExtractedUnit
    attr_accessor :type,         # Symbol: :model, :controller, :service, :component, :job, :rails_source, :gem_source
                  :identifier,   # String: Unique key, e.g., "User", "Users::RegistrationsController#create"
                  :file_path,    # String: Absolute path to source file
                  :namespace,    # String: Module namespace if any
                  :source_code,  # String: The actual code, with concerns inlined for models
                  :metadata,     # Hash: Type-specific structured data
                  :dependencies, # Array<Hash>: What this unit calls/references
                  :dependents,   # Array<Hash>: What references this unit (populated in second pass)
                  :chunks        # Array<Hash>: Pre-chunked versions if unit is large

    # Create a unit with empty metadata, dependencies, dependents and chunks.
    #
    # @param type [Symbol] Kind of unit
    # @param identifier [String] Unique key for the unit
    # @param file_path [String] Path to the source file
    def initialize(type:, identifier:, file_path:)
      @type = type
      @identifier = identifier
      @file_path = file_path
      @metadata = {}
      @dependencies = []
      @dependents = []
      @chunks = []
    end

    # Serialize to hash for JSON output
    #
    # Adds two computed fields: +extracted_at+ (the current time in ISO 8601)
    # and +source_hash+ (SHA-256 of the source, or of "" when there is none).
    #
    # @return [Hash] Complete unit data for indexing pipeline
    def to_h
      {
        type: type,
        identifier: identifier,
        file_path: file_path,
        namespace: namespace,
        source_code: source_code,
        metadata: metadata,
        dependencies: dependencies,
        dependents: dependents,
        chunks: chunks,
        extracted_at: Time.now.iso8601,
        source_hash: Digest::SHA256.hexdigest(source_code || '')
      }
    end

    # Estimate token count for chunking decisions.
    # Benchmarked against tiktoken (cl100k_base) on 19 Ruby source files.
    # Actual mean is 4.41 chars/token. Uses 4.0 as a conservative floor
    # (~10.6% overestimate). See docs/TOKEN_BENCHMARK.md.
    #
    # @return [Integer] Estimated token count
    def estimated_tokens
      source_tokens = source_code ? (source_code.length / 4.0).ceil : 0
      metadata_tokens = metadata.any? ? (metadata.to_json.length / 4.0).ceil : 0
      source_tokens + metadata_tokens
    end

    # Check if unit needs chunking based on size
    #
    # @param threshold [Integer] Token threshold for chunking (default: 1500)
    # @return [Boolean]
    def needs_chunking?(threshold: 1500)
      estimated_tokens > threshold
    end

    # Build semantic chunks for large units
    # Preserves context by including unit header in each chunk
    #
    # Source lines are packed greedily: a chunk is closed as soon as adding the
    # next line would push its source tokens past +max_tokens+. A single line
    # longer than +max_tokens+ still becomes its own chunk. The header is not
    # counted against +max_tokens+, but is included in each chunk's
    # +estimated_tokens+.
    #
    # @param max_tokens [Integer] Maximum tokens per chunk; also the size
    #   threshold below which the unit is not chunked at all
    # @return [Array<Hash>] List of chunk hashes; empty when the unit has no
    #   source code or does not exceed +max_tokens+
    def build_default_chunks(max_tokens: 1500)
      # Metadata alone can exceed the threshold; without source there is nothing to split.
      return [] if source_code.nil? || source_code.empty?
      # Decide with the caller's limit, not the default threshold.
      return [] unless needs_chunking?(threshold: max_tokens)

      # Always include a header with unit context
      header = build_chunk_header
      header_tokens = (header.length / 4.0).ceil

      built = []
      pending_lines = []
      pending_tokens = 0

      source_code.each_line do |line|
        line_tokens = (line.length / 4.0).ceil

        if pending_lines.any? && pending_tokens + line_tokens > max_tokens
          built << build_chunk(built.size, header, pending_lines, pending_tokens + header_tokens)
          pending_lines = []
          pending_tokens = 0
        end

        pending_lines << line
        pending_tokens += line_tokens
      end

      # Final chunk
      built << build_chunk(built.size, header, pending_lines, pending_tokens + header_tokens) if pending_lines.any?
      built
    end

    private

    # Assemble one chunk hash from the header and a run of source lines.
    #
    # @param index [Integer] Zero-based position of the chunk
    # @param header [String] Context header prepended to the chunk
    # @param lines [Array<String>] Source lines belonging to the chunk
    # @param tokens [Integer] Estimated tokens for header plus lines
    # @return [Hash]
    def build_chunk(index, header, lines, tokens)
      content = header + lines.join
      {
        chunk_index: index,
        identifier: "#{identifier}#chunk_#{index}",
        content: content,
        content_hash: Digest::SHA256.hexdigest(content),
        estimated_tokens: tokens
      }
    end

    # Comment header identifying the unit, prepended to every chunk.
    #
    # @return [String]
    def build_chunk_header
      <<~HEADER
        # Unit: #{identifier} (#{type})
        # File: #{file_path}
        # Namespace: #{namespace || '(root)'}
        # ---
      HEADER
    end
  end
end