codebase_index 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +29 -0
- data/CODE_OF_CONDUCT.md +83 -0
- data/CONTRIBUTING.md +65 -0
- data/LICENSE.txt +21 -0
- data/README.md +481 -0
- data/exe/codebase-console-mcp +22 -0
- data/exe/codebase-index-mcp +61 -0
- data/exe/codebase-index-mcp-http +64 -0
- data/exe/codebase-index-mcp-start +58 -0
- data/lib/codebase_index/ast/call_site_extractor.rb +106 -0
- data/lib/codebase_index/ast/method_extractor.rb +76 -0
- data/lib/codebase_index/ast/node.rb +88 -0
- data/lib/codebase_index/ast/parser.rb +653 -0
- data/lib/codebase_index/ast.rb +6 -0
- data/lib/codebase_index/builder.rb +137 -0
- data/lib/codebase_index/chunking/chunk.rb +84 -0
- data/lib/codebase_index/chunking/semantic_chunker.rb +290 -0
- data/lib/codebase_index/console/adapters/cache_adapter.rb +58 -0
- data/lib/codebase_index/console/adapters/good_job_adapter.rb +66 -0
- data/lib/codebase_index/console/adapters/sidekiq_adapter.rb +66 -0
- data/lib/codebase_index/console/adapters/solid_queue_adapter.rb +66 -0
- data/lib/codebase_index/console/audit_logger.rb +75 -0
- data/lib/codebase_index/console/bridge.rb +170 -0
- data/lib/codebase_index/console/confirmation.rb +90 -0
- data/lib/codebase_index/console/connection_manager.rb +173 -0
- data/lib/codebase_index/console/console_response_renderer.rb +78 -0
- data/lib/codebase_index/console/model_validator.rb +81 -0
- data/lib/codebase_index/console/safe_context.rb +82 -0
- data/lib/codebase_index/console/server.rb +557 -0
- data/lib/codebase_index/console/sql_validator.rb +172 -0
- data/lib/codebase_index/console/tools/tier1.rb +118 -0
- data/lib/codebase_index/console/tools/tier2.rb +117 -0
- data/lib/codebase_index/console/tools/tier3.rb +110 -0
- data/lib/codebase_index/console/tools/tier4.rb +79 -0
- data/lib/codebase_index/coordination/pipeline_lock.rb +109 -0
- data/lib/codebase_index/cost_model/embedding_cost.rb +88 -0
- data/lib/codebase_index/cost_model/estimator.rb +128 -0
- data/lib/codebase_index/cost_model/provider_pricing.rb +67 -0
- data/lib/codebase_index/cost_model/storage_cost.rb +52 -0
- data/lib/codebase_index/cost_model.rb +22 -0
- data/lib/codebase_index/db/migrations/001_create_units.rb +38 -0
- data/lib/codebase_index/db/migrations/002_create_edges.rb +35 -0
- data/lib/codebase_index/db/migrations/003_create_embeddings.rb +37 -0
- data/lib/codebase_index/db/migrations/004_create_snapshots.rb +45 -0
- data/lib/codebase_index/db/migrations/005_create_snapshot_units.rb +40 -0
- data/lib/codebase_index/db/migrator.rb +71 -0
- data/lib/codebase_index/db/schema_version.rb +73 -0
- data/lib/codebase_index/dependency_graph.rb +227 -0
- data/lib/codebase_index/embedding/indexer.rb +130 -0
- data/lib/codebase_index/embedding/openai.rb +105 -0
- data/lib/codebase_index/embedding/provider.rb +135 -0
- data/lib/codebase_index/embedding/text_preparer.rb +112 -0
- data/lib/codebase_index/evaluation/baseline_runner.rb +115 -0
- data/lib/codebase_index/evaluation/evaluator.rb +146 -0
- data/lib/codebase_index/evaluation/metrics.rb +79 -0
- data/lib/codebase_index/evaluation/query_set.rb +148 -0
- data/lib/codebase_index/evaluation/report_generator.rb +90 -0
- data/lib/codebase_index/extracted_unit.rb +145 -0
- data/lib/codebase_index/extractor.rb +956 -0
- data/lib/codebase_index/extractors/action_cable_extractor.rb +228 -0
- data/lib/codebase_index/extractors/ast_source_extraction.rb +46 -0
- data/lib/codebase_index/extractors/behavioral_profile.rb +309 -0
- data/lib/codebase_index/extractors/caching_extractor.rb +261 -0
- data/lib/codebase_index/extractors/callback_analyzer.rb +232 -0
- data/lib/codebase_index/extractors/concern_extractor.rb +253 -0
- data/lib/codebase_index/extractors/configuration_extractor.rb +219 -0
- data/lib/codebase_index/extractors/controller_extractor.rb +494 -0
- data/lib/codebase_index/extractors/database_view_extractor.rb +278 -0
- data/lib/codebase_index/extractors/decorator_extractor.rb +260 -0
- data/lib/codebase_index/extractors/engine_extractor.rb +204 -0
- data/lib/codebase_index/extractors/event_extractor.rb +211 -0
- data/lib/codebase_index/extractors/factory_extractor.rb +289 -0
- data/lib/codebase_index/extractors/graphql_extractor.rb +917 -0
- data/lib/codebase_index/extractors/i18n_extractor.rb +117 -0
- data/lib/codebase_index/extractors/job_extractor.rb +369 -0
- data/lib/codebase_index/extractors/lib_extractor.rb +249 -0
- data/lib/codebase_index/extractors/mailer_extractor.rb +339 -0
- data/lib/codebase_index/extractors/manager_extractor.rb +202 -0
- data/lib/codebase_index/extractors/middleware_extractor.rb +133 -0
- data/lib/codebase_index/extractors/migration_extractor.rb +469 -0
- data/lib/codebase_index/extractors/model_extractor.rb +960 -0
- data/lib/codebase_index/extractors/phlex_extractor.rb +252 -0
- data/lib/codebase_index/extractors/policy_extractor.rb +214 -0
- data/lib/codebase_index/extractors/poro_extractor.rb +246 -0
- data/lib/codebase_index/extractors/pundit_extractor.rb +223 -0
- data/lib/codebase_index/extractors/rails_source_extractor.rb +473 -0
- data/lib/codebase_index/extractors/rake_task_extractor.rb +343 -0
- data/lib/codebase_index/extractors/route_extractor.rb +181 -0
- data/lib/codebase_index/extractors/scheduled_job_extractor.rb +331 -0
- data/lib/codebase_index/extractors/serializer_extractor.rb +334 -0
- data/lib/codebase_index/extractors/service_extractor.rb +254 -0
- data/lib/codebase_index/extractors/shared_dependency_scanner.rb +91 -0
- data/lib/codebase_index/extractors/shared_utility_methods.rb +99 -0
- data/lib/codebase_index/extractors/state_machine_extractor.rb +398 -0
- data/lib/codebase_index/extractors/test_mapping_extractor.rb +225 -0
- data/lib/codebase_index/extractors/validator_extractor.rb +225 -0
- data/lib/codebase_index/extractors/view_component_extractor.rb +310 -0
- data/lib/codebase_index/extractors/view_template_extractor.rb +261 -0
- data/lib/codebase_index/feedback/gap_detector.rb +89 -0
- data/lib/codebase_index/feedback/store.rb +119 -0
- data/lib/codebase_index/flow_analysis/operation_extractor.rb +209 -0
- data/lib/codebase_index/flow_analysis/response_code_mapper.rb +154 -0
- data/lib/codebase_index/flow_assembler.rb +290 -0
- data/lib/codebase_index/flow_document.rb +191 -0
- data/lib/codebase_index/flow_precomputer.rb +102 -0
- data/lib/codebase_index/formatting/base.rb +40 -0
- data/lib/codebase_index/formatting/claude_adapter.rb +98 -0
- data/lib/codebase_index/formatting/generic_adapter.rb +56 -0
- data/lib/codebase_index/formatting/gpt_adapter.rb +64 -0
- data/lib/codebase_index/formatting/human_adapter.rb +78 -0
- data/lib/codebase_index/graph_analyzer.rb +374 -0
- data/lib/codebase_index/mcp/index_reader.rb +394 -0
- data/lib/codebase_index/mcp/renderers/claude_renderer.rb +81 -0
- data/lib/codebase_index/mcp/renderers/json_renderer.rb +17 -0
- data/lib/codebase_index/mcp/renderers/markdown_renderer.rb +352 -0
- data/lib/codebase_index/mcp/renderers/plain_renderer.rb +240 -0
- data/lib/codebase_index/mcp/server.rb +935 -0
- data/lib/codebase_index/mcp/tool_response_renderer.rb +62 -0
- data/lib/codebase_index/model_name_cache.rb +51 -0
- data/lib/codebase_index/notion/client.rb +217 -0
- data/lib/codebase_index/notion/exporter.rb +219 -0
- data/lib/codebase_index/notion/mapper.rb +39 -0
- data/lib/codebase_index/notion/mappers/column_mapper.rb +65 -0
- data/lib/codebase_index/notion/mappers/migration_mapper.rb +39 -0
- data/lib/codebase_index/notion/mappers/model_mapper.rb +164 -0
- data/lib/codebase_index/notion/rate_limiter.rb +68 -0
- data/lib/codebase_index/observability/health_check.rb +81 -0
- data/lib/codebase_index/observability/instrumentation.rb +34 -0
- data/lib/codebase_index/observability/structured_logger.rb +75 -0
- data/lib/codebase_index/operator/error_escalator.rb +81 -0
- data/lib/codebase_index/operator/pipeline_guard.rb +99 -0
- data/lib/codebase_index/operator/status_reporter.rb +80 -0
- data/lib/codebase_index/railtie.rb +26 -0
- data/lib/codebase_index/resilience/circuit_breaker.rb +99 -0
- data/lib/codebase_index/resilience/index_validator.rb +185 -0
- data/lib/codebase_index/resilience/retryable_provider.rb +108 -0
- data/lib/codebase_index/retrieval/context_assembler.rb +249 -0
- data/lib/codebase_index/retrieval/query_classifier.rb +131 -0
- data/lib/codebase_index/retrieval/ranker.rb +273 -0
- data/lib/codebase_index/retrieval/search_executor.rb +327 -0
- data/lib/codebase_index/retriever.rb +160 -0
- data/lib/codebase_index/ruby_analyzer/class_analyzer.rb +190 -0
- data/lib/codebase_index/ruby_analyzer/dataflow_analyzer.rb +78 -0
- data/lib/codebase_index/ruby_analyzer/fqn_builder.rb +18 -0
- data/lib/codebase_index/ruby_analyzer/mermaid_renderer.rb +275 -0
- data/lib/codebase_index/ruby_analyzer/method_analyzer.rb +143 -0
- data/lib/codebase_index/ruby_analyzer/trace_enricher.rb +139 -0
- data/lib/codebase_index/ruby_analyzer.rb +87 -0
- data/lib/codebase_index/session_tracer/file_store.rb +111 -0
- data/lib/codebase_index/session_tracer/middleware.rb +143 -0
- data/lib/codebase_index/session_tracer/redis_store.rb +112 -0
- data/lib/codebase_index/session_tracer/session_flow_assembler.rb +263 -0
- data/lib/codebase_index/session_tracer/session_flow_document.rb +223 -0
- data/lib/codebase_index/session_tracer/solid_cache_store.rb +145 -0
- data/lib/codebase_index/session_tracer/store.rb +67 -0
- data/lib/codebase_index/storage/graph_store.rb +120 -0
- data/lib/codebase_index/storage/metadata_store.rb +169 -0
- data/lib/codebase_index/storage/pgvector.rb +163 -0
- data/lib/codebase_index/storage/qdrant.rb +172 -0
- data/lib/codebase_index/storage/vector_store.rb +156 -0
- data/lib/codebase_index/temporal/snapshot_store.rb +341 -0
- data/lib/codebase_index/version.rb +5 -0
- data/lib/codebase_index.rb +223 -0
- data/lib/generators/codebase_index/install_generator.rb +32 -0
- data/lib/generators/codebase_index/pgvector_generator.rb +37 -0
- data/lib/generators/codebase_index/templates/add_pgvector_to_codebase_index.rb.erb +15 -0
- data/lib/generators/codebase_index/templates/create_codebase_index_tables.rb.erb +43 -0
- data/lib/tasks/codebase_index.rake +583 -0
- data/lib/tasks/codebase_index_evaluation.rake +115 -0
- metadata +252 -0
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module CodebaseIndex
|
|
4
|
+
module Evaluation
|
|
5
|
+
# Manages a set of evaluation queries with expected results.
|
|
6
|
+
#
|
|
7
|
+
# Each query has a natural language question, a list of expected unit
|
|
8
|
+
# identifiers (ground truth), an intent classification, scope, and tags
|
|
9
|
+
# for filtering. QuerySets can be loaded from and saved to JSON files.
|
|
10
|
+
#
|
|
11
|
+
# @example
|
|
12
|
+
# qs = QuerySet.load("spec/fixtures/eval_queries.json")
|
|
13
|
+
# qs.queries.each { |q| puts q.query }
|
|
14
|
+
# qs.filter(intent: :lookup).size
|
|
15
|
+
#
|
|
16
|
+
class QuerySet
  # JSON backs both .load and #save. It is required here so the class works
  # even when nothing else in the process has loaded it yet.
  require 'json'

  # A single evaluation query with ground-truth annotations.
  #
  # @!attribute [r] query
  #   @return [String] Natural language query
  # @!attribute [r] expected_units
  #   @return [Array<String>] Expected unit identifiers (ground truth)
  # @!attribute [r] intent
  #   @return [Symbol] Query intent (:lookup, :trace, :explain, :compare)
  # @!attribute [r] scope
  #   @return [Symbol] Query scope (:specific, :bounded, :broad)
  # @!attribute [r] tags
  #   @return [Array<String>] Tags for filtering queries
  Query = Struct.new(:query, :expected_units, :intent, :scope, :tags, keyword_init: true)

  VALID_INTENTS = %i[lookup trace explain compare].freeze
  VALID_SCOPES = %i[specific bounded broad].freeze

  # @return [Array<Query>] The queries in this set
  attr_reader :queries

  # Initialize a QuerySet with an array of queries.
  #
  # @param queries [Array<Query>] Evaluation queries
  def initialize(queries: [])
    @queries = queries
  end

  # Load a QuerySet from a JSON file.
  #
  # The file must contain a JSON object; its optional "queries" key holds an
  # array of query objects. Every failure to read or interpret the file is
  # reported as CodebaseIndex::Error so callers have one error to rescue.
  #
  # @param path [String] Path to JSON file
  # @return [QuerySet] Loaded query set
  # @raise [CodebaseIndex::Error] if the file cannot be read or parsed
  def self.load(path)
    data = JSON.parse(File.read(path))
    unless data.is_a?(Hash)
      raise CodebaseIndex::Error, "Malformed query set: expected a JSON object, got #{data.class}"
    end

    # `|| []` also covers an explicit `"queries": null`.
    entries = data['queries'] || []
    unless entries.is_a?(Array)
      raise CodebaseIndex::Error, "Malformed query set: \"queries\" must be an array, got #{entries.class}"
    end

    new(queries: entries.map { |entry| parse_query(entry) })
  rescue JSON::ParserError => e
    raise CodebaseIndex::Error, "Invalid JSON in query set: #{e.message}"
  rescue Errno::ENOENT => e
    raise CodebaseIndex::Error, "Query set file not found: #{e.message}"
  rescue SystemCallError => e
    # Any other OS-level read failure (permissions, path is a directory, ...).
    raise CodebaseIndex::Error, "Cannot read query set file: #{e.message}"
  rescue KeyError => e
    # Raised by parse_query when the mandatory "query" key is absent.
    raise CodebaseIndex::Error, "Malformed query set: #{e.message}"
  end

  # Parse a query hash from JSON into a Query struct.
  #
  # Optional keys that are absent or null fall back to their defaults
  # (empty lists, intent "lookup", scope "specific").
  #
  # @param hash [Hash] Raw query data
  # @return [Query]
  # @raise [KeyError] if the "query" key is missing
  # @raise [CodebaseIndex::Error] if the entry is not a JSON object
  def self.parse_query(hash)
    unless hash.is_a?(Hash)
      raise CodebaseIndex::Error, "Malformed query set: each query must be a JSON object, got #{hash.class}"
    end

    Query.new(
      query: hash.fetch('query'),
      expected_units: hash['expected_units'] || [],
      intent: (hash['intent'] || 'lookup').to_s.to_sym,
      scope: (hash['scope'] || 'specific').to_s.to_sym,
      tags: hash['tags'] || []
    )
  end

  private_class_method :parse_query

  # Save this QuerySet to a JSON file.
  #
  # @param path [String] Path to write JSON file
  # @return [void]
  def save(path)
    payload = { 'queries' => queries.map { |q| serialize_query(q) } }
    File.write(path, JSON.pretty_generate(payload))
  end

  # Filter queries by intent, scope, or tags.
  #
  # Filters that are nil are skipped; the remaining ones are combined with
  # AND. Queries whose own tags are nil are treated as having no tags.
  #
  # @param intent [Symbol, nil] Filter by intent
  # @param scope [Symbol, nil] Filter by scope
  # @param tags [Array<String>, String, nil] Filter by tags (any match)
  # @return [Array<Query>] Matching queries
  def filter(intent: nil, scope: nil, tags: nil)
    result = queries
    result = result.select { |q| q.intent == intent } if intent
    result = result.select { |q| q.scope == scope } if scope
    if tags
      wanted = Array(tags)
      result = result.select { |q| (Array(q.tags) & wanted).any? }
    end
    result
  end

  # Add a query to this set.
  #
  # @param query [Query] Query to add
  # @return [void]
  # @raise [ArgumentError] if intent or scope is invalid
  def add(query)
    validate_query!(query)
    @queries << query
  end

  # Number of queries in this set.
  #
  # @return [Integer]
  def size
    @queries.size
  end

  private

  # Serialize a Query to a hash for JSON output.
  #
  # @param query [Query] Query to serialize
  # @return [Hash]
  def serialize_query(query)
    {
      'query' => query.query,
      'expected_units' => query.expected_units,
      'intent' => query.intent.to_s,
      'scope' => query.scope.to_s,
      'tags' => query.tags
    }
  end

  # Validate intent and scope values.
  #
  # @param query [Query] Query to validate
  # @raise [ArgumentError] if intent or scope is invalid
  def validate_query!(query)
    unless VALID_INTENTS.include?(query.intent)
      raise ArgumentError, "Invalid intent: #{query.intent}. Must be one of #{VALID_INTENTS.join(', ')}"
    end

    return if VALID_SCOPES.include?(query.scope)

    raise ArgumentError, "Invalid scope: #{query.scope}. Must be one of #{VALID_SCOPES.join(', ')}"
  end
end
|
|
147
|
+
end
|
|
148
|
+
end
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
module CodebaseIndex
|
|
6
|
+
module Evaluation
|
|
7
|
+
# Generates JSON reports from evaluation results.
|
|
8
|
+
#
|
|
9
|
+
# Takes an EvaluationReport and produces a structured JSON document
|
|
10
|
+
# with per-query scores, aggregate metrics, and metadata.
|
|
11
|
+
#
|
|
12
|
+
# @example
|
|
13
|
+
# generator = ReportGenerator.new
|
|
14
|
+
# json = generator.generate(report)
|
|
15
|
+
# generator.save(report, "tmp/eval_report.json")
|
|
16
|
+
#
|
|
17
|
+
class ReportGenerator
  # #save needs FileUtils and #build_metadata needs Time#iso8601 (provided by
  # the 'time' library on Rubies where it is not built in). Required here so
  # the class does not depend on another file having loaded them.
  require 'fileutils'
  require 'json'
  require 'time'

  # Number of decimal places kept for floating-point metrics.
  SCORE_PRECISION = 4

  # Generate a JSON string from an evaluation report.
  #
  # @param report [Evaluator::EvaluationReport] Evaluation report
  # @param metadata [Hash] Optional metadata to include
  # @return [String] Pretty-printed JSON
  def generate(report, metadata: {})
    JSON.pretty_generate(build_report_hash(report, metadata))
  end

  # Save an evaluation report to a JSON file, creating the parent
  # directory if it does not exist.
  #
  # @param report [Evaluator::EvaluationReport] Evaluation report
  # @param path [String] Output file path
  # @param metadata [Hash] Optional metadata to include
  # @return [void]
  def save(report, path, metadata: {})
    FileUtils.mkdir_p(File.dirname(path))
    File.write(path, generate(report, metadata: metadata))
  end

  private

  # Build the complete report hash.
  #
  # @param report [Evaluator::EvaluationReport] Evaluation report
  # @param metadata [Hash] Additional metadata
  # @return [Hash]
  def build_report_hash(report, metadata)
    {
      'metadata' => build_metadata(metadata),
      'aggregates' => serialize_aggregates(report.aggregates),
      'results' => report.results.map { |r| serialize_result(r) }
    }
  end

  # Build the metadata section.
  #
  # Caller-supplied keys are stringified and override the generated
  # 'generated_at' / 'version' entries on collision.
  #
  # @param extra [Hash, nil] Additional metadata
  # @return [Hash]
  def build_metadata(extra)
    {
      'generated_at' => Time.now.iso8601,
      'version' => defined?(CodebaseIndex::VERSION) ? CodebaseIndex::VERSION : 'unknown'
    }.merge((extra || {}).transform_keys(&:to_s))
  end

  # Serialize aggregate metrics.
  #
  # @param aggregates [Hash] Aggregate metrics with symbol keys
  # @return [Hash] String-keyed hash
  def serialize_aggregates(aggregates)
    round_metrics(aggregates)
  end

  # Serialize a single query result.
  #
  # @param result [Evaluator::QueryResult] Query result
  # @return [Hash]
  def serialize_result(result)
    {
      'query' => result.query,
      'expected_units' => result.expected_units,
      'retrieved_units' => result.retrieved_units,
      'scores' => round_metrics(result.scores),
      'tokens_used' => result.tokens_used
    }
  end

  # Stringify metric keys and round Float values to SCORE_PRECISION places.
  # Non-Float values (integers, nil, strings) pass through untouched, so a
  # missing score does not abort report generation.
  #
  # @param metrics [Hash] Metric name => value
  # @return [Hash] String-keyed hash
  def round_metrics(metrics)
    metrics.transform_keys(&:to_s).transform_values do |value|
      value.is_a?(Float) ? value.round(SCORE_PRECISION) : value
    end
  end
end
|
|
89
|
+
end
|
|
90
|
+
end
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'digest'
|
|
4
|
+
require 'json'
|
|
5
|
+
|
|
6
|
+
module CodebaseIndex
|
|
7
|
+
# ExtractedUnit represents a single meaningful unit of code from the codebase.
|
|
8
|
+
#
|
|
9
|
+
# This could be a model, controller, service, component, or framework source.
|
|
10
|
+
# Each unit is self-contained with its source code, metadata, and relationship
|
|
11
|
+
# information. Units are serialized to JSON for consumption by the indexing pipeline.
|
|
12
|
+
#
|
|
13
|
+
# @example Creating a model unit
|
|
14
|
+
# unit = ExtractedUnit.new(
|
|
15
|
+
# type: :model,
|
|
16
|
+
# identifier: "User",
|
|
17
|
+
# file_path: "app/models/user.rb"
|
|
18
|
+
# )
|
|
19
|
+
# unit.source_code = File.read(unit.file_path)
|
|
20
|
+
# unit.metadata = { associations: [...], callbacks: [...] }
|
|
21
|
+
# unit.dependencies = [{ type: :service, target: "UserService" }]
|
|
22
|
+
#
|
|
23
|
+
class ExtractedUnit
  # Time#iso8601 (used by #to_h) comes from the 'time' library on Rubies where
  # it is not built in. Required here so the class works standalone.
  require 'digest'
  require 'json'
  require 'time'

  # Characters-per-token ratio used by every token estimate in this class.
  CHARS_PER_TOKEN = 4.0

  attr_accessor :type,         # Symbol: :model, :controller, :service, :component, :job, :rails_source, :gem_source
                :identifier,   # String: Unique key, e.g., "User", "Users::RegistrationsController#create"
                :file_path,    # String: Absolute path to source file
                :namespace,    # String: Module namespace if any
                :source_code,  # String: The actual code, with concerns inlined for models
                :metadata,     # Hash: Type-specific structured data
                :dependencies, # Array<Hash>: What this unit calls/references
                :dependents,   # Array<Hash>: What references this unit (populated in second pass)
                :chunks        # Array<Hash>: Pre-chunked versions if unit is large

  # @param type [Symbol] Kind of unit
  # @param identifier [String] Unique key for the unit
  # @param file_path [String] Path to the source file
  def initialize(type:, identifier:, file_path:)
    @type = type
    @identifier = identifier
    @file_path = file_path
    @metadata = {}
    @dependencies = []
    @dependents = []
    @chunks = []
  end

  # Serialize to hash for JSON output
  #
  # `extracted_at` is the time of this call, and `source_hash` is the SHA-256
  # of the source (of the empty string when there is no source).
  #
  # @return [Hash] Complete unit data for indexing pipeline
  def to_h
    {
      type: type,
      identifier: identifier,
      file_path: file_path,
      namespace: namespace,
      source_code: source_code,
      metadata: metadata,
      dependencies: dependencies,
      dependents: dependents,
      chunks: chunks,
      extracted_at: Time.now.iso8601,
      source_hash: Digest::SHA256.hexdigest(source_code || '')
    }
  end

  # Estimate token count for chunking decisions.
  # Benchmarked against tiktoken (cl100k_base) on 19 Ruby source files.
  # Actual mean is 4.41 chars/token. Uses 4.0 as a conservative floor
  # (~10.6% overestimate). See docs/TOKEN_BENCHMARK.md.
  #
  # Counts the source plus the JSON form of the metadata; nil source and
  # nil/empty metadata contribute zero.
  #
  # @return [Integer] Estimated token count
  def estimated_tokens
    metadata_tokens = metadata&.any? ? estimate_tokens(metadata.to_json) : 0
    estimate_tokens(source_code) + metadata_tokens
  end

  # Check if unit needs chunking based on size
  #
  # @param threshold [Integer] Token threshold for chunking (default: 1500)
  # @return [Boolean]
  def needs_chunking?(threshold: 1500)
    estimated_tokens > threshold
  end

  # Build semantic chunks for large units
  # Preserves context by including unit header in each chunk
  #
  # The source is split on line boundaries. `max_tokens` bounds the estimated
  # tokens of the source lines in a chunk; the header is added on top and is
  # included in each chunk's reported `estimated_tokens`. A single line larger
  # than `max_tokens` becomes its own oversized chunk rather than being split.
  #
  # @param max_tokens [Integer] Maximum tokens per chunk
  # @return [Array<Hash>] List of chunk hashes (empty when the unit has no
  #   source or does not exceed `max_tokens`)
  def build_default_chunks(max_tokens: 1500)
    # Use the caller's budget as the chunking threshold; there is nothing to
    # split when the unit has no source even if its metadata is large.
    return [] if source_code.nil? || !needs_chunking?(threshold: max_tokens)

    header = build_chunk_header
    header_tokens = estimate_tokens(header)

    built = []
    pending_lines = []
    pending_tokens = 0

    source_code.each_line do |line|
      line_tokens = estimate_tokens(line)

      # Flush before adding a line that would overflow the budget, but never
      # emit an empty chunk.
      if pending_lines.any? && pending_tokens + line_tokens > max_tokens
        built << build_chunk(built.size, header, pending_lines, pending_tokens + header_tokens)
        pending_lines = []
        pending_tokens = 0
      end

      pending_lines << line
      pending_tokens += line_tokens
    end

    # Final chunk
    built << build_chunk(built.size, header, pending_lines, pending_tokens + header_tokens) if pending_lines.any?

    built
  end

  private

  # Estimate the token count of a piece of text (nil counts as zero).
  #
  # @param text [String, nil]
  # @return [Integer]
  def estimate_tokens(text)
    text ? (text.length / CHARS_PER_TOKEN).ceil : 0
  end

  # Assemble one chunk hash from the header and a run of source lines.
  #
  # @param index [Integer] Zero-based position of the chunk
  # @param header [String] Context header prepended to the chunk
  # @param lines [Array<String>] Source lines belonging to the chunk
  # @param tokens [Integer] Estimated tokens for header plus lines
  # @return [Hash]
  def build_chunk(index, header, lines, tokens)
    content = header + lines.join
    {
      chunk_index: index,
      identifier: "#{identifier}#chunk_#{index}",
      content: content,
      content_hash: Digest::SHA256.hexdigest(content),
      estimated_tokens: tokens
    }
  end

  # Comment header identifying the unit, prepended to every chunk.
  #
  # @return [String]
  def build_chunk_header
    <<~HEADER
      # Unit: #{identifier} (#{type})
      # File: #{file_path}
      # Namespace: #{namespace || '(root)'}
      # ---
    HEADER
  end
end
|
|
145
|
+
end
|