woods 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +89 -0
- data/CODE_OF_CONDUCT.md +83 -0
- data/CONTRIBUTING.md +65 -0
- data/LICENSE.txt +21 -0
- data/README.md +406 -0
- data/exe/woods-console +59 -0
- data/exe/woods-console-mcp +22 -0
- data/exe/woods-mcp +34 -0
- data/exe/woods-mcp-http +37 -0
- data/exe/woods-mcp-start +58 -0
- data/lib/generators/woods/install_generator.rb +32 -0
- data/lib/generators/woods/pgvector_generator.rb +37 -0
- data/lib/generators/woods/templates/add_pgvector_to_woods.rb.erb +15 -0
- data/lib/generators/woods/templates/create_woods_tables.rb.erb +43 -0
- data/lib/tasks/woods.rake +621 -0
- data/lib/tasks/woods_evaluation.rake +115 -0
- data/lib/woods/ast/call_site_extractor.rb +106 -0
- data/lib/woods/ast/method_extractor.rb +71 -0
- data/lib/woods/ast/node.rb +116 -0
- data/lib/woods/ast/parser.rb +614 -0
- data/lib/woods/ast.rb +6 -0
- data/lib/woods/builder.rb +200 -0
- data/lib/woods/cache/cache_middleware.rb +199 -0
- data/lib/woods/cache/cache_store.rb +264 -0
- data/lib/woods/cache/redis_cache_store.rb +116 -0
- data/lib/woods/cache/solid_cache_store.rb +111 -0
- data/lib/woods/chunking/chunk.rb +84 -0
- data/lib/woods/chunking/semantic_chunker.rb +295 -0
- data/lib/woods/console/adapters/cache_adapter.rb +58 -0
- data/lib/woods/console/adapters/good_job_adapter.rb +33 -0
- data/lib/woods/console/adapters/job_adapter.rb +68 -0
- data/lib/woods/console/adapters/sidekiq_adapter.rb +33 -0
- data/lib/woods/console/adapters/solid_queue_adapter.rb +33 -0
- data/lib/woods/console/audit_logger.rb +75 -0
- data/lib/woods/console/bridge.rb +177 -0
- data/lib/woods/console/confirmation.rb +90 -0
- data/lib/woods/console/connection_manager.rb +173 -0
- data/lib/woods/console/console_response_renderer.rb +74 -0
- data/lib/woods/console/embedded_executor.rb +373 -0
- data/lib/woods/console/model_validator.rb +81 -0
- data/lib/woods/console/rack_middleware.rb +87 -0
- data/lib/woods/console/safe_context.rb +82 -0
- data/lib/woods/console/server.rb +612 -0
- data/lib/woods/console/sql_validator.rb +172 -0
- data/lib/woods/console/tools/tier1.rb +118 -0
- data/lib/woods/console/tools/tier2.rb +117 -0
- data/lib/woods/console/tools/tier3.rb +110 -0
- data/lib/woods/console/tools/tier4.rb +79 -0
- data/lib/woods/coordination/pipeline_lock.rb +109 -0
- data/lib/woods/cost_model/embedding_cost.rb +88 -0
- data/lib/woods/cost_model/estimator.rb +128 -0
- data/lib/woods/cost_model/provider_pricing.rb +67 -0
- data/lib/woods/cost_model/storage_cost.rb +52 -0
- data/lib/woods/cost_model.rb +22 -0
- data/lib/woods/db/migrations/001_create_units.rb +38 -0
- data/lib/woods/db/migrations/002_create_edges.rb +35 -0
- data/lib/woods/db/migrations/003_create_embeddings.rb +37 -0
- data/lib/woods/db/migrations/004_create_snapshots.rb +45 -0
- data/lib/woods/db/migrations/005_create_snapshot_units.rb +40 -0
- data/lib/woods/db/migrations/006_rename_tables.rb +34 -0
- data/lib/woods/db/migrator.rb +73 -0
- data/lib/woods/db/schema_version.rb +73 -0
- data/lib/woods/dependency_graph.rb +236 -0
- data/lib/woods/embedding/indexer.rb +140 -0
- data/lib/woods/embedding/openai.rb +126 -0
- data/lib/woods/embedding/provider.rb +162 -0
- data/lib/woods/embedding/text_preparer.rb +112 -0
- data/lib/woods/evaluation/baseline_runner.rb +115 -0
- data/lib/woods/evaluation/evaluator.rb +139 -0
- data/lib/woods/evaluation/metrics.rb +79 -0
- data/lib/woods/evaluation/query_set.rb +148 -0
- data/lib/woods/evaluation/report_generator.rb +90 -0
- data/lib/woods/extracted_unit.rb +145 -0
- data/lib/woods/extractor.rb +1028 -0
- data/lib/woods/extractors/action_cable_extractor.rb +201 -0
- data/lib/woods/extractors/ast_source_extraction.rb +46 -0
- data/lib/woods/extractors/behavioral_profile.rb +309 -0
- data/lib/woods/extractors/caching_extractor.rb +261 -0
- data/lib/woods/extractors/callback_analyzer.rb +246 -0
- data/lib/woods/extractors/concern_extractor.rb +292 -0
- data/lib/woods/extractors/configuration_extractor.rb +219 -0
- data/lib/woods/extractors/controller_extractor.rb +404 -0
- data/lib/woods/extractors/database_view_extractor.rb +278 -0
- data/lib/woods/extractors/decorator_extractor.rb +253 -0
- data/lib/woods/extractors/engine_extractor.rb +223 -0
- data/lib/woods/extractors/event_extractor.rb +211 -0
- data/lib/woods/extractors/factory_extractor.rb +289 -0
- data/lib/woods/extractors/graphql_extractor.rb +892 -0
- data/lib/woods/extractors/i18n_extractor.rb +117 -0
- data/lib/woods/extractors/job_extractor.rb +374 -0
- data/lib/woods/extractors/lib_extractor.rb +218 -0
- data/lib/woods/extractors/mailer_extractor.rb +269 -0
- data/lib/woods/extractors/manager_extractor.rb +188 -0
- data/lib/woods/extractors/middleware_extractor.rb +133 -0
- data/lib/woods/extractors/migration_extractor.rb +469 -0
- data/lib/woods/extractors/model_extractor.rb +988 -0
- data/lib/woods/extractors/phlex_extractor.rb +252 -0
- data/lib/woods/extractors/policy_extractor.rb +191 -0
- data/lib/woods/extractors/poro_extractor.rb +229 -0
- data/lib/woods/extractors/pundit_extractor.rb +223 -0
- data/lib/woods/extractors/rails_source_extractor.rb +473 -0
- data/lib/woods/extractors/rake_task_extractor.rb +343 -0
- data/lib/woods/extractors/route_extractor.rb +181 -0
- data/lib/woods/extractors/scheduled_job_extractor.rb +331 -0
- data/lib/woods/extractors/serializer_extractor.rb +339 -0
- data/lib/woods/extractors/service_extractor.rb +217 -0
- data/lib/woods/extractors/shared_dependency_scanner.rb +91 -0
- data/lib/woods/extractors/shared_utility_methods.rb +281 -0
- data/lib/woods/extractors/state_machine_extractor.rb +398 -0
- data/lib/woods/extractors/test_mapping_extractor.rb +225 -0
- data/lib/woods/extractors/validator_extractor.rb +211 -0
- data/lib/woods/extractors/view_component_extractor.rb +311 -0
- data/lib/woods/extractors/view_template_extractor.rb +261 -0
- data/lib/woods/feedback/gap_detector.rb +89 -0
- data/lib/woods/feedback/store.rb +119 -0
- data/lib/woods/filename_utils.rb +32 -0
- data/lib/woods/flow_analysis/operation_extractor.rb +206 -0
- data/lib/woods/flow_analysis/response_code_mapper.rb +154 -0
- data/lib/woods/flow_assembler.rb +290 -0
- data/lib/woods/flow_document.rb +191 -0
- data/lib/woods/flow_precomputer.rb +102 -0
- data/lib/woods/formatting/base.rb +30 -0
- data/lib/woods/formatting/claude_adapter.rb +98 -0
- data/lib/woods/formatting/generic_adapter.rb +56 -0
- data/lib/woods/formatting/gpt_adapter.rb +64 -0
- data/lib/woods/formatting/human_adapter.rb +78 -0
- data/lib/woods/graph_analyzer.rb +374 -0
- data/lib/woods/mcp/bootstrapper.rb +96 -0
- data/lib/woods/mcp/index_reader.rb +394 -0
- data/lib/woods/mcp/renderers/claude_renderer.rb +81 -0
- data/lib/woods/mcp/renderers/json_renderer.rb +17 -0
- data/lib/woods/mcp/renderers/markdown_renderer.rb +353 -0
- data/lib/woods/mcp/renderers/plain_renderer.rb +240 -0
- data/lib/woods/mcp/server.rb +962 -0
- data/lib/woods/mcp/tool_response_renderer.rb +85 -0
- data/lib/woods/model_name_cache.rb +51 -0
- data/lib/woods/notion/client.rb +217 -0
- data/lib/woods/notion/exporter.rb +219 -0
- data/lib/woods/notion/mapper.rb +40 -0
- data/lib/woods/notion/mappers/column_mapper.rb +57 -0
- data/lib/woods/notion/mappers/migration_mapper.rb +39 -0
- data/lib/woods/notion/mappers/model_mapper.rb +161 -0
- data/lib/woods/notion/mappers/shared.rb +22 -0
- data/lib/woods/notion/rate_limiter.rb +68 -0
- data/lib/woods/observability/health_check.rb +79 -0
- data/lib/woods/observability/instrumentation.rb +34 -0
- data/lib/woods/observability/structured_logger.rb +57 -0
- data/lib/woods/operator/error_escalator.rb +81 -0
- data/lib/woods/operator/pipeline_guard.rb +92 -0
- data/lib/woods/operator/status_reporter.rb +80 -0
- data/lib/woods/railtie.rb +38 -0
- data/lib/woods/resilience/circuit_breaker.rb +99 -0
- data/lib/woods/resilience/index_validator.rb +167 -0
- data/lib/woods/resilience/retryable_provider.rb +108 -0
- data/lib/woods/retrieval/context_assembler.rb +261 -0
- data/lib/woods/retrieval/query_classifier.rb +133 -0
- data/lib/woods/retrieval/ranker.rb +277 -0
- data/lib/woods/retrieval/search_executor.rb +316 -0
- data/lib/woods/retriever.rb +152 -0
- data/lib/woods/ruby_analyzer/class_analyzer.rb +170 -0
- data/lib/woods/ruby_analyzer/dataflow_analyzer.rb +77 -0
- data/lib/woods/ruby_analyzer/fqn_builder.rb +18 -0
- data/lib/woods/ruby_analyzer/mermaid_renderer.rb +280 -0
- data/lib/woods/ruby_analyzer/method_analyzer.rb +143 -0
- data/lib/woods/ruby_analyzer/trace_enricher.rb +143 -0
- data/lib/woods/ruby_analyzer.rb +87 -0
- data/lib/woods/session_tracer/file_store.rb +104 -0
- data/lib/woods/session_tracer/middleware.rb +143 -0
- data/lib/woods/session_tracer/redis_store.rb +106 -0
- data/lib/woods/session_tracer/session_flow_assembler.rb +254 -0
- data/lib/woods/session_tracer/session_flow_document.rb +223 -0
- data/lib/woods/session_tracer/solid_cache_store.rb +139 -0
- data/lib/woods/session_tracer/store.rb +81 -0
- data/lib/woods/storage/graph_store.rb +120 -0
- data/lib/woods/storage/metadata_store.rb +196 -0
- data/lib/woods/storage/pgvector.rb +195 -0
- data/lib/woods/storage/qdrant.rb +205 -0
- data/lib/woods/storage/vector_store.rb +167 -0
- data/lib/woods/temporal/json_snapshot_store.rb +245 -0
- data/lib/woods/temporal/snapshot_store.rb +345 -0
- data/lib/woods/token_utils.rb +19 -0
- data/lib/woods/version.rb +5 -0
- data/lib/woods.rb +246 -0
- metadata +270 -0
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Woods
  module Db
    module Migrations
      # Migration 6: moves the legacy codebase_* tables to their woods_* names
      # (part of the gem rename).
      module RenameTables
        VERSION = 6

        # Rename every legacy table that is present on the connection.
        #
        # Each legacy name is looked up in `sqlite_master` first; a table that
        # is absent (e.g. on a fresh install) is left alone so no ALTER is
        # issued for it.
        #
        # @param connection [Object] Database connection responding to #execute
        # @return [void]
        def self.up(connection)
          table_map = {
            'codebase_units' => 'woods_units',
            'codebase_edges' => 'woods_edges',
            'codebase_embeddings' => 'woods_embeddings',
            'codebase_snapshots' => 'woods_snapshots',
            'codebase_snapshot_units' => 'woods_snapshot_units'
          }

          table_map.each do |legacy, current|
            lookup = "SELECT name FROM sqlite_master WHERE type='table' AND name='#{legacy}'"
            matches = connection.execute(lookup)

            # An empty result means the legacy table does not exist.
            connection.execute("ALTER TABLE #{legacy} RENAME TO #{current}") unless matches.empty?
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'schema_version'
|
|
4
|
+
require_relative 'migrations/001_create_units'
|
|
5
|
+
require_relative 'migrations/002_create_edges'
|
|
6
|
+
require_relative 'migrations/003_create_embeddings'
|
|
7
|
+
require_relative 'migrations/004_create_snapshots'
|
|
8
|
+
require_relative 'migrations/005_create_snapshot_units'
|
|
9
|
+
require_relative 'migrations/006_rename_tables'
|
|
10
|
+
|
|
11
|
+
module Woods
  module Db
    # Applies the gem's schema migrations to a database connection.
    #
    # Applied versions are tracked through {SchemaVersion}; only migrations
    # whose VERSION has not been recorded yet are executed. Every migration is
    # a module under `db/migrations/` that exposes a VERSION constant and a
    # `.up(connection)` class method.
    #
    # @example
    #   db = SQLite3::Database.new('woods.db')
    #   migrator = Migrator.new(connection: db)
    #   migrator.migrate! # => versions just applied, e.g. [1, 2, 3, 4, 5, 6]
    #
    class Migrator
      # Known migrations, listed in the order they are applied.
      MIGRATIONS = [
        Migrations::CreateUnits,
        Migrations::CreateEdges,
        Migrations::CreateEmbeddings,
        Migrations::CreateSnapshots,
        Migrations::CreateSnapshotUnits,
        Migrations::RenameTables
      ].freeze

      attr_reader :schema_version

      # Builds the migrator and makes sure the version-tracking table exists.
      #
      # @param connection [Object] Database connection supporting #execute
      def initialize(connection:)
        @connection = connection
        @schema_version = SchemaVersion.new(connection: connection)
        @schema_version.ensure_table!
      end

      # Run every pending migration, recording each version right after its
      # `up` step returns.
      #
      # @return [Array<Integer>] Version numbers of newly applied migrations
      def migrate!
        pending_migrations.map do |migration|
          migration.up(@connection)
          version = migration::VERSION
          @schema_version.record_version(version)
          version
        end
      end

      # Version numbers of the migrations that have not been applied yet.
      #
      # @return [Array<Integer>]
      def pending_versions
        pending_migrations.map { |migration| migration::VERSION }
      end

      private

      # Migration modules whose VERSION is missing from the applied list.
      #
      # @return [Array<Module>] Pending migration modules
      def pending_migrations
        done = @schema_version.applied_versions
        MIGRATIONS.select { |migration| !done.include?(migration::VERSION) }
      end
    end
  end
end
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Woods
  module Db
    # Tracks which schema migrations have been applied.
    #
    # Uses a simple `woods_schema_migrations` table keyed by `version`, with an
    # `applied_at` timestamp filled in by the database. The connection must
    # respond to `execute(sql, binds)` and return rows as arrays or as hashes
    # keyed by column name. The SQL issued here is SQLite dialect
    # (`INSERT OR IGNORE`, `datetime('now')`).
    #
    # @example
    #   db = SQLite3::Database.new('woods.db')
    #   sv = SchemaVersion.new(connection: db)
    #   sv.ensure_table!
    #   sv.current_version  # => 0
    #   sv.record_version(1)
    #   sv.current_version  # => 1
    #
    class SchemaVersion
      TABLE_NAME = 'woods_schema_migrations'

      # @param connection [Object] Database connection supporting #execute
      def initialize(connection:)
        @connection = connection
      end

      # Create the schema migrations table if it does not exist.
      #
      # @return [void]
      def ensure_table!
        @connection.execute(<<~SQL)
          CREATE TABLE IF NOT EXISTS #{TABLE_NAME} (
            version INTEGER PRIMARY KEY NOT NULL,
            applied_at TEXT NOT NULL DEFAULT (datetime('now'))
          )
        SQL
      end

      # List all applied migration version numbers, sorted ascending.
      #
      # Values are coerced to Integer so that connections which hand back
      # column values as strings still compare equal to the Integer VERSION
      # constants used by callers.
      #
      # @return [Array<Integer>]
      # @raise [ArgumentError, TypeError] if a stored version is not integer-like
      def applied_versions
        rows = @connection.execute("SELECT version FROM #{TABLE_NAME} ORDER BY version ASC")
        rows.map do |row|
          # Rows arrive either positionally (Array) or keyed by column name.
          raw = row.is_a?(Array) ? row[0] : row['version']
          Integer(raw)
        end
      end

      # Record a migration version as applied. Recording a version that is
      # already present is a no-op (INSERT OR IGNORE).
      #
      # @param version [Integer] The migration version number
      # @return [void]
      def record_version(version)
        @connection.execute(
          "INSERT OR IGNORE INTO #{TABLE_NAME} (version) VALUES (?)", [version]
        )
      end

      # Check whether a version has been applied.
      #
      # @param version [Integer]
      # @return [Boolean]
      def applied?(version)
        applied_versions.include?(version)
      end

      # The highest applied version, or 0 if none.
      #
      # @return [Integer]
      def current_version
        applied_versions.last || 0
      end
    end
  end
end
|
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'set'
|
|
4
|
+
require 'json'
|
|
5
|
+
|
|
6
|
+
module Woods
  # DependencyGraph tracks relationships between code units for:
  # 1. Understanding what depends on what
  # 2. Computing "blast radius" for incremental re-indexing
  # 3. Enabling graph-based retrieval queries
  #
  # The graph is bidirectional - we track both what a unit depends on
  # and what depends on that unit (reverse edges).
  #
  # @example Building and querying the graph
  #   graph = DependencyGraph.new
  #   graph.register(user_model_unit)
  #   graph.register(user_service_unit)
  #
  #   # Find everything affected by a change to user.rb
  #   affected = graph.affected_by(["app/models/user.rb"])
  #
  class DependencyGraph
    def initialize
      @nodes = {}      # identifier => { type:, file_path:, namespace: }
      @edges = {}      # identifier => [dependency identifiers]
      @reverse = {}    # identifier => Set of dependent identifiers
      @file_map = {}   # file_path => identifier
      @type_index = {} # type => Set of identifiers
      @to_h = nil      # memoized serialization, reset by #register
    end

    # Register a unit in the graph.
    #
    # Registering an identifier that is already present replaces the earlier
    # registration: reverse edges, type index and file map entries created by
    # the previous version of the unit are retracted first, so all indexes
    # stay consistent with the forward edges.
    #
    # @param unit [ExtractedUnit] The unit to register
    def register(unit)
      @to_h = nil

      retract_registration(unit) if @nodes.key?(unit.identifier)

      @nodes[unit.identifier] = {
        type: unit.type,
        file_path: unit.file_path,
        namespace: unit.namespace
      }

      @edges[unit.identifier] = unit.dependencies.map { |d| d[:target] }
      @file_map[unit.file_path] = unit.identifier if unit.file_path

      # Type index for filtering (Set-based for O(1) insert)
      (@type_index[unit.type] ||= Set.new).add(unit.identifier)

      # Build reverse edges (Set-based for O(1) insert)
      unit.dependencies.each do |dep|
        (@reverse[dep[:target]] ||= Set.new).add(unit.identifier)
      end
    end

    # Find all units affected by changes to given files
    # Uses BFS to find transitive dependents
    #
    # @param changed_files [Array<String>] List of changed file paths
    # @param max_depth [Integer] Maximum traversal depth (nil for unlimited)
    # @return [Array<String>] List of affected unit identifiers
    def affected_by(changed_files, max_depth: nil)
      directly_changed = changed_files.filter_map { |f| @file_map[f] }

      affected = Set.new(directly_changed)
      queue = directly_changed.map { |id| [id, 0] } # [identifier, depth]

      while queue.any?
        current, depth = queue.shift
        # Nodes sitting at the depth limit are reported but not expanded.
        next if max_depth && depth >= max_depth

        dependents = @reverse[current] || []

        dependents.each do |dep|
          unless affected.include?(dep)
            affected.add(dep)
            queue.push([dep, depth + 1])
          end
        end
      end

      affected.to_a
    end

    # Check if a node exists in the graph by exact identifier.
    #
    # @param identifier [String] Unit identifier to check
    # @return [Boolean] true if the node exists
    def node_exists?(identifier)
      @nodes.key?(identifier)
    end

    # Find a node by suffix matching (e.g., "Update" matches "Order::Update").
    #
    # When multiple nodes share the same suffix, the first match wins.
    # Suffix matching requires a "::" separator — bare identifiers (no namespace)
    # are not matched by this method; use {#node_exists?} for exact lookups.
    #
    # @param suffix [String] The suffix to match against
    # @return [String, nil] The first matching identifier, or nil
    def find_node_by_suffix(suffix)
      target_suffix = "::#{suffix}"
      @nodes.keys.find { |id| id.end_with?(target_suffix) }
    end

    # Get direct dependencies of a unit
    #
    # @param identifier [String] Unit identifier
    # @return [Array<String>] List of dependency identifiers
    def dependencies_of(identifier)
      @edges[identifier] || []
    end

    # Get direct dependents of a unit (what depends on it)
    #
    # @param identifier [String] Unit identifier
    # @return [Array<String>] List of dependent identifiers
    def dependents_of(identifier)
      @reverse.fetch(identifier, Set.new).to_a
    end

    # Get all units of a specific type
    #
    # @param type [Symbol] Unit type (:model, :controller, etc.)
    # @return [Array<String>] List of unit identifiers
    def units_of_type(type)
      @type_index.fetch(type, Set.new).to_a
    end

    # Compute PageRank scores for all nodes
    #
    # Uses the reverse edges (dependents) as the link structure: a node
    # with many dependents gets a higher score. This matches Aider's insight
    # that structural importance correlates with retrieval relevance.
    #
    # @param damping [Float] Damping factor (default: 0.85)
    # @param iterations [Integer] Number of iterations (default: 20)
    # @return [Hash<String, Float>] Identifier => PageRank score
    def pagerank(damping: 0.85, iterations: 20)
      n = @nodes.size
      return {} if n.zero?

      node_ids = @nodes.keys
      base_score = 1.0 / n
      scores = node_ids.to_h { |id| [id, base_score] }

      iterations.times do
        # Collect rank from dangling nodes (no outgoing edges) and redistribute
        dangling_sum = node_ids.sum do |id|
          @edges[id].nil? || @edges[id].empty? ? scores[id] : 0.0
        end

        new_scores = {}

        node_ids.each do |id|
          # Sum contributions from nodes that depend on this one
          incoming = @reverse[id] || []
          rank_sum = incoming.sum do |src|
            out_degree = (@edges[src] || []).size
            out_degree.positive? ? scores[src] / out_degree : 0.0
          end

          new_scores[id] = ((1.0 - damping) / n) + (damping * (rank_sum + (dangling_sum / n)))
        end

        scores = new_scores
      end

      scores
    end

    # Serialize graph for persistence. Memoized — cache is invalidated on register.
    #
    # Returns a shallow copy of the cached hash: callers may add or remove
    # top-level keys freely, but the nested collections are shared and must be
    # treated as read-only (:nodes, :edges and :file_map are the graph's own
    # internal hashes).
    #
    # @return [Hash] Complete graph data
    def to_h
      @to_h ||= {
        nodes: @nodes,
        edges: @edges,
        reverse: @reverse.transform_values(&:to_a),
        file_map: @file_map,
        type_index: @type_index.transform_values(&:to_a),
        stats: {
          node_count: @nodes.size,
          edge_count: @edges.values.sum(&:size),
          types: @type_index.transform_values(&:size)
        }
      }
      @to_h.dup
    end

    # Load graph from persisted data
    #
    # After JSON round-trip all keys become strings. This method normalizes
    # them back to the expected types: node values use symbol keys (:type,
    # :file_path, :namespace), and type_index uses symbol keys for types.
    #
    # @param data [Hash] Previously serialized graph data
    # @return [DependencyGraph] Restored graph
    def self.from_h(data)
      graph = new

      raw_nodes = data[:nodes] || data['nodes'] || {}
      graph.instance_variable_set(:@nodes, raw_nodes.transform_values { |v| symbolize_node(v) })

      graph.instance_variable_set(:@edges, data[:edges] || data['edges'] || {})

      raw_reverse = data[:reverse] || data['reverse'] || {}
      graph.instance_variable_set(:@reverse, raw_reverse.transform_values { |v| v.is_a?(Set) ? v : Set.new(v) })

      graph.instance_variable_set(:@file_map, data[:file_map] || data['file_map'] || {})

      raw_type_index = data[:type_index] || data['type_index'] || {}
      graph.instance_variable_set(:@type_index, raw_type_index.transform_keys(&:to_sym).transform_values do |v|
        v.is_a?(Set) ? v : Set.new(v)
      end)

      graph
    end

    # Normalize a node hash to use symbol keys
    #
    # @param node [Hash] Node data with string or symbol keys
    # @return [Hash] Node data with symbol keys
    def self.symbolize_node(node)
      return node unless node.is_a?(Hash)

      {
        type: (node[:type] || node['type'])&.to_sym,
        file_path: node[:file_path] || node['file_path'],
        namespace: node[:namespace] || node['namespace']
      }
    end

    private

    # Undo the index entries left by an earlier registration of the same
    # identifier, so that re-registering a changed unit does not leave stale
    # reverse edges, type membership or file mappings behind.
    #
    # @param unit [ExtractedUnit] The incoming replacement unit
    # @return [void]
    def retract_registration(unit)
      identifier = unit.identifier

      (@edges[identifier] || []).each do |target|
        dependents = @reverse[target]
        next unless dependents

        dependents.delete(identifier)
        @reverse.delete(target) if dependents.empty?
      end

      previous = @nodes[identifier]
      return unless previous.is_a?(Hash)

      retract_type(identifier, previous[:type]) if previous[:type] != unit.type
      retract_file(identifier, previous[:file_path]) if previous[:file_path] != unit.file_path
    end

    # Remove the identifier from a type bucket, dropping the bucket once empty.
    def retract_type(identifier, type)
      members = @type_index[type]
      return unless members

      members.delete(identifier)
      @type_index.delete(type) if members.empty?
    end

    # Remove a file mapping, but only if it still points at this identifier
    # (another unit may have claimed the same path since).
    def retract_file(identifier, file_path)
      @file_map.delete(file_path) if file_path && @file_map[file_path] == identifier
    end
  end
end
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
require 'digest'
|
|
5
|
+
|
|
6
|
+
module Woods
  module Embedding
    # Orchestrates the indexing pipeline: reads extracted units, prepares text,
    # generates embeddings, and stores vectors. Supports full and incremental
    # modes with checkpoint-based resumability.
    #
    # The checkpoint is a JSON file (`checkpoint.json` in the output directory)
    # mapping unit identifier => source hash of the version last stored.
    class Indexer
      # @param provider [Object] Embedding provider responding to #embed_batch(texts)
      # @param text_preparer [Object] Responds to #prepare(unit) and #prepare_chunks(unit)
      # @param vector_store [Object] Responds to #store_batch(entries)
      # @param output_dir [String] Directory holding the extracted unit JSON files
      # @param batch_size [Integer] Number of units per embedding batch (default: 32)
      # @param checkpoint_interval [Integer] Save checkpoint every N batches (default: 10)
      def initialize(provider:, text_preparer:, vector_store:, output_dir:, batch_size: 32, checkpoint_interval: 10) # rubocop:disable Metrics/ParameterLists
        @provider = provider
        @text_preparer = text_preparer
        @vector_store = vector_store
        @output_dir = output_dir
        @batch_size = batch_size
        @checkpoint_interval = checkpoint_interval
      end

      # Index all extracted units (full mode). Returns stats hash.
      # @return [Hash] Stats with :processed, :skipped, :errors counts
      # @raise [Woods::Error] if embedding or storing a batch fails
      def index_all
        process_units(load_units, incremental: false)
      end

      # Index only changed units (incremental mode). Returns stats hash.
      # @return [Hash] Stats with :processed, :skipped, :errors counts
      # @raise [Woods::Error] if embedding or storing a batch fails
      def index_incremental
        process_units(load_units, incremental: true)
      end

      private

      # Read every unit JSON document below the output directory.
      #
      # Files that are not valid JSON, and JSON documents that are not a hash
      # carrying an 'identifier' (arrays, hashes without that key), are
      # skipped. Paths are sorted so batches are formed in a stable order.
      #
      # @return [Array<Hash>] Parsed unit hashes
      def load_units
        Dir.glob(File.join(@output_dir, '**', '*.json')).sort.filter_map do |path|
          next if File.basename(path) == 'checkpoint.json'

          data = JSON.parse(File.read(path))
          data if data.is_a?(Hash) && data['identifier']
        rescue JSON::ParserError
          nil
        end
      end

      # Run all batches, checkpointing every @checkpoint_interval batches and
      # once more on the way out.
      #
      # @param units [Array<Hash>] Parsed unit hashes
      # @param incremental [Boolean] Whether to skip units matching the checkpoint
      # @return [Hash] Stats with :processed, :skipped, :errors counts
      def process_units(units, incremental:)
        checkpoint = incremental ? load_checkpoint : {}
        stats = { processed: 0, skipped: 0, errors: 0 }
        batch_count = 0

        begin
          units.each_slice(@batch_size) do |batch|
            process_batch(batch, checkpoint, stats, incremental: incremental)
            batch_count += 1
            save_checkpoint(checkpoint) if (batch_count % @checkpoint_interval).zero?
          end
        ensure
          # Always save the final checkpoint — also when a batch raised, so the
          # batches already stored are not re-embedded on the next run. The
          # checkpoint only ever contains units whose vectors were stored.
          save_checkpoint(checkpoint)
        end

        stats
      end

      # Collect the texts to embed for one batch of units, then embed and store.
      def process_batch(batch, checkpoint, stats, incremental:)
        to_embed = batch.each_with_object([]) do |unit_data, items|
          # Unchanged since the last successful run: same identifier, same hash.
          if incremental && checkpoint[unit_data['identifier']] == unit_data['source_hash']
            stats[:skipped] += 1
            next
          end
          collect_embed_items(unit_data, items)
        end

        embed_and_store(to_embed, checkpoint, stats)
      end

      # Append one embed item per prepared text of the unit. Units yielding
      # several texts get "#chunk_N" suffixed ids; single-text units keep the
      # plain identifier.
      def collect_embed_items(unit_data, items)
        texts = prepare_texts(unit_data)
        identifier = unit_data['identifier']

        texts.each_with_index do |text, idx|
          embed_id = texts.length > 1 ? "#{identifier}#chunk_#{idx}" : identifier
          items << { id: embed_id, text: text, unit_data: unit_data,
                     source_hash: unit_data['source_hash'], identifier: identifier }
        end
      end

      # @return [Array<String>] Chunk texts when the unit has chunks, otherwise
      #   a single-element array with the whole-unit text
      def prepare_texts(unit_data)
        unit = build_unit(unit_data)
        unit.chunks&.any? ? @text_preparer.prepare_chunks(unit) : [@text_preparer.prepare(unit)]
      end

      # Rebuild an ExtractedUnit from its parsed JSON form.
      def build_unit(data)
        unit = ExtractedUnit.new(type: data['type']&.to_sym, identifier: data['identifier'],
                                 file_path: data['file_path'])
        unit.namespace = data['namespace']
        unit.source_code = data['source_code']
        unit.dependencies = data['dependencies'] || []
        unit.chunks = (data['chunks'] || []).map { |c| c.transform_keys(&:to_sym) }
        unit
      end

      # Embed the item texts in one provider call and store the vectors.
      #
      # @raise [Woods::Error] wrapping any StandardError from the provider or store
      def embed_and_store(items, checkpoint, stats)
        return if items.empty?

        vectors = @provider.embed_batch(items.map { |i| i[:text] })
        store_vectors(items, vectors, checkpoint, stats)
      rescue StandardError => e
        stats[:errors] += items.size
        raise Woods::Error, "Embedding failed: #{e.message}"
      end

      # Store vectors, then mark the owning units as done in the checkpoint.
      # The checkpoint is only touched after the store call returned.
      def store_vectors(items, vectors, checkpoint, stats)
        entries = items.each_with_index.map do |item, idx|
          { id: item[:id], vector: vectors[idx],
            metadata: { type: item[:unit_data]['type'], identifier: item[:identifier],
                        file_path: item[:unit_data]['file_path'] } }
        end

        @vector_store.store_batch(entries)

        items.each do |item|
          checkpoint[item[:identifier]] = item[:source_hash]
          stats[:processed] += 1
        end
      end

      # @return [Hash] Saved checkpoint, or {} when the file is missing,
      #   unparsable, or does not contain a JSON object
      def load_checkpoint
        path = File.join(@output_dir, 'checkpoint.json')
        return {} unless File.exist?(path)

        parsed = JSON.parse(File.read(path))
        parsed.is_a?(Hash) ? parsed : {}
      rescue JSON::ParserError
        {}
      end

      # Persist the checkpoint as JSON next to the unit files.
      def save_checkpoint(checkpoint)
        File.write(File.join(@output_dir, 'checkpoint.json'), JSON.generate(checkpoint))
      end
    end
  end
end
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'net/http'
|
|
4
|
+
require 'json'
|
|
5
|
+
|
|
6
|
+
module Woods
|
|
7
|
+
module Embedding
|
|
8
|
+
module Provider
|
|
9
|
+
# OpenAI adapter for cloud embeddings via the OpenAI HTTP API.
|
|
10
|
+
#
|
|
11
|
+
# Uses the `/v1/embeddings` endpoint to generate embeddings. Requires a valid
|
|
12
|
+
# OpenAI API key.
|
|
13
|
+
#
|
|
14
|
+
# @example
|
|
15
|
+
# provider = Woods::Embedding::Provider::OpenAI.new(api_key: ENV['OPENAI_API_KEY'])
|
|
16
|
+
# vector = provider.embed("class User < ApplicationRecord; end")
|
|
17
|
+
# vectors = provider.embed_batch(["text1", "text2"])
|
|
18
|
+
class OpenAI
  include Interface

  ENDPOINT = URI('https://api.openai.com/v1/embeddings')
  DEFAULT_MODEL = 'text-embedding-3-small'

  # Vector sizes of the models known ahead of time; other models are probed.
  DIMENSIONS = {
    'text-embedding-3-small' => 1536,
    'text-embedding-3-large' => 3072
  }.freeze

  # Transport failures after which the request is re-sent once on a fresh connection.
  RETRYABLE_ERRORS = [Errno::ECONNRESET, Net::OpenTimeout, IOError].freeze

  # @param api_key [String] OpenAI API key
  # @param model [String] OpenAI embedding model name (default: text-embedding-3-small)
  def initialize(api_key:, model: DEFAULT_MODEL)
    @api_key = api_key
    @model = model
  end

  # Embed a single text string.
  #
  # @param text [String] the text to embed
  # @return [Array<Float>] the embedding vector
  # @raise [Woods::Error] if the API returns an error
  def embed(text)
    response = post_request({ model: @model, input: text })
    response['data'].first['embedding']
  end

  # Embed multiple texts in a single request.
  #
  # Sorts results by the index field to guarantee ordering matches input.
  # An empty (or nil) batch returns an empty array without contacting the
  # API, which would otherwise be sent a request with nothing to embed.
  #
  # @param texts [Array<String>] the texts to embed
  # @return [Array<Array<Float>>] array of embedding vectors
  # @raise [Woods::Error] if the API returns an error
  def embed_batch(texts)
    return [] if Array(texts).empty?

    response = post_request({ model: @model, input: texts })
    response['data']
      .sort_by { |item| item['index'] }
      .map { |item| item['embedding'] }
  end

  # Return the dimensionality of vectors produced by this model.
  #
  # Uses the known dimensions for standard models, falling back to a
  # test embedding for unknown models. The result is memoized so the
  # fallback performs at most one API request per provider instance.
  #
  # @return [Integer] number of dimensions
  def dimensions
    @dimensions ||= DIMENSIONS[@model] || embed('test').length
  end

  # Return the model name.
  #
  # @return [String] the OpenAI model name
  def model_name
    @model
  end

  private

  # Send a POST request to the OpenAI embeddings API.
  #
  # If the connection fails with one of RETRYABLE_ERRORS, the cached client
  # is discarded and the same request is sent once more; a second failure
  # propagates to the caller unchanged.
  #
  # @param body [Hash] request body
  # @return [Hash] parsed JSON response
  # @raise [Woods::Error] if the API returns a non-success status
  def post_request(body)
    request = Net::HTTP::Post.new(ENDPOINT.path)
    request['Content-Type'] = 'application/json'
    request['Authorization'] = "Bearer #{@api_key}"
    request.body = body.to_json

    retried = false
    begin
      parse_response(http_client.request(request))
    rescue *RETRYABLE_ERRORS
      raise if retried

      # Connection dropped — reset and retry once
      retried = true
      @http_client = nil
      retry
    end
  end

  # Validate the HTTP status and decode the JSON body.
  #
  # @param response [Net::HTTPResponse] response returned by the HTTP client
  # @return [Hash] parsed JSON response
  # @raise [Woods::Error] if the response is not a success status
  def parse_response(response)
    unless response.is_a?(Net::HTTPSuccess)
      raise Woods::Error, "OpenAI API error: #{response.code} #{response.body}"
    end

    JSON.parse(response.body)
  end

  # Return a reusable, started HTTP client for the OpenAI API.
  # Calling http.start opens a persistent TCP connection so
  # keep_alive_timeout actually takes effect across requests.
  #
  # @return [Net::HTTP]
  def http_client
    return @http_client if @http_client&.started?

    http = Net::HTTP.new(ENDPOINT.host, ENDPOINT.port)
    http.use_ssl = true
    http.open_timeout = 10
    http.read_timeout = 30
    http.keep_alive_timeout = 30
    http.start
    @http_client = http
  end
end
|
|
124
|
+
end
|
|
125
|
+
end
|
|
126
|
+
end
|