woods 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +89 -0
- data/CODE_OF_CONDUCT.md +83 -0
- data/CONTRIBUTING.md +65 -0
- data/LICENSE.txt +21 -0
- data/README.md +406 -0
- data/exe/woods-console +59 -0
- data/exe/woods-console-mcp +22 -0
- data/exe/woods-mcp +34 -0
- data/exe/woods-mcp-http +37 -0
- data/exe/woods-mcp-start +58 -0
- data/lib/generators/woods/install_generator.rb +32 -0
- data/lib/generators/woods/pgvector_generator.rb +37 -0
- data/lib/generators/woods/templates/add_pgvector_to_woods.rb.erb +15 -0
- data/lib/generators/woods/templates/create_woods_tables.rb.erb +43 -0
- data/lib/tasks/woods.rake +621 -0
- data/lib/tasks/woods_evaluation.rake +115 -0
- data/lib/woods/ast/call_site_extractor.rb +106 -0
- data/lib/woods/ast/method_extractor.rb +71 -0
- data/lib/woods/ast/node.rb +116 -0
- data/lib/woods/ast/parser.rb +614 -0
- data/lib/woods/ast.rb +6 -0
- data/lib/woods/builder.rb +200 -0
- data/lib/woods/cache/cache_middleware.rb +199 -0
- data/lib/woods/cache/cache_store.rb +264 -0
- data/lib/woods/cache/redis_cache_store.rb +116 -0
- data/lib/woods/cache/solid_cache_store.rb +111 -0
- data/lib/woods/chunking/chunk.rb +84 -0
- data/lib/woods/chunking/semantic_chunker.rb +295 -0
- data/lib/woods/console/adapters/cache_adapter.rb +58 -0
- data/lib/woods/console/adapters/good_job_adapter.rb +33 -0
- data/lib/woods/console/adapters/job_adapter.rb +68 -0
- data/lib/woods/console/adapters/sidekiq_adapter.rb +33 -0
- data/lib/woods/console/adapters/solid_queue_adapter.rb +33 -0
- data/lib/woods/console/audit_logger.rb +75 -0
- data/lib/woods/console/bridge.rb +177 -0
- data/lib/woods/console/confirmation.rb +90 -0
- data/lib/woods/console/connection_manager.rb +173 -0
- data/lib/woods/console/console_response_renderer.rb +74 -0
- data/lib/woods/console/embedded_executor.rb +373 -0
- data/lib/woods/console/model_validator.rb +81 -0
- data/lib/woods/console/rack_middleware.rb +87 -0
- data/lib/woods/console/safe_context.rb +82 -0
- data/lib/woods/console/server.rb +612 -0
- data/lib/woods/console/sql_validator.rb +172 -0
- data/lib/woods/console/tools/tier1.rb +118 -0
- data/lib/woods/console/tools/tier2.rb +117 -0
- data/lib/woods/console/tools/tier3.rb +110 -0
- data/lib/woods/console/tools/tier4.rb +79 -0
- data/lib/woods/coordination/pipeline_lock.rb +109 -0
- data/lib/woods/cost_model/embedding_cost.rb +88 -0
- data/lib/woods/cost_model/estimator.rb +128 -0
- data/lib/woods/cost_model/provider_pricing.rb +67 -0
- data/lib/woods/cost_model/storage_cost.rb +52 -0
- data/lib/woods/cost_model.rb +22 -0
- data/lib/woods/db/migrations/001_create_units.rb +38 -0
- data/lib/woods/db/migrations/002_create_edges.rb +35 -0
- data/lib/woods/db/migrations/003_create_embeddings.rb +37 -0
- data/lib/woods/db/migrations/004_create_snapshots.rb +45 -0
- data/lib/woods/db/migrations/005_create_snapshot_units.rb +40 -0
- data/lib/woods/db/migrations/006_rename_tables.rb +34 -0
- data/lib/woods/db/migrator.rb +73 -0
- data/lib/woods/db/schema_version.rb +73 -0
- data/lib/woods/dependency_graph.rb +236 -0
- data/lib/woods/embedding/indexer.rb +140 -0
- data/lib/woods/embedding/openai.rb +126 -0
- data/lib/woods/embedding/provider.rb +162 -0
- data/lib/woods/embedding/text_preparer.rb +112 -0
- data/lib/woods/evaluation/baseline_runner.rb +115 -0
- data/lib/woods/evaluation/evaluator.rb +139 -0
- data/lib/woods/evaluation/metrics.rb +79 -0
- data/lib/woods/evaluation/query_set.rb +148 -0
- data/lib/woods/evaluation/report_generator.rb +90 -0
- data/lib/woods/extracted_unit.rb +145 -0
- data/lib/woods/extractor.rb +1028 -0
- data/lib/woods/extractors/action_cable_extractor.rb +201 -0
- data/lib/woods/extractors/ast_source_extraction.rb +46 -0
- data/lib/woods/extractors/behavioral_profile.rb +309 -0
- data/lib/woods/extractors/caching_extractor.rb +261 -0
- data/lib/woods/extractors/callback_analyzer.rb +246 -0
- data/lib/woods/extractors/concern_extractor.rb +292 -0
- data/lib/woods/extractors/configuration_extractor.rb +219 -0
- data/lib/woods/extractors/controller_extractor.rb +404 -0
- data/lib/woods/extractors/database_view_extractor.rb +278 -0
- data/lib/woods/extractors/decorator_extractor.rb +253 -0
- data/lib/woods/extractors/engine_extractor.rb +223 -0
- data/lib/woods/extractors/event_extractor.rb +211 -0
- data/lib/woods/extractors/factory_extractor.rb +289 -0
- data/lib/woods/extractors/graphql_extractor.rb +892 -0
- data/lib/woods/extractors/i18n_extractor.rb +117 -0
- data/lib/woods/extractors/job_extractor.rb +374 -0
- data/lib/woods/extractors/lib_extractor.rb +218 -0
- data/lib/woods/extractors/mailer_extractor.rb +269 -0
- data/lib/woods/extractors/manager_extractor.rb +188 -0
- data/lib/woods/extractors/middleware_extractor.rb +133 -0
- data/lib/woods/extractors/migration_extractor.rb +469 -0
- data/lib/woods/extractors/model_extractor.rb +988 -0
- data/lib/woods/extractors/phlex_extractor.rb +252 -0
- data/lib/woods/extractors/policy_extractor.rb +191 -0
- data/lib/woods/extractors/poro_extractor.rb +229 -0
- data/lib/woods/extractors/pundit_extractor.rb +223 -0
- data/lib/woods/extractors/rails_source_extractor.rb +473 -0
- data/lib/woods/extractors/rake_task_extractor.rb +343 -0
- data/lib/woods/extractors/route_extractor.rb +181 -0
- data/lib/woods/extractors/scheduled_job_extractor.rb +331 -0
- data/lib/woods/extractors/serializer_extractor.rb +339 -0
- data/lib/woods/extractors/service_extractor.rb +217 -0
- data/lib/woods/extractors/shared_dependency_scanner.rb +91 -0
- data/lib/woods/extractors/shared_utility_methods.rb +281 -0
- data/lib/woods/extractors/state_machine_extractor.rb +398 -0
- data/lib/woods/extractors/test_mapping_extractor.rb +225 -0
- data/lib/woods/extractors/validator_extractor.rb +211 -0
- data/lib/woods/extractors/view_component_extractor.rb +311 -0
- data/lib/woods/extractors/view_template_extractor.rb +261 -0
- data/lib/woods/feedback/gap_detector.rb +89 -0
- data/lib/woods/feedback/store.rb +119 -0
- data/lib/woods/filename_utils.rb +32 -0
- data/lib/woods/flow_analysis/operation_extractor.rb +206 -0
- data/lib/woods/flow_analysis/response_code_mapper.rb +154 -0
- data/lib/woods/flow_assembler.rb +290 -0
- data/lib/woods/flow_document.rb +191 -0
- data/lib/woods/flow_precomputer.rb +102 -0
- data/lib/woods/formatting/base.rb +30 -0
- data/lib/woods/formatting/claude_adapter.rb +98 -0
- data/lib/woods/formatting/generic_adapter.rb +56 -0
- data/lib/woods/formatting/gpt_adapter.rb +64 -0
- data/lib/woods/formatting/human_adapter.rb +78 -0
- data/lib/woods/graph_analyzer.rb +374 -0
- data/lib/woods/mcp/bootstrapper.rb +96 -0
- data/lib/woods/mcp/index_reader.rb +394 -0
- data/lib/woods/mcp/renderers/claude_renderer.rb +81 -0
- data/lib/woods/mcp/renderers/json_renderer.rb +17 -0
- data/lib/woods/mcp/renderers/markdown_renderer.rb +353 -0
- data/lib/woods/mcp/renderers/plain_renderer.rb +240 -0
- data/lib/woods/mcp/server.rb +962 -0
- data/lib/woods/mcp/tool_response_renderer.rb +85 -0
- data/lib/woods/model_name_cache.rb +51 -0
- data/lib/woods/notion/client.rb +217 -0
- data/lib/woods/notion/exporter.rb +219 -0
- data/lib/woods/notion/mapper.rb +40 -0
- data/lib/woods/notion/mappers/column_mapper.rb +57 -0
- data/lib/woods/notion/mappers/migration_mapper.rb +39 -0
- data/lib/woods/notion/mappers/model_mapper.rb +161 -0
- data/lib/woods/notion/mappers/shared.rb +22 -0
- data/lib/woods/notion/rate_limiter.rb +68 -0
- data/lib/woods/observability/health_check.rb +79 -0
- data/lib/woods/observability/instrumentation.rb +34 -0
- data/lib/woods/observability/structured_logger.rb +57 -0
- data/lib/woods/operator/error_escalator.rb +81 -0
- data/lib/woods/operator/pipeline_guard.rb +92 -0
- data/lib/woods/operator/status_reporter.rb +80 -0
- data/lib/woods/railtie.rb +38 -0
- data/lib/woods/resilience/circuit_breaker.rb +99 -0
- data/lib/woods/resilience/index_validator.rb +167 -0
- data/lib/woods/resilience/retryable_provider.rb +108 -0
- data/lib/woods/retrieval/context_assembler.rb +261 -0
- data/lib/woods/retrieval/query_classifier.rb +133 -0
- data/lib/woods/retrieval/ranker.rb +277 -0
- data/lib/woods/retrieval/search_executor.rb +316 -0
- data/lib/woods/retriever.rb +152 -0
- data/lib/woods/ruby_analyzer/class_analyzer.rb +170 -0
- data/lib/woods/ruby_analyzer/dataflow_analyzer.rb +77 -0
- data/lib/woods/ruby_analyzer/fqn_builder.rb +18 -0
- data/lib/woods/ruby_analyzer/mermaid_renderer.rb +280 -0
- data/lib/woods/ruby_analyzer/method_analyzer.rb +143 -0
- data/lib/woods/ruby_analyzer/trace_enricher.rb +143 -0
- data/lib/woods/ruby_analyzer.rb +87 -0
- data/lib/woods/session_tracer/file_store.rb +104 -0
- data/lib/woods/session_tracer/middleware.rb +143 -0
- data/lib/woods/session_tracer/redis_store.rb +106 -0
- data/lib/woods/session_tracer/session_flow_assembler.rb +254 -0
- data/lib/woods/session_tracer/session_flow_document.rb +223 -0
- data/lib/woods/session_tracer/solid_cache_store.rb +139 -0
- data/lib/woods/session_tracer/store.rb +81 -0
- data/lib/woods/storage/graph_store.rb +120 -0
- data/lib/woods/storage/metadata_store.rb +196 -0
- data/lib/woods/storage/pgvector.rb +195 -0
- data/lib/woods/storage/qdrant.rb +205 -0
- data/lib/woods/storage/vector_store.rb +167 -0
- data/lib/woods/temporal/json_snapshot_store.rb +245 -0
- data/lib/woods/temporal/snapshot_store.rb +345 -0
- data/lib/woods/token_utils.rb +19 -0
- data/lib/woods/version.rb +5 -0
- data/lib/woods.rb +246 -0
- metadata +270 -0
|
@@ -0,0 +1,345 @@
|
|
|
1
|
+
# frozen_string_literal: true

require 'json'
require 'time'

module Woods
  module Temporal
    # SnapshotStore captures and queries temporal snapshots of extraction runs.
    #
    # Each snapshot is anchored to a git commit SHA and stores per-unit content
    # hashes for efficient diff computation. Full source is not duplicated —
    # only hashes of source, metadata, and dependencies are stored per snapshot.
    #
    # @example Capturing a snapshot
    #   store = SnapshotStore.new(connection: db)
    #   store.capture(manifest, unit_hashes)
    #
    # @example Comparing snapshots
    #   diff = store.diff("abc123", "def456")
    #   diff[:added]    # => [{ identifier: "NewModel", ... }]
    #   diff[:modified] # => [{ identifier: "User", ... }]
    #   diff[:deleted]  # => [{ identifier: "OldService", ... }]
    #
    class SnapshotStore # rubocop:disable Metrics/ClassLength
      # @param connection [Object] Database connection supporting #execute,
      #   #get_first_row, #get_first_value, and #transaction (SQLite3-style API —
      #   all four are called by this class, not just the first two)
      def initialize(connection:)
        @db = connection
      end

      # Capture a snapshot after extraction completes.
      #
      # Stores the manifest metadata and per-unit content hashes.
      # Computes diff stats vs. the most recent previous snapshot.
      #
      # NOTE(review): `find_latest` runs before the upsert, so re-capturing the
      # SHA that is already the latest snapshot diffs it against itself and
      # records zero added/modified/deleted — confirm this is intended.
      #
      # @param manifest [Hash] The manifest data (string or symbol keys)
      # @param unit_hashes [Array<Hash>] Per-unit content hashes
      # @return [Hash, nil] Snapshot record with diff stats, or nil when the
      #   manifest carries no git_sha (nothing to anchor the snapshot to)
      def capture(manifest, unit_hashes)
        git_sha = mget(manifest, 'git_sha')
        return nil unless git_sha

        previous = find_latest
        upsert_snapshot(manifest, git_sha, unit_hashes.size)

        snapshot_id = fetch_snapshot_id(git_sha)
        # Re-capturing the same SHA replaces its unit rows wholesale.
        @db.execute('DELETE FROM woods_snapshot_units WHERE snapshot_id = ?', [snapshot_id])
        insert_unit_hashes(snapshot_id, unit_hashes)

        update_diff_stats(snapshot_id, previous)
        find(git_sha)
      end

      # List snapshots, optionally filtered by branch.
      #
      # @param limit [Integer] Max results (default 20)
      # @param branch [String, nil] Filter by branch name
      # @return [Array<Hash>] Snapshot summaries sorted by extracted_at descending
      def list(limit: 20, branch: nil)
        rows = if branch
                 @db.execute(
                   'SELECT * FROM woods_snapshots WHERE git_branch = ? ORDER BY extracted_at DESC LIMIT ?',
                   [branch, limit]
                 )
               else
                 @db.execute(
                   'SELECT * FROM woods_snapshots ORDER BY extracted_at DESC LIMIT ?',
                   [limit]
                 )
               end

        rows.map { |row| row_to_hash(row) }
      end

      # Find a specific snapshot by git SHA.
      #
      # @param git_sha [String]
      # @return [Hash, nil] Snapshot metadata or nil if not found
      def find(git_sha)
        row = @db.get_first_row('SELECT * FROM woods_snapshots WHERE git_sha = ?', [git_sha])
        return nil unless row

        row_to_hash(row)
      end

      # Compute diff between two snapshots.
      #
      # @param sha_a [String] Before snapshot git SHA
      # @param sha_b [String] After snapshot git SHA
      # @return [Hash] {added: [...], modified: [...], deleted: [...]} — empty
      #   lists when either SHA has no stored snapshot
      def diff(sha_a, sha_b)
        id_a = fetch_snapshot_id(sha_a)
        id_b = fetch_snapshot_id(sha_b)

        return { added: [], modified: [], deleted: [] } unless id_a && id_b

        units_a = load_snapshot_units(id_a)
        units_b = load_snapshot_units(id_b)

        compute_diff(units_a, units_b)
      end

      # History of a single unit across snapshots.
      #
      # @param identifier [String] Unit identifier
      # @param limit [Integer] Max snapshots to return (default 20)
      # @return [Array<Hash>] Entries with git_sha, extracted_at, source_hash, changed flag
      def unit_history(identifier, limit: 20)
        rows = @db.execute(<<~SQL, [identifier, limit])
          SELECT su.source_hash, su.metadata_hash, su.dependencies_hash, su.unit_type,
                 s.git_sha, s.extracted_at, s.git_branch
          FROM woods_snapshot_units su
          JOIN woods_snapshots s ON s.id = su.snapshot_id
          WHERE su.identifier = ?
          ORDER BY s.extracted_at DESC
          LIMIT ?
        SQL

        entries = rows.map { |row| history_entry_from_row(row) }
        mark_changed_entries(entries)
      end

      private

      # Build a history entry hash from a database row.
      #
      # @param row [Hash]
      # @return [Hash]
      def history_entry_from_row(row)
        {
          git_sha: row['git_sha'],
          extracted_at: row['extracted_at'],
          git_branch: row['git_branch'],
          unit_type: row['unit_type'],
          source_hash: row['source_hash'],
          metadata_hash: row['metadata_hash'],
          dependencies_hash: row['dependencies_hash']
        }
      end

      # Mark changed flag on history entries by comparing source hashes.
      #
      # Entries are ordered newest-first (see unit_history's ORDER BY), so each
      # entry is compared against the chronologically earlier entry at i + 1.
      #
      # @param entries [Array<Hash>]
      # @return [Array<Hash>] the same array, mutated in place
      def mark_changed_entries(entries)
        entries.each_with_index do |entry, i|
          entry[:changed] = if i == entries.size - 1
                              true # Oldest version is always "changed" (first appearance)
                            else
                              entry[:source_hash] != entries[i + 1][:source_hash]
                            end
        end
        entries
      end

      # Get a value from a hash that may have string or symbol keys.
      #
      # NOTE: a stored false value falls through to the symbol key because of
      # the || — fine here since manifest values are strings/numbers/hashes.
      #
      # @param hash [Hash]
      # @param key [String]
      # @return [Object, nil]
      def mget(hash, key)
        hash[key] || hash[key.to_sym]
      end

      # Insert or replace the snapshot row from manifest data.
      #
      # @param manifest [Hash]
      # @param git_sha [String]
      # @param default_total [Integer] fallback when the manifest lacks total_units
      # @return [void]
      def upsert_snapshot(manifest, git_sha, default_total)
        params = [
          git_sha,
          mget(manifest, 'git_branch'),
          mget(manifest, 'extracted_at') || Time.now.iso8601,
          mget(manifest, 'rails_version'),
          mget(manifest, 'ruby_version'),
          mget(manifest, 'total_units') || default_total,
          JSON.generate(mget(manifest, 'counts') || {}),
          mget(manifest, 'gemfile_lock_sha'),
          mget(manifest, 'schema_sha')
        ]
        @db.execute(<<~SQL, params)
          INSERT OR REPLACE INTO woods_snapshots
            (git_sha, git_branch, extracted_at, rails_version, ruby_version,
             total_units, unit_counts, gemfile_lock_sha, schema_sha)
          VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
        SQL
      end

      # Update a snapshot's diff stats vs. a previous snapshot.
      #
      # @param snapshot_id [Integer]
      # @param previous [Hash, nil]
      # @return [void]
      def update_diff_stats(snapshot_id, previous)
        diff_stats = compute_diff_stats(snapshot_id, previous)
        @db.execute(
          'UPDATE woods_snapshots SET units_added = ?, units_modified = ?, units_deleted = ? WHERE id = ?',
          [diff_stats[:added], diff_stats[:modified], diff_stats[:deleted], snapshot_id]
        )
      end

      # Find the most recent snapshot.
      #
      # @return [Hash, nil]
      def find_latest
        row = @db.get_first_row('SELECT * FROM woods_snapshots ORDER BY extracted_at DESC LIMIT 1')
        return nil unless row

        row_to_hash(row)
      end

      # Fetch a snapshot's ID by git SHA.
      #
      # @param git_sha [String]
      # @return [Integer, nil]
      def fetch_snapshot_id(git_sha)
        @db.get_first_value('SELECT id FROM woods_snapshots WHERE git_sha = ?', [git_sha])
      end

      # Insert per-unit hash records for a snapshot.
      #
      # @param snapshot_id [Integer]
      # @param unit_hashes [Array<Hash>]
      # @return [void]
      def insert_unit_hashes(snapshot_id, unit_hashes)
        sql = <<~SQL
          INSERT INTO woods_snapshot_units
            (snapshot_id, identifier, unit_type, source_hash, metadata_hash, dependencies_hash)
          VALUES (?, ?, ?, ?, ?, ?)
        SQL

        # Wrap in a transaction to batch all inserts into a single commit,
        # reducing per-row fsync overhead from O(n) to O(1).
        @db.transaction do
          unit_hashes.each do |uh|
            params = [
              snapshot_id,
              mget(uh, 'identifier'),
              mget(uh, 'type').to_s,
              mget(uh, 'source_hash'),
              mget(uh, 'metadata_hash'),
              mget(uh, 'dependencies_hash')
            ]
            @db.execute(sql, params)
          end
        end
      end

      # Load all unit records for a snapshot as a hash keyed by identifier.
      #
      # @param snapshot_id [Integer]
      # @return [Hash{String => Hash}]
      def load_snapshot_units(snapshot_id)
        sql = <<~SQL
          SELECT identifier, unit_type, source_hash, metadata_hash, dependencies_hash
          FROM woods_snapshot_units WHERE snapshot_id = ?
        SQL
        rows = @db.execute(sql, [snapshot_id])

        rows.to_h do |row|
          [row['identifier'], {
            unit_type: row['unit_type'],
            source_hash: row['source_hash'],
            metadata_hash: row['metadata_hash'],
            dependencies_hash: row['dependencies_hash']
          }]
        end
      end

      # Compute diff between two sets of unit hashes.
      #
      # A unit counts as modified when ANY of its three hashes differ.
      #
      # @param units_a [Hash{String => Hash}] Before
      # @param units_b [Hash{String => Hash}] After
      # @return [Hash] {added: [...], modified: [...], deleted: [...]}
      def compute_diff(units_a, units_b) # rubocop:disable Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
        added = []
        modified = []
        deleted = []

        # Units in B but not A → added
        # Units in both → check for modifications
        units_b.each do |identifier, data_b|
          if units_a.key?(identifier)
            data_a = units_a[identifier]
            if data_a[:source_hash] != data_b[:source_hash] ||
               data_a[:metadata_hash] != data_b[:metadata_hash] ||
               data_a[:dependencies_hash] != data_b[:dependencies_hash]
              modified << { identifier: identifier, unit_type: data_b[:unit_type] }
            end
          else
            added << { identifier: identifier, unit_type: data_b[:unit_type] }
          end
        end

        # Units in A but not B → deleted
        units_a.each do |identifier, data_a|
          deleted << { identifier: identifier, unit_type: data_a[:unit_type] } unless units_b.key?(identifier)
        end

        { added: added, modified: modified, deleted: deleted }
      end

      # Compute aggregate diff stats.
      #
      # @param current_snapshot_id [Integer]
      # @param previous_snapshot [Hash, nil]
      # @return [Hash] {added:, modified:, deleted:} — all zeros when there is
      #   no previous snapshot to compare against
      def compute_diff_stats(current_snapshot_id, previous_snapshot)
        return { added: 0, modified: 0, deleted: 0 } unless previous_snapshot

        prev_id = fetch_snapshot_id(previous_snapshot[:git_sha])
        return { added: 0, modified: 0, deleted: 0 } unless prev_id

        units_prev = load_snapshot_units(prev_id)
        units_curr = load_snapshot_units(current_snapshot_id)

        result = compute_diff(units_prev, units_curr)
        { added: result[:added].size, modified: result[:modified].size, deleted: result[:deleted].size }
      end

      # Convert a database row to a normalized hash.
      #
      # @param row [Hash] SQLite3 result row (string-keyed; requires
      #   results_as_hash-style rows — TODO confirm connection is configured so)
      # @return [Hash]
      def row_to_hash(row)
        {
          id: row['id'],
          git_sha: row['git_sha'],
          git_branch: row['git_branch'],
          extracted_at: row['extracted_at'],
          rails_version: row['rails_version'],
          ruby_version: row['ruby_version'],
          total_units: row['total_units'],
          unit_counts: row['unit_counts'] ? JSON.parse(row['unit_counts']) : {},
          gemfile_lock_sha: row['gemfile_lock_sha'],
          schema_sha: row['schema_sha'],
          units_added: row['units_added'],
          units_modified: row['units_modified'],
          units_deleted: row['units_deleted']
        }
      end
    end
  end
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Woods
  # Shared token estimation utility.
  #
  # Uses project convention: one token per 4 characters, rounded up.
  # See docs/TOKEN_BENCHMARK.md — conservative floor (~10.6% overestimate).
  module TokenUtils
    module_function

    # Estimate token count for a string.
    #
    # @param text [String] Text to estimate
    # @return [Integer] Estimated token count (ceiling of length / 4)
    def estimate_tokens(text)
      text.length.fdiv(4).ceil
    end
  end
end
|
data/lib/woods.rb
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
# frozen_string_literal: true

# Woods - Rails Codebase Indexing and Retrieval
#
# A system for extracting, indexing, and retrieving context from Rails codebases
# to enable AI-assisted development, debugging, and analytics.
#
# ## Quick Start
#
#   # Extract codebase
#   Woods.extract!
#
#   # Or via rake
#   bundle exec rake woods:extract
#
# ## Configuration
#
#   Woods.configure do |config|
#     config.output_dir = Rails.root.join("tmp/woods")
#     config.max_context_tokens = 8000
#     config.include_framework_sources = true
#   end
#
require_relative 'woods/version'

module Woods
  class Error < StandardError; end
  class ConfigurationError < Error; end
  class ExtractionError < Error; end
  class SessionTracerError < Error; end

  # Guards all reads/writes of Woods.configuration during configure calls.
  CONFIG_MUTEX = Mutex.new

  # ════════════════════════════════════════════════════════════════════════
  # Configuration
  # ════════════════════════════════════════════════════════════════════════

  class Configuration
    # Plain read/write settings; validated settings below get custom writers.
    # NOTE(review): :log_level is exposed here but never given a default in
    # #initialize, so it starts as nil — confirm consumers handle that.
    attr_accessor :embedding_model, :include_framework_sources, :gem_configs,
                  :vector_store, :metadata_store, :graph_store, :embedding_provider, :log_level,
                  :vector_store_options, :metadata_store_options, :embedding_options,
                  :concurrent_extraction, :precompute_flows, :enable_snapshots,
                  :session_tracer_enabled, :session_store, :session_id_proc, :session_exclude_paths,
                  :console_mcp_enabled, :console_mcp_path, :console_redacted_columns,
                  :notion_api_token, :notion_database_ids,
                  :cache_store, :cache_options
    # Readers only — the matching writers below validate their input.
    attr_reader :max_context_tokens, :similarity_threshold, :extractors, :pretty_json, :context_format,
                :cache_enabled

    def initialize # rubocop:disable Metrics/MethodLength
      @output_dir = nil # Resolved lazily; Rails.root is nil at require time
      @embedding_model = 'text-embedding-3-small'
      @max_context_tokens = 8000
      @similarity_threshold = 0.7
      @include_framework_sources = true
      @gem_configs = {}
      @extractors = %i[models controllers services components view_components jobs mailers graphql serializers
                       managers policies validators rails_source]
      @pretty_json = true
      @concurrent_extraction = false
      @precompute_flows = false
      @enable_snapshots = false
      @context_format = :markdown
      @session_tracer_enabled = false
      @session_store = nil
      @session_id_proc = nil
      @session_exclude_paths = []
      @console_mcp_enabled = false
      @console_mcp_path = '/mcp/console'
      @console_redacted_columns = []
      @notion_api_token = nil
      @notion_database_ids = {}
      @cache_enabled = false
      @cache_store = nil # :redis, :solid_cache, :memory, or a CacheStore instance
      @cache_options = {} # { redis: client, cache: store, ttl: { embeddings: 86400, ... } }
    end

    # @return [Pathname, String] Output directory, defaulting to Rails.root/tmp/woods
    def output_dir
      @output_dir ||= defined?(Rails) && Rails.root ? Rails.root.join('tmp/woods') : 'tmp/woods'
    end

    # @param value [Object] Must respond to #to_s
    # @raise [ConfigurationError] if value is nil
    def output_dir=(value)
      raise ConfigurationError, 'output_dir cannot be nil' if value.nil?

      @output_dir = value
    end

    # @param value [Integer] Must be a positive Integer
    # @raise [ConfigurationError] if value is not a positive Integer
    def max_context_tokens=(value)
      unless value.is_a?(Integer) && value.positive?
        raise ConfigurationError, "max_context_tokens must be a positive Integer, got #{value.inspect}"
      end

      @max_context_tokens = value
    end

    # @param value [Numeric] Must be between 0.0 and 1.0 inclusive
    # @raise [ConfigurationError] if value is out of range or not numeric
    def similarity_threshold=(value)
      raise ConfigurationError, "similarity_threshold must be Numeric, got #{value.inspect}" unless value.is_a?(Numeric)

      # Stored as a Float even when given an Integer/Rational.
      float_val = value.to_f
      unless float_val.between?(0.0, 1.0)
        raise ConfigurationError, "similarity_threshold must be between 0.0 and 1.0, got #{value.inspect}"
      end

      @similarity_threshold = float_val
    end

    # @param value [Array<Symbol>] List of extractor names
    # @raise [ConfigurationError] if value is not an Array of Symbols
    def extractors=(value)
      unless value.is_a?(Array) && value.all?(Symbol)
        raise ConfigurationError, "extractors must be an Array of Symbols, got #{value.inspect}"
      end

      @extractors = value
    end

    # @param value [Boolean] Must be true or false
    # @raise [ConfigurationError] if value is not a boolean
    def pretty_json=(value)
      validate_boolean!(:pretty_json, value)
      @pretty_json = value
    end

    # @param value [Symbol] Must be one of :claude, :markdown, :plain, :json
    # @raise [ConfigurationError] if value is not a valid format
    def context_format=(value)
      valid = %i[claude markdown plain json]
      unless valid.include?(value)
        raise ConfigurationError, "context_format must be one of #{valid.inspect}, got #{value.inspect}"
      end

      @context_format = value
    end

    # @param value [Boolean] Enable or disable the cache layer
    # @raise [ConfigurationError] if value is not a boolean
    def cache_enabled=(value)
      validate_boolean!(:cache_enabled, value)
      @cache_enabled = value
    end

    # Add a gem to be indexed
    #
    # @param gem_name [String] Name of the gem
    # @param paths [Array<String>] Relative paths within the gem to index
    # @param priority [Symbol] :high, :medium, or :low
    def add_gem(gem_name, paths:, priority: :medium)
      @gem_configs[gem_name] = { paths: paths, priority: priority }
    end

    private

    # Raise unless value is literally true or false (truthiness is not enough).
    def validate_boolean!(name, value)
      return if value.is_a?(TrueClass) || value.is_a?(FalseClass)

      raise ConfigurationError, "#{name} must be true or false, got #{value.inspect}"
    end
  end

  # ════════════════════════════════════════════════════════════════════════
  # Module Interface
  # ════════════════════════════════════════════════════════════════════════

  class << self
    attr_accessor :configuration

    # Create-or-yield the global configuration under the config mutex.
    #
    # NOTE(review): the mutex is held while the block runs — a block that
    # calls Woods.configure re-entrantly would deadlock; confirm no caller
    # does this.
    #
    # @yieldparam config [Configuration]
    # @return [Configuration]
    def configure
      CONFIG_MUTEX.synchronize do
        self.configuration ||= Configuration.new
        yield(configuration) if block_given?
        configuration
      end
    end

    # Configure the module using a named preset and optional block customization.
    #
    # Valid preset names: :local, :postgresql, :production
    #
    # @param name [Symbol] Preset name
    # @yield [config] Optional block for further customization after preset is applied
    # @yieldparam config [Configuration] The configuration object
    # @return [Configuration] The applied configuration
    def configure_with_preset(name)
      CONFIG_MUTEX.synchronize do
        # Unlike #configure, this REPLACES any existing configuration.
        self.configuration = Builder.preset_config(name)
        yield configuration if block_given?
        configuration
      end
    end

    # Build a Retriever wired with adapters from the current configuration.
    #
    # @return [Retriever] A fully wired retriever instance
    def build_retriever
      Builder.new(configuration).build_retriever
    end

    # Retrieve context for a natural language query using the current configuration.
    #
    # @param query [String] Natural language query
    # @param opts [Hash] Options passed through to the retriever (e.g., budget:)
    # @return [Retriever::RetrievalResult] Retrieval result
    def retrieve(query, **opts)
      build_retriever.retrieve(query, **opts)
    end

    # Perform full extraction
    #
    # @param output_dir [String] Override output directory
    # @return [Hash] Extraction results
    def extract!(output_dir: nil)
      # Lazy require: the extractor pulls in heavy dependencies not needed
      # for retrieval-only use.
      require_relative 'woods/extractor'

      dir = output_dir || configuration.output_dir
      extractor = Extractor.new(output_dir: dir)
      extractor.extract_all
    end

    # Perform incremental extraction
    #
    # @param changed_files [Array<String>] List of changed files
    # @return [Array<String>] Re-extracted unit identifiers
    def extract_changed!(changed_files)
      require_relative 'woods/extractor'

      extractor = Extractor.new(output_dir: configuration.output_dir)
      extractor.extract_changed(changed_files)
    end
  end

  # Initialize with defaults
  configure
end

require_relative 'woods/builder'
require_relative 'woods/cost_model'
require_relative 'woods/cache/cache_store'
require_relative 'woods/cache/cache_middleware'
require_relative 'woods/railtie' if defined?(Rails::Railtie)
|