woods 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +169 -0
- data/README.md +20 -8
- data/exe/woods-console +51 -6
- data/exe/woods-console-mcp +24 -4
- data/exe/woods-mcp +30 -7
- data/exe/woods-mcp-http +47 -6
- data/lib/generators/woods/install_generator.rb +13 -4
- data/lib/generators/woods/templates/woods.rb.tt +155 -0
- data/lib/tasks/woods.rake +15 -50
- data/lib/woods/builder.rb +174 -9
- data/lib/woods/cache/cache_middleware.rb +360 -31
- data/lib/woods/chunking/semantic_chunker.rb +334 -7
- data/lib/woods/console/adapters/job_adapter.rb +10 -4
- data/lib/woods/console/audit_logger.rb +76 -4
- data/lib/woods/console/bridge.rb +48 -15
- data/lib/woods/console/bridge_protocol.rb +44 -0
- data/lib/woods/console/confirmation.rb +3 -4
- data/lib/woods/console/console_response_renderer.rb +56 -18
- data/lib/woods/console/credential_index.rb +201 -0
- data/lib/woods/console/credential_scanner.rb +302 -0
- data/lib/woods/console/dispatch_pipeline.rb +138 -0
- data/lib/woods/console/embedded_executor.rb +682 -35
- data/lib/woods/console/eval_guard.rb +319 -0
- data/lib/woods/console/model_validator.rb +1 -3
- data/lib/woods/console/rack_middleware.rb +185 -29
- data/lib/woods/console/redactor.rb +161 -0
- data/lib/woods/console/response_context.rb +127 -0
- data/lib/woods/console/safe_context.rb +220 -23
- data/lib/woods/console/scope_predicate_parser.rb +131 -0
- data/lib/woods/console/server.rb +417 -486
- data/lib/woods/console/sql_noise_stripper.rb +87 -0
- data/lib/woods/console/sql_table_scanner.rb +213 -0
- data/lib/woods/console/sql_validator.rb +81 -31
- data/lib/woods/console/table_gate.rb +93 -0
- data/lib/woods/console/tool_specs.rb +552 -0
- data/lib/woods/console/tools/tier1.rb +3 -3
- data/lib/woods/console/tools/tier4.rb +7 -1
- data/lib/woods/dependency_graph.rb +66 -7
- data/lib/woods/embedding/indexer.rb +190 -6
- data/lib/woods/embedding/openai.rb +40 -4
- data/lib/woods/embedding/provider.rb +104 -8
- data/lib/woods/embedding/text_preparer.rb +23 -3
- data/lib/woods/embedding/token_counter.rb +133 -0
- data/lib/woods/evaluation/baseline_runner.rb +20 -2
- data/lib/woods/evaluation/metrics.rb +4 -1
- data/lib/woods/extracted_unit.rb +1 -0
- data/lib/woods/extractor.rb +7 -1
- data/lib/woods/extractors/controller_extractor.rb +6 -0
- data/lib/woods/extractors/mailer_extractor.rb +16 -2
- data/lib/woods/extractors/model_extractor.rb +6 -1
- data/lib/woods/extractors/phlex_extractor.rb +13 -4
- data/lib/woods/extractors/rails_source_extractor.rb +2 -0
- data/lib/woods/extractors/route_helper_resolver.rb +130 -0
- data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
- data/lib/woods/extractors/view_component_extractor.rb +12 -1
- data/lib/woods/extractors/view_engines/base.rb +141 -0
- data/lib/woods/extractors/view_engines/erb.rb +145 -0
- data/lib/woods/extractors/view_template_extractor.rb +92 -133
- data/lib/woods/flow_assembler.rb +23 -15
- data/lib/woods/flow_precomputer.rb +21 -2
- data/lib/woods/graph_analyzer.rb +3 -4
- data/lib/woods/index_artifact.rb +173 -0
- data/lib/woods/mcp/bearer_auth.rb +45 -0
- data/lib/woods/mcp/bootstrap_state.rb +94 -0
- data/lib/woods/mcp/bootstrapper.rb +337 -16
- data/lib/woods/mcp/config_resolver.rb +288 -0
- data/lib/woods/mcp/errors.rb +134 -0
- data/lib/woods/mcp/index_reader.rb +265 -30
- data/lib/woods/mcp/origin_guard.rb +132 -0
- data/lib/woods/mcp/provider_probe.rb +166 -0
- data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
- data/lib/woods/mcp/renderers/markdown_renderer.rb +39 -3
- data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
- data/lib/woods/mcp/server.rb +737 -137
- data/lib/woods/model_name_cache.rb +78 -2
- data/lib/woods/notion/client.rb +25 -2
- data/lib/woods/notion/mappers/model_mapper.rb +36 -2
- data/lib/woods/railtie.rb +55 -15
- data/lib/woods/resilience/circuit_breaker.rb +9 -2
- data/lib/woods/resilience/retryable_provider.rb +40 -3
- data/lib/woods/resolved_config.rb +299 -0
- data/lib/woods/retrieval/context_assembler.rb +112 -5
- data/lib/woods/retrieval/query_classifier.rb +1 -1
- data/lib/woods/retrieval/ranker.rb +55 -6
- data/lib/woods/retrieval/search_executor.rb +42 -13
- data/lib/woods/retriever.rb +330 -24
- data/lib/woods/session_tracer/middleware.rb +35 -1
- data/lib/woods/storage/graph_store.rb +39 -0
- data/lib/woods/storage/inapplicable_backend.rb +14 -0
- data/lib/woods/storage/metadata_store.rb +129 -1
- data/lib/woods/storage/pgvector.rb +70 -8
- data/lib/woods/storage/qdrant.rb +196 -5
- data/lib/woods/storage/snapshotter/metadata.rb +172 -0
- data/lib/woods/storage/snapshotter/vector.rb +238 -0
- data/lib/woods/storage/snapshotter.rb +24 -0
- data/lib/woods/storage/vector_store.rb +184 -35
- data/lib/woods/tasks.rb +85 -0
- data/lib/woods/temporal/snapshot_store.rb +49 -1
- data/lib/woods/token_utils.rb +44 -5
- data/lib/woods/unblocked/client.rb +1 -1
- data/lib/woods/unblocked/document_builder.rb +35 -10
- data/lib/woods/unblocked/exporter.rb +1 -1
- data/lib/woods/util/host_guard.rb +61 -0
- data/lib/woods/version.rb +1 -1
- data/lib/woods.rb +126 -6
- metadata +69 -4
|
@@ -29,28 +29,76 @@ module Woods
|
|
|
29
29
|
end
|
|
30
30
|
end
|
|
31
31
|
|
|
32
|
+
# Class-like unit types that MethodChunker handles — anything
|
|
33
|
+
# shaped as "class or module with public methods, maybe privates,
|
|
34
|
+
# maybe callbacks/filters." Extend this list when new extractors
|
|
35
|
+
# produce units of similar structure.
|
|
36
|
+
METHOD_CHUNKABLE_TYPES = %i[
|
|
37
|
+
service job mailer concern policy pundit_policy serializer
|
|
38
|
+
decorator presenter interactor query_object value_object
|
|
39
|
+
component view_component action_cable_channel channel
|
|
40
|
+
graphql_resolver graphql_type helper validator api_client poro
|
|
41
|
+
manager configuration
|
|
42
|
+
].freeze
|
|
43
|
+
|
|
32
44
|
# Splits ExtractedUnits into semantic chunks based on unit type.
|
|
33
45
|
#
|
|
34
46
|
# Models are split by: summary, associations, validations, callbacks,
|
|
35
47
|
# scopes, methods. Controllers are split by: summary (filters), per-action.
|
|
36
|
-
#
|
|
48
|
+
# Class-like types (services, jobs, mailers, concerns, policies, …) split
|
|
49
|
+
# by summary + per-public-method + bundled privates via MethodChunker.
|
|
50
|
+
# Other types stay whole.
|
|
51
|
+
#
|
|
52
|
+
# Any chunk that still exceeds `max_chars` after semantic splitting is
|
|
53
|
+
# sliced into line-balanced sub-chunks so no single chunk is ever larger
|
|
54
|
+
# than the embedding provider's input budget.
|
|
37
55
|
#
|
|
38
56
|
# Units below the token threshold are returned as a single :whole chunk.
|
|
39
57
|
#
|
|
40
58
|
# @example
|
|
41
|
-
# chunker = SemanticChunker.new(threshold: 200)
|
|
59
|
+
# chunker = SemanticChunker.new(threshold: 200, max_chars: 20_480)
|
|
42
60
|
# chunks = chunker.chunk(extracted_unit)
|
|
43
61
|
# chunks.map(&:chunk_type) # => [:summary, :associations, :validations, :methods]
|
|
44
62
|
#
|
|
45
|
-
class SemanticChunker
|
|
63
|
+
class SemanticChunker # rubocop:disable Metrics/ClassLength
|
|
46
64
|
# Default token threshold below which units stay whole.
|
|
47
65
|
DEFAULT_THRESHOLD = 200
|
|
48
66
|
|
|
67
|
+
# Minimum chars-per-slice budget during tokenizer-driven recursive
|
|
68
|
+
# splitting. Prevents unbounded halving on pathological content
|
|
69
|
+
# (e.g., a single 2000-char regex line that tokenizes into 6000
|
|
70
|
+
# tokens because BERT WordPiece fragments every `\w+` boundary).
|
|
71
|
+
MIN_SLICE_CHARS = 256
|
|
72
|
+
private_constant :MIN_SLICE_CHARS
|
|
73
|
+
|
|
49
74
|
# Build a chunker, optionally with a hard size ceiling for its output.
#
# @param threshold [Integer] Token count at or below which a unit is
#   returned as a single :whole chunk
# @param max_chars [Integer, nil] Character ceiling for any single chunk;
#   `nil` leaves the character-based check switched off
# @param token_counter [Woods::Embedding::TokenCounter, nil] Tokenizer used
#   for oversize detection; only consulted when `max_tokens` is also given
# @param max_tokens [Integer, nil] Token budget paired with `token_counter`
def initialize(threshold: DEFAULT_THRESHOLD, max_chars: nil, token_counter: nil, max_tokens: nil)
  @max_tokens = max_tokens
  @token_counter = token_counter
  @max_chars = max_chars
  @threshold = threshold
end
|
|
53
92
|
|
|
93
|
+
# @return [Woods::Embedding::TokenCounter, nil]
|
|
94
|
+
attr_reader :token_counter
|
|
95
|
+
|
|
96
|
+
# @return [Integer, nil]
|
|
97
|
+
attr_reader :max_tokens
|
|
98
|
+
|
|
99
|
+
# @return [Integer, nil]
|
|
100
|
+
attr_reader :max_chars
|
|
101
|
+
|
|
54
102
|
# Split an ExtractedUnit into semantic chunks.
|
|
55
103
|
#
|
|
56
104
|
# @param unit [ExtractedUnit] The unit to chunk
|
|
@@ -59,15 +107,76 @@ module Woods
|
|
|
59
107
|
return [] if unit.source_code.nil? || unit.source_code.strip.empty?
|
|
60
108
|
return [build_whole_chunk(unit)] if unit.estimated_tokens <= @threshold
|
|
61
109
|
|
|
110
|
+
enforce_char_limit(chunks_for(unit), unit)
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
# Apply the active size budget to chunks a unit already carries (hashes
# with a `content` entry). Each oversize chunk is replaced by its
# `_part_N` pieces; chunks within budget are kept as they are.
#
# Does nothing when no budget is configured, or when `unit.chunks` is nil
# or empty.
#
# @param unit [ExtractedUnit]
# @return [void]
def enforce_chunk_limits!(unit)
  return unless enforcement_active?

  existing = unit.chunks
  return if existing.nil? || existing.empty?

  unit.chunks = existing.flat_map { |entry| split_oversize_hash_chunk(entry) }
end
|
|
132
|
+
|
|
133
|
+
private
|
|
134
|
+
|
|
135
|
+
# Truthy when a character ceiling is set, or when both a token counter
# and a token budget are set.
def enforcement_active?
  return @max_chars if @max_chars

  @token_counter && @max_tokens
end
|
|
140
|
+
|
|
141
|
+
# Decide whether `content` exceeds the active budget. With a tokenizer
# wired in, the token count is the only test; otherwise the character
# ceiling is used. nil and empty content are never oversize.
#
# @param content [String, nil]
# @return [Boolean, nil] nil when neither budget is configured
def oversize?(content)
  return false if content.nil? || content.empty?
  return @max_chars && content.length > @max_chars unless tokenizer_active?

  @token_counter.count(content) > @max_tokens
end
|
|
148
|
+
|
|
149
|
+
# Truthy only when both a token counter and a token budget were supplied.
def tokenizer_active?
  @token_counter ? @max_tokens : @token_counter
end
|
|
152
|
+
|
|
153
|
+
# Split one hash-style chunk if its content is over budget.
#
# The content and chunk type are read from symbol keys first, then string
# keys; a missing chunk type is treated as :whole. Replacement hashes
# carry only `:content` and a `<type>_part_<index>` `:chunk_type`.
#
# @param chunk [Hash] a unit-chunk hash (symbol or string keys)
# @return [Array<Hash>] `[chunk]` untouched when it fits or has no content
def split_oversize_hash_chunk(chunk)
  body = chunk[:content] || chunk['content']
  return [chunk] if body.nil? || !oversize?(body)

  base_type = chunk[:chunk_type] || chunk['chunk_type'] || :whole
  verified_slices(body).map.with_index do |piece, position|
    { content: piece, chunk_type: :"#{base_type}_part_#{position}" }
  end
end
|
|
164
|
+
|
|
165
|
+
# Pick the chunker that matches the unit's type: models and controllers
# have dedicated chunkers, every type listed in METHOD_CHUNKABLE_TYPES goes
# through MethodChunker, and anything else stays a single whole chunk.
#
# @param unit [ExtractedUnit]
# @return [Array<Chunk>]
def chunks_for(unit)
  case unit.type
  when :model then ModelChunker.new(unit).chunk
  when :controller then ControllerChunker.new(unit).chunk
  when *METHOD_CHUNKABLE_TYPES then MethodChunker.new(unit).chunk
  else [build_whole_chunk(unit)]
  end
end
|
|
68
179
|
|
|
69
|
-
private
|
|
70
|
-
|
|
71
180
|
# Build a single :whole chunk for small units.
|
|
72
181
|
#
|
|
73
182
|
# @param unit [ExtractedUnit]
|
|
@@ -80,6 +189,107 @@ module Woods
|
|
|
80
189
|
parent_type: unit.type
|
|
81
190
|
)
|
|
82
191
|
end
|
|
192
|
+
|
|
193
|
+
# Run every chunk through the oversize splitter whenever a budget is
# active, whether that budget is the character ceiling or the token
# counter. Without any budget the input array is returned as given.
#
# @param chunks [Array<Chunk>]
# @param unit [ExtractedUnit]
# @return [Array<Chunk>]
def enforce_char_limit(chunks, unit)
  if enforcement_active?
    chunks.each_with_object([]) do |chunk, collected|
      collected.concat(split_oversize_chunk(chunk, unit))
    end
  else
    chunks
  end
end
|
|
211
|
+
|
|
212
|
+
# Split one Chunk if its content is over budget. Each piece becomes a new
# Chunk typed `<original_type>_part_<index>` and parented to `unit`.
#
# @param chunk [Chunk]
# @param unit [ExtractedUnit]
# @return [Array<Chunk>] `[chunk]` itself when it already fits
def split_oversize_chunk(chunk, unit)
  text = chunk.content
  return [chunk] unless oversize?(text)

  verified_slices(text).map.with_index do |piece, position|
    Chunk.new(
      content: piece,
      chunk_type: :"#{chunk.chunk_type}_part_#{position}",
      parent_identifier: unit.identifier,
      parent_type: unit.type
    )
  end
end
|
|
227
|
+
|
|
228
|
+
# Cut content into line-based slices under the character limit
# (`@max_chars`, or the estimated budget when that is unset). When a
# tokenizer is active, each slice is then passed to `verify_slice`, which
# re-splits any piece whose token count is still over budget.
#
# @param content [String]
# @return [Array<String>]
def verified_slices(content)
  char_limit = @max_chars || estimated_char_budget
  pieces = slice_by_lines(content, char_limit)

  if tokenizer_active?
    pieces.flat_map { |piece| verify_slice(piece, char_limit) }
  else
    pieces
  end
end
|
|
243
|
+
|
|
244
|
+
# Check one slice against the token budget and, if it is still too big,
# re-slice it at half the character limit and check the pieces again.
# Halving stops once the next limit would drop below MIN_SLICE_CHARS; the
# slice is then returned as-is even though it is over budget.
#
# @param slice [String]
# @param char_limit [Integer] limit the slice was produced with
# @return [Array<String>]
def verify_slice(slice, char_limit)
  return [slice] if @token_counter.count(slice) <= @max_tokens

  halved = char_limit / 2
  if halved < MIN_SLICE_CHARS
    [slice]
  else
    slice_by_lines(slice, halved).flat_map { |piece| verify_slice(piece, halved) }
  end
end
|
|
260
|
+
|
|
261
|
+
# Starting character limit for slicing when no `max_chars` was given:
# two characters per budgeted token, or 0 when there is no token budget.
# `verify_slice` halves the limit further for slices that still tokenize
# over budget.
#
# @return [Integer]
def estimated_char_budget
  @max_tokens ? @max_tokens * 2 : 0
end
|
|
269
|
+
|
|
270
|
+
# Greedy line-based slicing that respects a supplied `limit`.
#
# Whole lines are packed into the current slice until the next one would
# push it past `limit`. A single line longer than `limit` is cut into
# `limit`-sized pieces mid-line; no characters are dropped, but the cut
# points are arbitrary.
#
# The cut uses offset slicing rather than an interpolated
# `/.{1,limit}/m` pattern: the regex engine rejects repeat counts above
# 100000, so a large `limit` used to raise RegexpError on the first
# over-long line.
#
# @param content [String]
# @param limit [Integer] maximum characters per slice
# @return [Array<String>]
# @raise [ArgumentError] if a line must be cut and `limit` is not positive
def slice_by_lines(content, limit = @max_chars)
  slices = []
  current = String.new
  content.each_line do |line|
    parts = line.length > limit ? hard_cut_line(line, limit) : [line]
    parts.each do |part|
      if !current.empty? && current.length + part.length > limit
        slices << current
        current = String.new
      end
      current << part
    end
  end
  slices << current unless current.empty?
  slices
end

# Cut one over-long line into consecutive pieces of at most `limit` chars.
#
# @param line [String]
# @param limit [Integer]
# @return [Array<String>]
def hard_cut_line(line, limit)
  raise ArgumentError, "limit must be positive (got #{limit.inspect})" unless limit.positive?

  (0...line.length).step(limit).map { |offset| line[offset, limit] }
end
|
|
83
293
|
end
|
|
84
294
|
|
|
85
295
|
# Chunks a model unit by semantic sections: summary, associations,
|
|
@@ -291,5 +501,122 @@ module Woods
|
|
|
291
501
|
chunks
|
|
292
502
|
end
|
|
293
503
|
end
|
|
504
|
+
|
|
505
|
+
# Generic method-aware chunker for class-like unit types.
#
# Splits into:
#   - `:summary` — every class-body line outside a method body that
#     appears before the first privacy marker (class/module declaration,
#     includes, constants, DSL calls, comments).
#   - `:method_<name>` — one chunk per public method name. Definitions
#     that share a captured name (`save` / `save!`, `foo` / `self.foo`)
#     are appended to the same chunk rather than overwriting each other.
#   - `:private_methods` — everything from the first privacy marker on,
#     bundled together.
#
# Method boundaries come from a line-oriented depth counter, not a
# parser. The counter treats statement-leading `if`/`unless`/`while`/
# `until`/`case`/`begin`/`for`/`def`/`class`/`module`, `= if`-style
# assignments and a trailing `do` as openers, and any line whose first
# token is `end` as a closer. Comment-only lines are ignored. Heredoc
# bodies and string contents are not understood and can still confuse it.
#
# @api private
class MethodChunker
  include ChunkBuilder

  # Constructs that are closed by their own `end`. Matched against a
  # stripped line, so `\A` is the first non-blank character. `(?!:)`
  # keeps hash keys such as `class:` or `if:` from counting.
  BLOCK_OPENER = /
    \A(?:if|unless|while|until|case|begin|for|def|class|module)\b(?!:)
    | =\s*(?:if|unless|case|begin|while|until)\b
    | \bdo\b\s*(?:\|[^|]*\|)?\s*(?:\#.*)?\z
  /x

  # A stripped line whose first token is `end` (`end`, `end.freeze`,
  # `end)`, `end # note`) but not the hash key `end:`.
  BLOCK_CLOSER = /\Aend\b(?!:)/

  # `def name = expr` / `def name(args) = expr`. A setter (`def name=(v)`)
  # has no whitespace before `=`, so it does not match.
  ENDLESS_DEF = /\A\s*def\s+(?:self\.)?\w+[?!]?\s*(?:\([^)]*\))?\s+=\s/

  # Tail of a definition written on one line: `def foo; end`, `def foo() end`.
  INLINE_DEF_END = /(?:;|\s|\))end\z/

  private_constant :BLOCK_OPENER, :BLOCK_CLOSER, :ENDLESS_DEF, :INLINE_DEF_END

  # @param unit [ExtractedUnit]
  def initialize(unit)
    @unit = unit
  end

  # @return [Array<Chunk>]
  def chunk
    state = parse_lines(@unit.source_code.lines)
    build_chunks(state).reject(&:empty?)
  end

  private

  # Parse lines into summary + per-public-method + private buffers.
  #
  # @param lines [Array<String>]
  # @return [Hash]
  def parse_lines(lines)
    state = {
      summary: [], methods: {}, private_methods: [],
      current_method: nil, depth: 0, in_private: false
    }
    lines.each do |line|
      if state[:current_method]
        track_method_line(state, line)
      else
        classify_top_level_line(state, line)
      end
    end

    state
  end

  # While inside a method body, collect every line and keep the nesting
  # depth current; the method closes when the depth returns to zero.
  def track_method_line(state, line)
    target = state[:in_private] ? state[:private_methods] : state[:methods][state[:current_method]]
    target << line

    state[:depth] += depth_delta(line)
    close_method(state) if state[:depth] <= 0
  end

  # Net nesting change contributed by one line: +1 for an opener, -1 for
  # a closer, 0 when neither (or both, as in `end.each do |x|`).
  #
  # @param line [String]
  # @return [Integer]
  def depth_delta(line)
    code = line.strip
    return 0 if code.empty? || code.start_with?('#')

    closes = code.match?(BLOCK_CLOSER)
    # Look for an opener only in what follows a leading `end`, and leave
    # that remainder unstripped so `end while x` is not read as a new loop.
    rest = closes ? code.sub(BLOCK_CLOSER, '') : code
    # An opener with an `end` on the same line is a one-liner: net zero.
    opens = rest.match?(BLOCK_OPENER) && !rest.match?(/\bend\b/)

    (opens ? 1 : 0) - (closes ? 1 : 0)
  end

  # Classify a class-body line: privacy marker, method start, or
  # summary content (DSL calls, attrs, comments, includes).
  def classify_top_level_line(state, line)
    if line.match?(PRIVATE_PATTERN)
      state[:in_private] = true
      state[:private_methods] << line
    elsif line.match?(METHOD_PATTERN)
      start_method(state, line)
    elsif state[:in_private]
      state[:private_methods] << line
    else
      state[:summary] << line
    end
  end

  # Begin collecting a method. Public methods get (or reuse) a buffer
  # keyed by name, which keeps chunks in source order; private ones go to
  # the shared private buffer.
  def start_method(state, line)
    method_name = extract_method_name(line)
    state[:current_method] = method_name
    state[:depth] = 1

    buffer = state[:in_private] ? state[:private_methods] : (state[:methods][method_name] ||= [])
    buffer << line

    # One-line and endless definitions have no separate closing `end`.
    close_method(state) if single_line_definition?(line)
  end

  # Name used for the chunk type. Falls back to the raw token after `def`
  # for operator methods (`==`, `[]`, `<=>`), so the name is never nil; a
  # nil name would make the parser treat the body as top-level lines.
  #
  # @param line [String]
  # @return [String]
  def extract_method_name(line)
    line[/def\s+(?:self\.)?(\w+)/, 1] ||
      line[/def\s+(?:self\.)?([^\s(]+)/, 1] ||
      'unknown'
  end

  # True when the `def` line is the whole definition.
  #
  # @param line [String]
  # @return [Boolean]
  def single_line_definition?(line)
    return true if line.match?(ENDLESS_DEF)

    # Drop a trailing comment so "def foo # the end" is not misread.
    line.strip.sub(/\s+#.*\z/, '').match?(INLINE_DEF_END)
  end

  # Leave method-collection mode.
  def close_method(state)
    state[:current_method] = nil
    state[:depth] = 0
  end

  # Build the final chunk array from the parse state.
  #
  # @param state [Hash]
  # @return [Array<Chunk>]
  def build_chunks(state)
    chunks = []
    chunks << build_chunk(:summary, state[:summary].join) if state[:summary].any?

    state[:methods].each do |method_name, lines|
      chunks << build_chunk(:"method_#{method_name}", lines.join)
    end

    chunks << build_chunk(:private_methods, state[:private_methods].join) if state[:private_methods].any?
    chunks
  end
end
|
|
294
621
|
end
|
|
295
622
|
end
|
|
@@ -34,10 +34,14 @@ module Woods
|
|
|
34
34
|
|
|
35
35
|
# Build the bridge request that looks up a job by ID.
#
# A nil id is left out of `params` entirely, so the request carries a
# missing parameter rather than an explicit nil.
#
# @param id [Object, nil] Job ID
# @return [Hash] Bridge request
def find_job(id:)
  params = id.nil? ? {} : { id: id }
  { tool: "#{prefix}_find_job", params: params }
end
|
|
42
46
|
|
|
43
47
|
# List scheduled jobs.
|
|
@@ -51,10 +55,12 @@ module Woods
|
|
|
51
55
|
|
|
52
56
|
# Build the bridge request that retries a failed job.
#
# A nil id is left out of `params` entirely, so the request carries a
# missing parameter rather than an explicit nil.
#
# @param id [Object, nil] Job ID
# @return [Hash] Bridge request
def retry_job(id:)
  params = id.nil? ? {} : { id: id }
  { tool: "#{prefix}_retry_job", params: params }
end
|
|
59
65
|
|
|
60
66
|
private
|
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require 'json'
|
|
4
|
+
require 'time'
|
|
4
5
|
require 'fileutils'
|
|
6
|
+
require_relative 'credential_scanner'
|
|
5
7
|
|
|
6
8
|
module Woods
|
|
7
9
|
module Console
|
|
@@ -10,6 +12,10 @@ module Woods
|
|
|
10
12
|
# Each line is a JSON object with: tool name, params, timestamp,
|
|
11
13
|
# confirmation status, and result summary.
|
|
12
14
|
#
|
|
15
|
+
# Params and result summaries are passed through {CredentialScanner} so
|
|
16
|
+
# credentials an agent pastes inline into `console_eval` (or any other
|
|
17
|
+
# tool) do not land in audit logs unredacted.
|
|
18
|
+
#
|
|
13
19
|
# @example
|
|
14
20
|
# logger = AuditLogger.new(path: 'log/console_audit.jsonl')
|
|
15
21
|
# logger.log(tool: 'console_eval', params: { code: '1+1' },
|
|
@@ -17,9 +23,15 @@ module Woods
|
|
|
17
23
|
# logger.entries # => [{ "tool" => "console_eval", ... }]
|
|
18
24
|
#
|
|
19
25
|
class AuditLogger
|
|
26
|
+
# Soft cap on any single logged field. Stops an attacker with Tier-4
|
|
27
|
+
# access from filling disk via arbitrarily long params.
|
|
28
|
+
MAX_FIELD_CHARS = 16_384
|
|
29
|
+
|
|
20
30
|
# @param path [String] Path to the JSONL audit log file
# @param scanner [#scan, nil] Object used to redact logged values; a new
#   CredentialScanner is created when none is supplied.
def initialize(path:, scanner: nil)
  @scanner = scanner || CredentialScanner.new
  @path = path
end
|
|
24
36
|
|
|
25
37
|
# Write an audit entry.
|
|
@@ -34,15 +46,75 @@ module Woods
|
|
|
34
46
|
|
|
35
47
|
entry = {
|
|
36
48
|
tool: tool,
|
|
37
|
-
params: params,
|
|
49
|
+
params: redact(truncate_deep(params)),
|
|
38
50
|
confirmed: confirmed,
|
|
39
|
-
result_summary: result_summary,
|
|
51
|
+
result_summary: redact(truncate_value(result_summary)),
|
|
40
52
|
timestamp: Time.now.utc.iso8601
|
|
41
53
|
}
|
|
42
54
|
|
|
43
|
-
|
|
55
|
+
# Exclusive flock around the append — concurrent Tier-4 invocations
|
|
56
|
+
# across Puma threads would otherwise interleave bytes and produce
|
|
57
|
+
# malformed JSONL lines (integrity hit on audit review).
|
|
58
|
+
File.open(@path, File::WRONLY | File::APPEND | File::CREAT, 0o644) do |f|
|
|
59
|
+
f.flock(File::LOCK_EX)
|
|
60
|
+
f.puts(JSON.generate(sanitize_controls(entry)))
|
|
61
|
+
end
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
private
|
|
65
|
+
|
|
66
|
+
# Pass a value through the scanner and keep only the redacted payload
# (the first element of what `scan` returns; the second is discarded).
# nil/false values, or a missing scanner, are returned untouched.
#
# Any StandardError raised while scanning is swallowed and replaced by a
# sentinel string, so a redaction failure never results in raw content
# being returned to the caller.
def redact(value)
  return value if !@scanner || !value

  begin
    cleaned, = @scanner.scan(value)
    cleaned
  rescue StandardError
    '[REDACTION_FAILED]'
  end
end
|
|
45
79
|
|
|
80
|
+
# Walk a nested structure and cap every string leaf via `truncate_value`.
# Hash values and array elements are visited recursively, so the overall
# shape is kept; hash keys are left alone.
def truncate_deep(value)
  if value.is_a?(Hash)
    value.transform_values { |inner| truncate_deep(inner) }
  elsif value.is_a?(Array)
    value.map { |inner| truncate_deep(inner) }
  else
    truncate_value(value)
  end
end
|
|
89
|
+
|
|
90
|
+
# Cap a single string at MAX_FIELD_CHARS, appending a marker that states
# how many characters were cut. Non-strings and strings within the cap
# are returned as given.
def truncate_value(value)
  return value unless value.is_a?(String)

  overflow = value.length - MAX_FIELD_CHARS
  return value unless overflow.positive?

  "#{value[0, MAX_FIELD_CHARS]}… [truncated #{overflow} chars]"
end
|
|
95
|
+
|
|
96
|
+
# Defense-in-depth against log injection: strip ASCII control characters
|
|
97
|
+
# (NUL through US + DEL, except TAB) from every string in the entry
|
|
98
|
+
# before it reaches `JSON.generate`. `JSON.generate` already escapes
|
|
99
|
+
# these in string values, but (a) some downstream log readers parse
|
|
100
|
+
# JSONL by splitting on literal `\n` before JSON-parsing, and (b) a
|
|
101
|
+
# future consumer that decodes and reprints values (e.g. a terminal
|
|
102
|
+
# audit UI) would re-expose injection vectors.
|
|
103
|
+
CONTROL_CHARS = /[\x00-\x08\x0A-\x1F\x7F]/
|
|
104
|
+
private_constant :CONTROL_CHARS
|
|
105
|
+
|
|
106
|
+
# Recursively remove characters matching CONTROL_CHARS from every string
# in a value: strings directly, hash keys and values, and array elements.
# Any other object is returned unchanged.
#
# Strings are scrubbed first. `gsub` raises ArgumentError on a string
# that contains an invalid byte sequence for its encoding, which would
# otherwise abort sanitising on malformed input; `scrub` swaps the
# offending bytes for the replacement character.
def sanitize_controls(value)
  case value
  when String then value.scrub.gsub(CONTROL_CHARS, '')
  when Hash then value.transform_keys { |k| sanitize_controls(k) }
                     .transform_values { |v| sanitize_controls(v) }
  when Array then value.map { |v| sanitize_controls(v) }
  else value
  end
end
|
|
115
|
+
|
|
116
|
+
public
|
|
117
|
+
|
|
46
118
|
# Read all audit entries.
|
|
47
119
|
#
|
|
48
120
|
# @return [Array<Hash>] Parsed JSONL entries
|
data/lib/woods/console/bridge.rb
CHANGED
|
@@ -1,31 +1,64 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require 'json'
|
|
4
|
+
require_relative 'bridge_protocol'
|
|
4
5
|
require_relative 'model_validator'
|
|
5
6
|
require_relative 'safe_context'
|
|
6
7
|
|
|
7
8
|
module Woods
|
|
8
9
|
module Console
|
|
9
|
-
#
|
|
10
|
+
# **PROTOCOL SCAFFOLD — does not execute real queries.** Every handler
|
|
11
|
+
# below returns static empty data (`{ 'count' => 0 }`, `{ 'records' =>
|
|
12
|
+
# [] }`, etc.). Real in-process execution lives in
|
|
13
|
+
# {Woods::Console::EmbeddedExecutor}; the eventual real bridge process
|
|
14
|
+
# for Option D will replace this scaffold with a class that performs
|
|
15
|
+
# actual ActiveRecord queries.
|
|
16
|
+
#
|
|
17
|
+
# The scaffold pins the JSON-lines wire protocol — request envelope,
|
|
18
|
+
# response envelope, supported-tools list, error shape — so other
|
|
19
|
+
# components (EmbeddedExecutor, ConnectionManager, Server) can be
|
|
20
|
+
# built and tested against a stable contract before the real
|
|
21
|
+
# bridge-process implementation lands. Treat this class the way you'd
|
|
22
|
+
# treat a Sinatra fake of a third-party API in tests: it satisfies
|
|
23
|
+
# the protocol, nothing more.
|
|
24
|
+
#
|
|
25
|
+
# ## Why the name carries "Stub"
|
|
26
|
+
#
|
|
27
|
+
# Round-1 audit Track H-4 flagged this class as a "critical SafeContext
|
|
28
|
+
# bypass" because `handle_request` doesn't wrap calls in `SafeContext`.
|
|
29
|
+
# That finding wasn't exploitable — no live code path executes through
|
|
30
|
+
# this class in the shipped gem — but the bare name `Bridge` made the
|
|
31
|
+
# scaffold status invisible to auditors. The `Stub` prefix removes the
|
|
32
|
+
# ambiguity. When the real bridge-process implementation is delivered,
|
|
33
|
+
# it should claim the `Bridge` name; this class will either be deleted
|
|
34
|
+
# (if the protocol is fully owned by the real bridge) or renamed to
|
|
35
|
+
# `BridgeProtocol` and reduced to a constants module.
|
|
36
|
+
#
|
|
37
|
+
# ## Protocol
|
|
10
38
|
#
|
|
11
39
|
# Reads JSON-lines requests from an input IO, validates model/column names,
|
|
12
|
-
# dispatches to tool handlers, and writes JSON-lines responses to
|
|
40
|
+
# dispatches to (stub) tool handlers, and writes JSON-lines responses to
|
|
41
|
+
# an output IO.
|
|
13
42
|
#
|
|
14
43
|
# Protocol:
|
|
15
44
|
# Request: {"id":"req_1","tool":"count","params":{"model":"Order","scope":{"status":"pending"}}}
|
|
16
45
|
# Response: {"id":"req_1","ok":true,"result":{"count":1847},"timing_ms":12.3}
|
|
17
46
|
# Error: {"id":"req_1","ok":false,"error":"Model not found","error_type":"validation"}
|
|
18
47
|
#
|
|
19
|
-
# @example
|
|
20
|
-
# bridge =
|
|
21
|
-
#
|
|
48
|
+
# @example Wiring against a fake input/output (testing only — handlers return empty data)
|
|
49
|
+
# bridge = StubBridge.new(input: $stdin, output: $stdout,
|
|
50
|
+
# model_validator: validator, safe_context: ctx)
|
|
22
51
|
# bridge.run
|
|
23
52
|
#
|
|
24
|
-
class
|
|
25
|
-
|
|
26
|
-
#
|
|
27
|
-
|
|
28
|
-
|
|
53
|
+
class StubBridge
|
|
54
|
+
# Protocol constants live on {BridgeProtocol} so the real executor
|
|
55
|
+
# (EmbeddedExecutor) and a future real bridge-process class can
|
|
56
|
+
# reference them without importing the scaffold. These top-level
|
|
57
|
+
# aliases keep `StubBridge::SUPPORTED_TOOLS` working for existing
|
|
58
|
+
# callers and specs.
|
|
59
|
+
SUPPORTED_TOOLS = BridgeProtocol::SUPPORTED_TOOLS
|
|
60
|
+
TIER1_TOOLS = BridgeProtocol::TIER1_TOOLS
|
|
61
|
+
TOOL_HANDLERS = BridgeProtocol::TOOL_HANDLERS
|
|
29
62
|
|
|
30
63
|
# @param input [IO] Input stream (reads JSON-lines)
|
|
31
64
|
# @param output [IO] Output stream (writes JSON-lines)
|
|
@@ -115,10 +148,10 @@ module Woods
|
|
|
115
148
|
@model_validator.validate_model!(model)
|
|
116
149
|
end
|
|
117
150
|
|
|
118
|
-
# Stub handlers below return empty/zero data by design
|
|
119
|
-
#
|
|
120
|
-
#
|
|
121
|
-
#
|
|
151
|
+
# Stub handlers below return empty/zero data by design — see the
|
|
152
|
+
# class-level docstring. Real in-process execution happens in
|
|
153
|
+
# EmbeddedExecutor; the eventual Option-D bridge process will replace
|
|
154
|
+
# this class entirely.
|
|
122
155
|
|
|
123
156
|
def handle_count(_params)
|
|
124
157
|
{ 'count' => 0 }
|
|
@@ -134,7 +167,7 @@ module Woods
|
|
|
134
167
|
|
|
135
168
|
# Stub `pluck` handler: validates the requested columns against the model
# when any are given, then echoes the column list back with no values.
def handle_pluck(params)
  columns = params['columns']
  @model_validator.validate_columns!(params['model'], columns) if columns

  { 'columns' => Array(columns), 'values' => [] }
end
|
|
139
172
|
|
|
140
173
|
def handle_aggregate(params)
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Woods
|
|
4
|
+
module Console
|
|
5
|
+
# Canonical console-tool protocol contract shared by {StubBridge}
|
|
6
|
+
# (the JSON-lines scaffold) and {EmbeddedExecutor} (the in-process
|
|
7
|
+
# executor that ships today). The eventual real bridge-process
|
|
8
|
+
# implementation (Option D — see `docs/design/CONSOLE_SERVER.md`)
|
|
9
|
+
# will also reference this module so every executor that speaks the
|
|
10
|
+
# protocol agrees on the tool vocabulary.
|
|
11
|
+
#
|
|
12
|
+
# Three constants live here:
|
|
13
|
+
#
|
|
14
|
+
# - {SUPPORTED_TOOLS} — the canonical Tier 1 tool list.
|
|
15
|
+
# - {TIER1_TOOLS} — alias, kept as a distinct name for call
|
|
16
|
+
# sites that reason about tier semantics rather than the whole
|
|
17
|
+
# supported set.
|
|
18
|
+
# - {TOOL_HANDLERS} — tool → `handle_<tool>` method-symbol map.
|
|
19
|
+
#
|
|
20
|
+
# Previously these lived on {StubBridge} and {EmbeddedExecutor}
|
|
21
|
+
# borrowed them with `TIER1_TOOLS = StubBridge::TIER1_TOOLS`, which
|
|
22
|
+
# reads as "the real executor borrows constants from the stub" —
|
|
23
|
+
# backwards. Extracting the protocol here lets the real executor
|
|
24
|
+
# (and a future non-stub `Bridge` class) claim the contract without
|
|
25
|
+
# importing the scaffold.
|
|
26
|
+
module BridgeProtocol
  # Tool names understood by the console protocol.
  SUPPORTED_TOOLS = %w[count sample find pluck aggregate association_count schema recent status].freeze

  # Same array object as SUPPORTED_TOOLS, under a tier-oriented name.
  TIER1_TOOLS = SUPPORTED_TOOLS

  # Tool name => handler method symbol (`handle_<tool>`).
  TOOL_HANDLERS = SUPPORTED_TOOLS.each_with_object({}) do |tool, handlers|
    handlers[tool] = :"handle_#{tool}"
  end.freeze
end
|
|
43
|
+
end
|
|
44
|
+
end
|