woods 1.1.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +186 -0
- data/README.md +20 -8
- data/exe/woods-console +51 -6
- data/exe/woods-console-mcp +24 -4
- data/exe/woods-mcp +30 -7
- data/exe/woods-mcp-http +47 -6
- data/lib/generators/woods/install_generator.rb +13 -4
- data/lib/generators/woods/templates/woods.rb.tt +155 -0
- data/lib/tasks/woods.rake +69 -50
- data/lib/woods/builder.rb +174 -9
- data/lib/woods/cache/cache_middleware.rb +360 -31
- data/lib/woods/chunking/semantic_chunker.rb +334 -7
- data/lib/woods/console/adapters/job_adapter.rb +10 -4
- data/lib/woods/console/audit_logger.rb +76 -4
- data/lib/woods/console/bridge.rb +48 -15
- data/lib/woods/console/bridge_protocol.rb +44 -0
- data/lib/woods/console/confirmation.rb +3 -4
- data/lib/woods/console/console_response_renderer.rb +56 -18
- data/lib/woods/console/credential_index.rb +201 -0
- data/lib/woods/console/credential_scanner.rb +302 -0
- data/lib/woods/console/dispatch_pipeline.rb +138 -0
- data/lib/woods/console/embedded_executor.rb +682 -35
- data/lib/woods/console/eval_guard.rb +319 -0
- data/lib/woods/console/model_validator.rb +1 -3
- data/lib/woods/console/rack_middleware.rb +185 -29
- data/lib/woods/console/redactor.rb +161 -0
- data/lib/woods/console/response_context.rb +127 -0
- data/lib/woods/console/safe_context.rb +220 -23
- data/lib/woods/console/scope_predicate_parser.rb +131 -0
- data/lib/woods/console/server.rb +417 -486
- data/lib/woods/console/sql_noise_stripper.rb +87 -0
- data/lib/woods/console/sql_table_scanner.rb +213 -0
- data/lib/woods/console/sql_validator.rb +81 -31
- data/lib/woods/console/table_gate.rb +93 -0
- data/lib/woods/console/tool_specs.rb +552 -0
- data/lib/woods/console/tools/tier1.rb +3 -3
- data/lib/woods/console/tools/tier4.rb +7 -1
- data/lib/woods/dependency_graph.rb +66 -7
- data/lib/woods/embedding/indexer.rb +190 -6
- data/lib/woods/embedding/openai.rb +40 -4
- data/lib/woods/embedding/provider.rb +104 -8
- data/lib/woods/embedding/text_preparer.rb +23 -3
- data/lib/woods/embedding/token_counter.rb +133 -0
- data/lib/woods/evaluation/baseline_runner.rb +20 -2
- data/lib/woods/evaluation/metrics.rb +4 -1
- data/lib/woods/extracted_unit.rb +1 -0
- data/lib/woods/extractor.rb +7 -1
- data/lib/woods/extractors/controller_extractor.rb +6 -0
- data/lib/woods/extractors/mailer_extractor.rb +16 -2
- data/lib/woods/extractors/model_extractor.rb +6 -1
- data/lib/woods/extractors/phlex_extractor.rb +13 -4
- data/lib/woods/extractors/rails_source_extractor.rb +2 -0
- data/lib/woods/extractors/route_helper_resolver.rb +130 -0
- data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
- data/lib/woods/extractors/view_component_extractor.rb +12 -1
- data/lib/woods/extractors/view_engines/base.rb +141 -0
- data/lib/woods/extractors/view_engines/erb.rb +145 -0
- data/lib/woods/extractors/view_template_extractor.rb +92 -133
- data/lib/woods/flow_assembler.rb +23 -15
- data/lib/woods/flow_precomputer.rb +21 -2
- data/lib/woods/graph_analyzer.rb +210 -0
- data/lib/woods/index_artifact.rb +173 -0
- data/lib/woods/mcp/bearer_auth.rb +45 -0
- data/lib/woods/mcp/bootstrap_state.rb +94 -0
- data/lib/woods/mcp/bootstrapper.rb +337 -16
- data/lib/woods/mcp/config_resolver.rb +288 -0
- data/lib/woods/mcp/errors.rb +134 -0
- data/lib/woods/mcp/index_reader.rb +265 -30
- data/lib/woods/mcp/origin_guard.rb +132 -0
- data/lib/woods/mcp/provider_probe.rb +166 -0
- data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
- data/lib/woods/mcp/renderers/markdown_renderer.rb +100 -3
- data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
- data/lib/woods/mcp/server.rb +771 -137
- data/lib/woods/model_name_cache.rb +78 -2
- data/lib/woods/notion/client.rb +25 -2
- data/lib/woods/notion/mappers/model_mapper.rb +36 -2
- data/lib/woods/railtie.rb +55 -15
- data/lib/woods/resilience/circuit_breaker.rb +9 -2
- data/lib/woods/resilience/retryable_provider.rb +40 -3
- data/lib/woods/resolved_config.rb +299 -0
- data/lib/woods/retrieval/context_assembler.rb +112 -5
- data/lib/woods/retrieval/query_classifier.rb +1 -1
- data/lib/woods/retrieval/ranker.rb +55 -6
- data/lib/woods/retrieval/search_executor.rb +42 -13
- data/lib/woods/retriever.rb +330 -24
- data/lib/woods/session_tracer/middleware.rb +35 -1
- data/lib/woods/storage/graph_store.rb +39 -0
- data/lib/woods/storage/inapplicable_backend.rb +14 -0
- data/lib/woods/storage/metadata_store.rb +129 -1
- data/lib/woods/storage/pgvector.rb +70 -8
- data/lib/woods/storage/qdrant.rb +196 -5
- data/lib/woods/storage/snapshotter/metadata.rb +172 -0
- data/lib/woods/storage/snapshotter/vector.rb +238 -0
- data/lib/woods/storage/snapshotter.rb +24 -0
- data/lib/woods/storage/vector_store.rb +184 -35
- data/lib/woods/tasks.rb +85 -0
- data/lib/woods/temporal/snapshot_store.rb +49 -1
- data/lib/woods/token_utils.rb +44 -5
- data/lib/woods/unblocked/client.rb +163 -0
- data/lib/woods/unblocked/document_builder.rb +326 -0
- data/lib/woods/unblocked/exporter.rb +201 -0
- data/lib/woods/unblocked/rate_limiter.rb +94 -0
- data/lib/woods/util/host_guard.rb +61 -0
- data/lib/woods/version.rb +1 -1
- data/lib/woods.rb +130 -6
- metadata +73 -4
|
@@ -52,9 +52,8 @@ module Woods
|
|
|
52
52
|
# @param tool [String] Tool name
|
|
53
53
|
# @param description [String] Human-readable description of the action
|
|
54
54
|
# @param params [Hash] Tool parameters
|
|
55
|
-
# @return [true] if confirmed
|
|
56
55
|
# @raise [ConfirmationDeniedError] if denied
|
|
57
|
-
def request_confirmation(tool:, description:, params:)
|
|
56
|
+
def request_confirmation(tool:, description:, params:)
|
|
58
57
|
approved = evaluate(tool: tool, description: description, params: params)
|
|
59
58
|
|
|
60
59
|
@history << {
|
|
@@ -65,9 +64,9 @@ module Woods
|
|
|
65
64
|
timestamp: Time.now.utc.iso8601
|
|
66
65
|
}
|
|
67
66
|
|
|
68
|
-
|
|
67
|
+
return if approved
|
|
69
68
|
|
|
70
|
-
|
|
69
|
+
raise ConfirmationDeniedError, "Confirmation denied for #{tool}: #{description}"
|
|
71
70
|
end
|
|
72
71
|
|
|
73
72
|
private
|
|
@@ -9,11 +9,19 @@ module Woods
|
|
|
9
9
|
#
|
|
10
10
|
# Auto-detects:
|
|
11
11
|
# - Array<Hash> → Markdown tables
|
|
12
|
-
# - Single Hash → Key-value bullet lists
|
|
12
|
+
# - Single Hash → Key-value bullet lists (Array values recurse into tables;
|
|
13
|
+
# `rows` / `values` paired with a sibling `columns` render as positional
|
|
14
|
+
# Markdown tables so sql/query/pluck output isn't collapsed)
|
|
13
15
|
# - Simple Array → Bullet list
|
|
14
16
|
# - Scalars → Plain text
|
|
15
17
|
#
|
|
16
18
|
class ConsoleResponseRenderer < MCP::ToolResponseRenderer
|
|
19
|
+
# Keys that carry positional row data in tool responses. When any of
|
|
20
|
+
# these appears alongside a `columns` array we render a proper table
|
|
21
|
+
# using the columns as headers instead of dumping bracketed arrays.
|
|
22
|
+
POSITIONAL_ROW_KEYS = %w[rows values].freeze
|
|
23
|
+
private_constant :POSITIONAL_ROW_KEYS
|
|
24
|
+
|
|
17
25
|
# Smart default: auto-detect data shape and render accordingly.
|
|
18
26
|
#
|
|
19
27
|
# @param data [Object] The bridge response result
|
|
@@ -43,26 +51,56 @@ module Woods
|
|
|
43
51
|
|
|
44
52
|
# Render an Array of Hashes as a Markdown table.
#
# Column order and membership come from the first row's keys; every row is
# projected onto those keys, so a key missing from a later row yields an
# empty cell.
#
# @param rows [Array<Hash>] non-empty list of row hashes
# @return [String] header, separator and one line per row
def render_table(rows)
  columns = rows.first.keys
  body = rows.map do |row|
    row_line(columns.map { |column| row[column] })
  end
  header_row(columns) + body.join
end
|
|
54
56
|
|
|
55
57
|
# Render a single Hash as one Markdown entry per key.
#
# A sibling `columns` array (String or Symbol key) is looked up once and
# handed to every entry so positional row data can be rendered as a table.
#
# @param data [Hash] the response hash
# @return [String] entries joined by newlines
def render_hash(data)
  raw_columns = data['columns'] || data[:columns]
  columns = stringify_array(raw_columns)
  entries = data.map { |key, value| render_hash_entry(key, value, columns) }
  entries.join("\n")
end
|
|
61
|
+
|
|
62
|
+
# Render one key/value pair of a Hash response.
#
# Nested Hashes become a bullet list under a bold label, Arrays are
# delegated to {#render_hash_array_value}, anything else is printed inline.
#
# @param key [Object] the entry key (interpolated into the label)
# @param value [Object] the entry value
# @param sibling_columns [Array<String>, nil] column names found next to
#   this entry in the same Hash, if any
# @return [String]
def render_hash_entry(key, value, sibling_columns)
  label = "**#{key}:**"

  if value.is_a?(Hash)
    bullets = value.map { |k, v| " - #{k}: #{v}" }
    "#{label}\n#{bullets.join("\n")}"
  elsif value.is_a?(Array)
    "#{label}\n#{render_hash_array_value(key, value, sibling_columns)}"
  else
    "#{label} #{value}"
  end
end
|
|
72
|
+
|
|
73
|
+
# Render an Array found as a Hash value.
#
# When the key is one of the positional row keys and a non-empty sibling
# `columns` list exists, the array is rendered as a table with those
# columns as headers; otherwise it falls back to the generic array renderer.
#
# @param key [Object] the key the array was stored under
# @param value [Array] the array to render
# @param sibling_columns [Array<String>, nil] header names, if present
# @return [String]
def render_hash_array_value(key, value, sibling_columns)
  positional = POSITIONAL_ROW_KEYS.include?(key.to_s) && sibling_columns && !sibling_columns.empty?
  return render_array(value) unless positional

  render_positional_table(value, sibling_columns)
end
|
|
80
|
+
|
|
81
|
+
# Build a Markdown table from positional row data + a columns header.
|
|
82
|
+
# Handles both Array<Array> (multi-column rows) and flat scalar arrays
|
|
83
|
+
# (pluck with a single column — Rails collapses the result).
|
|
84
|
+
# Build a Markdown table from positional rows plus a list of column names.
#
# Each row may be an Array (multi-column) or a bare scalar (single-column
# result); scalars are wrapped so they occupy one cell.
#
# @param rows [Array] positional row data
# @param columns [Array<String>] header names
# @return [String] the table, or the `_(empty)_` placeholder when there
#   are no rows
def render_positional_table(rows, columns)
  if rows.empty?
    '_(empty)_'
  else
    body = rows.map { |row| row_line(positional_row_values(row)) }
    header_row(columns) + body.join
  end
end
|
|
89
|
+
|
|
90
|
+
# Normalise one positional row to an Array of cell values.
#
# @param row [Object] an Array of cells, or a single scalar cell
# @return [Array] the row itself when it is already an Array, otherwise a
#   one-element Array wrapping it
def positional_row_values(row)
  return row if row.is_a?(Array)

  [row]
end
|
|
93
|
+
|
|
94
|
+
# Build the two leading lines of a Markdown table: the header cells and the
# `---` separator row, each terminated by a newline.
#
# @param keys [Array] column names
# @return [String]
def header_row(keys)
  titles = keys.join(' | ')
  rules = keys.map { '---' }.join(' | ')
  "| #{titles} |\n| #{rules} |\n"
end
|
|
97
|
+
|
|
98
|
+
# Build one Markdown table row, terminated by a newline.
#
# Every cell is stringified on its own before joining. Joining the raw
# values would let `Array#join` recursively flatten a cell that is itself an
# Array (e.g. a JSON/array column), spilling its elements into extra
# columns. Characters that would break the row are neutralised: `|` is
# backslash-escaped and line breaks become `<br>`.
#
# @param values [Array] the cell values, in column order
# @return [String]
def row_line(values)
  cells = values.map do |value|
    # Block form of gsub so the replacement is taken literally.
    value.to_s.gsub('|') { '\|' }.gsub(/\r?\n/, '<br>')
  end
  "| #{cells.join(' | ')} |\n"
end
|
|
101
|
+
|
|
102
|
+
# Convert an Array's elements to Strings.
#
# @param value [Object] candidate array
# @return [Array<String>, nil] stringified copy, or nil when `value` is not
#   an Array
def stringify_array(value)
  return nil unless value.is_a?(Array)

  value.map(&:to_s)
end
|
|
67
105
|
end
|
|
68
106
|
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'set'
|
|
4
|
+
|
|
5
|
+
# @see Woods
|
|
6
|
+
module Woods
|
|
7
|
+
class Error < StandardError; end unless defined?(Woods::Error)
|
|
8
|
+
|
|
9
|
+
module Console
|
|
10
|
+
# Boot-time index of every string leaf in `Rails.application.credentials`.
|
|
11
|
+
#
|
|
12
|
+
# The pattern-based {CredentialScanner} catches *known credential shapes*
|
|
13
|
+
# (Stripe `sk_live_…`, AWS `AKIA…`, etc.). It cannot catch a value whose
|
|
14
|
+
# shape is unknown — a hand-rolled HMAC secret, a Twilio auth token, a
|
|
15
|
+
# third-party webhook signing key. This index closes that gap by
|
|
16
|
+
# remembering the host app's *actual* credential values and substring-
|
|
17
|
+
# matching them in every Console MCP response, so a row whose value
|
|
18
|
+
# contains a stored credential is redacted regardless of column
|
|
19
|
+
# name or value shape.
|
|
20
|
+
#
|
|
21
|
+
# The index is built once at server boot — walking encrypted credentials
|
|
22
|
+
# on every request would be both expensive and unsafe (it requires the
|
|
23
|
+
# master key). Each string leaf with length ≥ {MIN_LENGTH} is added to a
|
|
24
|
+
# frozen Set, and a pre-compiled `Regexp.union` is held for one-pass
|
|
25
|
+
# `gsub` substitution.
|
|
26
|
+
#
|
|
27
|
+
# ### Multi-DB / sharded caveat
|
|
28
|
+
# The index reflects credentials available to the *Rails process* that
|
|
29
|
+
# boots the Console MCP server. A separate database that holds its own
|
|
30
|
+
# secrets (e.g., a vendored CMS app sharing the same Rails host) is not
|
|
31
|
+
# in scope. Use Layer 3 (`console_redacted_columns` /
|
|
32
|
+
# `console_redacted_key_values`) for those.
|
|
33
|
+
#
|
|
34
|
+
# ### Missing master key
|
|
35
|
+
# In environments without `config/master.key` (CI, fresh checkouts) the
|
|
36
|
+
# `Rails.application.credentials.config` call raises
|
|
37
|
+
# `ActiveSupport::EncryptedConfiguration::MissingKeyError` or
|
|
38
|
+
# `ActiveSupport::MessageEncryptor::InvalidMessage`. {.build} catches
|
|
39
|
+
# both *by name* (no constant reference, so the class load order is
|
|
40
|
+
# irrelevant) and returns an empty index — the server still boots, the
|
|
41
|
+
# other defense layers still apply.
|
|
42
|
+
#
|
|
43
|
+
# @example
|
|
44
|
+
# index = CredentialIndex.build(rails_app: Rails.application)
|
|
45
|
+
# index.match?("sk_live_actual_secret_value") # => true
|
|
46
|
+
# index.redact("token: sk_live_actual_secret_value")
|
|
47
|
+
# # => "token: [REDACTED:credential]"
|
|
48
|
+
#
|
|
49
|
+
class CredentialIndex
  # Captured at require time so the mtime-check warning has a stable
  # reference point even if the clock skews later. Frozen immediately
  # to prevent accidental mutation.
  PROCESS_START = Time.now.freeze

  # Substrings shorter than this are not added to the index. Below ~12
  # chars the false-positive rate climbs sharply (env names like
  # `production`, hostnames, version strings, etc.).
  MIN_LENGTH = 12

  # Rendered marker for substring hits — distinct from the pattern
  # scanner's `[REDACTED]` so operators reading audit output can see
  # *which* layer caught the leak.
  REDACTED = '[REDACTED:credential]'

  # Encryption-related exception class names caught by name. Rails moves
  # these around between versions; matching by `Class#name` keeps us
  # from coupling to a specific constant path.
  MISSING_KEY_ERRORS = %w[
    ActiveSupport::EncryptedConfiguration::MissingKeyError
    ActiveSupport::EncryptedFile::MissingKeyError
    ActiveSupport::MessageEncryptor::InvalidMessage
  ].freeze

  class << self
    # Build an index from a Rails application's encrypted credentials.
    #
    # **Restart required after rotation.** The index is built once at
    # server boot and held in memory for the life of the MCP process.
    # When a host app rotates Rails credentials, the MCP process keeps
    # the pre-rotation secrets in its frozen Set until the process
    # restarts. New secrets added during rotation are NOT in the index
    # — only Layer 2 shape patterns can catch them until restart.
    #
    # To trigger a rebuild without restarting, call
    # `Woods::Console::Server.rebuild_credential_index` from a rotation
    # job or initializer hook. See docs/CONSOLE_MCP_SETUP.md for the
    # full restart guidance.
    #
    # @param rails_app [#credentials] usually `Rails.application`.
    # @return [CredentialIndex] populated index, or an empty index when
    #   the credentials store can't be opened.
    # @raise [StandardError] any error whose class name is not listed in
    #   {MISSING_KEY_ERRORS} is re-raised unchanged.
    def build(rails_app:)
      new(secrets: collect_secrets(rails_app))
    rescue StandardError => e
      raise unless missing_key_error?(e)

      new(secrets: [])
    end

    # Emit a structured log warning when any of the given credentials
    # files has a modification time newer than `process_start`. This
    # catches the common "rotated credentials but forgot to restart"
    # situation at boot time.
    #
    # Only files that actually exist on disk are checked; missing paths
    # are silently skipped so CI and fresh-checkout environments (which
    # have no credentials file) produce no noise.
    #
    # @param credentials_files [Array<String>] Paths to check (e.g.
    #   `config/credentials.yml.enc`,
    #   `config/credentials/production.yml.enc`).
    # @param process_start [Time] When the current process started.
    #   Typically `Woods::Console::CredentialIndex::PROCESS_START`.
    # @param logger [#warn] A logger responding to `warn(event, **kwargs)`.
    # @return [void]
    def warn_if_credentials_rotated(credentials_files:, process_start:, logger:)
      credentials_files.each do |path|
        next unless File.exist?(path)

        mtime = File.mtime(path)
        next unless mtime > process_start

        logger.warn(
          'console.credential_index.stale',
          credentials_file: path,
          file_mtime: mtime.iso8601,
          process_start: process_start.iso8601,
          hint: 'Credentials file was modified after process start — ' \
                'restart the MCP process or call ' \
                'Woods::Console::Server.rebuild_credential_index to ' \
                'pick up rotated secrets.'
        )
      end
    end

    private

    # Harvest every qualifying string leaf from the app's credentials.
    #
    # @param rails_app [#credentials]
    # @return [Array<String>]
    def collect_secrets(rails_app)
      config = rails_app.credentials.config
      collected = []
      walk(config, collected)
      collected
    end

    # Depth-first walk over nested Hash/Array structures, appending every
    # String leaf of at least {MIN_LENGTH} characters to `collected`.
    # Hash keys and non-String leaves are ignored.
    def walk(node, collected)
      case node
      when Hash then node.each_value { |v| walk(v, collected) }
      when Array then node.each { |v| walk(v, collected) }
      when String
        collected << node if node.length >= MIN_LENGTH
      end
    end

    # @return [Boolean] true when the error's class name is one of
    #   {MISSING_KEY_ERRORS}.
    def missing_key_error?(error)
      MISSING_KEY_ERRORS.include?(error.class.name)
    end
  end

  # @return [Set<String>] frozen set of secret strings (length ≥ MIN_LENGTH).
  attr_reader :secrets

  # @return [Regexp, nil] precompiled `Regexp.union` of every secret, or
  #   nil when the index is empty (no allocation, no per-string overhead).
  attr_reader :pattern

  # @param secrets [Enumerable<String>] string leaves harvested from
  #   credentials. Duplicates are collapsed; strings shorter than
  #   {MIN_LENGTH} are dropped.
  def initialize(secrets:)
    filtered = Array(secrets).select { |s| s.is_a?(String) && s.length >= MIN_LENGTH }
    @secrets = filtered.to_set.freeze
    # Regexp alternation picks the FIRST alternative that matches at a
    # position, not the longest. If one secret is a prefix of another and
    # is listed first, only the prefix would be replaced and the tail of
    # the longer secret would leak. Ordering longest-first prevents that.
    @pattern = @secrets.empty? ? nil : Regexp.union(@secrets.sort_by { |s| -s.length })
  end

  # @return [Boolean] true when no secrets were collected (missing key,
  #   empty credentials file, or every leaf below MIN_LENGTH).
  def empty?
    @secrets.empty?
  end

  # @param str [String]
  # @return [Boolean] true when any indexed secret appears as a substring.
  #   Always false for non-String input or an empty index.
  def match?(str)
    return false if empty? || !str.is_a?(String)

    @pattern.match?(str)
  end

  # Replace every indexed-secret substring in `str` with {REDACTED}.
  #
  # @param str [String]
  # @return [String] redacted copy. Returns the input unchanged when the
  #   index is empty, the input is not a String, or no secret appears.
  def redact(str)
    return str if empty? || !str.is_a?(String) || !@pattern.match?(str)

    str.gsub(@pattern, REDACTED)
  end
end
|
|
200
|
+
end
|
|
201
|
+
end
|
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'base64'
|
|
4
|
+
require 'cgi'
|
|
5
|
+
require 'set'
|
|
6
|
+
|
|
7
|
+
require_relative 'credential_index'
|
|
8
|
+
|
|
9
|
+
# @see Woods
|
|
10
|
+
module Woods
|
|
11
|
+
class Error < StandardError; end unless defined?(Woods::Error)
|
|
12
|
+
|
|
13
|
+
module Console
|
|
14
|
+
# Content-shape credential scanner for Console MCP responses.
|
|
15
|
+
#
|
|
16
|
+
# Walks a serialized response tree (strings, nested Hash, nested Array)
|
|
17
|
+
# and replaces substrings that match known credential formats with
|
|
18
|
+
# `[REDACTED]`. Pattern matching is high-specificity (word-boundary
|
|
19
|
+
# anchored, minimum-length bounded) so false positives against UUIDs,
|
|
20
|
+
# email addresses, and short identifiers stay rare.
|
|
21
|
+
#
|
|
22
|
+
# This is Layer 2 of the defense-in-depth stack — it runs AFTER the
|
|
23
|
+
# operator-configured column and EAV redaction layers so it catches
|
|
24
|
+
# credentials those layers missed (newly-added EAV keys, secrets stored
|
|
25
|
+
# in JSONB columns, associated records pulled via nested serialization).
|
|
26
|
+
#
|
|
27
|
+
# @example
|
|
28
|
+
# scanner = CredentialScanner.new
|
|
29
|
+
# value, counts = scanner.scan('token is sk_test_4eC39HqLyjWDarjtT1zdp7dc')
|
|
30
|
+
# value # => "token is [REDACTED]"
|
|
31
|
+
# counts # => { stripe_secret_key: 1 }
|
|
32
|
+
#
|
|
33
|
+
class CredentialScanner # rubocop:disable Metrics/ClassLength
  REDACTED = '[REDACTED]'

  # High-specificity credential patterns. Each is word-boundary anchored
  # and bounded by a realistic minimum length so random short strings
  # cannot trigger a match.
  #
  # Order matters: more-specific patterns appear before less-specific
  # alternatives (e.g., `anthropic_api_key` before `openai_api_key`)
  # so the specific counter increments rather than the generic one.
  PATTERNS = {
    stripe_secret_key: /\b(?:sk|rk)_(?:live|test)_[A-Za-z0-9]{24,}\b/,
    stripe_publishable_key: /\bpk_(?:live|test)_[A-Za-z0-9]{24,}\b/,
    stripe_webhook_secret: /\bwhsec_[A-Za-z0-9]{24,}\b/,
    # Stripe Connect account IDs are PII per Stripe's ToS even though they
    # are not strictly secret — surfacing one in an MCP response leaks the
    # connected merchant's identity.
    stripe_connect_account_id: /\bacct_[A-Za-z0-9]{16,}\b/,
    # Klaviyo private API keys use a bare `pk_` prefix with no live/test
    # infix — they evade the Stripe publishable regex and grant full API
    # access to the Klaviyo tenant. Order matters: stripe_publishable_key
    # runs first so its more-specific match wins on Stripe values.
    klaviyo_private_key: /\bpk_[A-Za-z0-9]{34}\b/,
    aws_access_key_id: /\b(?:AKIA|ASIA)[0-9A-Z]{16}\b/,
    github_fine_grained_pat: /\bgithub_pat_[A-Za-z0-9_]{82}\b/,
    github_token: /\bgh[pousr]_[A-Za-z0-9]{36,}\b/,
    google_oauth_token: /\bya29\.[A-Za-z0-9_-]{20,}\b/,
    jwt_token: /\beyJ[A-Za-z0-9_-]+\.eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\b/,
    pem_private_key_block: /-----BEGIN [A-Z ]*PRIVATE KEY-----[\s\S]*?-----END [A-Z ]*PRIVATE KEY-----/,
    slack_token: /\bxox[abpr]-[A-Za-z0-9-]{10,}\b/,
    sendgrid_api_key: /\bSG\.[A-Za-z0-9_-]{22}\.[A-Za-z0-9_-]{43}\b/,
    mailgun_api_key: /\bkey-[a-f0-9]{32}\b/,
    # Matches both the current `sk-ant-api03-…` / `sk-ant-admin01-…` shape
    # and the legacy `sk-ant-…` format that shipped without the
    # `api|admin` infix. Length floor prevents matching on a bare `sk-ant-`
    # prefix in logs or docs.
    anthropic_api_key: /\bsk-ant-(?:(?:api|admin)\d{2}-)?[A-Za-z0-9_-]{80,}\b/,
    openai_api_key: %r{\bsk-(?:proj-)?[A-Za-z0-9/_-]{40,}\b},
    # Covers access (`shpat_`, `shpca_`, `shpss_`, `shppa_`), refresh
    # (`shprt_`) and user-access (`shpua_`) token prefixes.
    shopify_access_token: /\bshp(?:at|ca|ss|pa|rt|ua)_[a-f0-9]{32}\b/,
    square_access_token: /\bsq0[a-z]{3}-[A-Za-z0-9_-]{22,}\b/,
    paypal_access_token: /\baccess_token\$(?:production|sandbox)\$[A-Za-z0-9]+\$[a-f0-9]+\b/,
    # Distinctive shape: `00D` followed by 12 alphanumerics (15 characters
    # in total), a `!`, then an 80-250 character payload.
    salesforce_access_token: /\b00D[A-Za-z0-9]{12}![A-Za-z0-9._]{80,250}\b/,
    launchdarkly_sdk_key: /\bsdk-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b/,
    launchdarkly_mobile_key: /\bmob-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b/,
    hubspot_private_app_token: Regexp.new(
      '\bpat-(?:na1|na2|eu1|eu2|ap1)-' \
      '[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b'
    ),
    brevo_api_key: /\bxkeysib-[a-f0-9]{64}-[A-Za-z0-9]{16}\b/,
    brevo_smtp_key: /\bxsmtpsib-[a-f0-9]{64}-[A-Za-z0-9]{16}\b/,
    kit_api_key: /\bkit_[A-Za-z0-9]{20,}\b/,
    twilio_account_sid: /\bAC[0-9a-fA-F]{32}\b/,
    twilio_api_key_sid: /\bSK[0-9a-fA-F]{32}\b/,
    twilio_verify_service_sid: /\bVA[0-9a-fA-F]{32}\b/,
    # Connection strings with embedded credentials: `postgres://user:pass@host/db`,
    # `mysql2://user:pass@host/db`, `mongodb://…`, `amqp://…`, `redis://…`.
    # Captures the entire URL — the password is part of it and redacting
    # just the password field while leaving `user@host` visible is not
    # worth the regex complexity when the host may itself be sensitive.
    database_url_with_password: Regexp.new(
      '\b(?:postgres|postgresql|mysql|mysql2|mongodb|mongodb\+srv|amqp|amqps|redis|rediss|' \
      'clickhouse|cockroachdb|mariadb)://[^\s:@/]+:[^\s@/]+@\S+'
    )
  }.freeze

  # @return [Array<Symbol>] every pattern name the scanner knows about.
  def self.patterns
    PATTERNS.keys
  end

  # Replace the boot-time credential index with a freshly built one.
  #
  # Called by `Woods::Console::Server.rebuild_credential_index` after a
  # host app rotates its Rails credentials. The new index is fully
  # constructed by the caller before this single instance-variable
  # assignment swaps it in, so a scan observes either the old or the new
  # index — never a partially built one.
  #
  # @param new_index [CredentialIndex, nil] The replacement index. An
  #   index reporting `empty?` is stored as nil (substring layer off).
  # @return [void]
  def replace_index!(new_index)
    @secret_index = usable_index(new_index)
  end

  # Counter key emitted when {CredentialIndex} substring-matches a value
  # before any shape pattern fires. Distinct from pattern names so
  # observability can tell the two layers apart.
  INDEX_HIT = :credential_index

  # @param disabled_patterns [Array<Symbol, String>] names to skip at scan
  #   time. Strings are coerced to Symbols.
  # @param secret_index [#match?, #redact, nil] Optional {CredentialIndex}
  #   built from the host app's actual credentials. When present, every
  #   string is run through the index *before* the pattern pass — so a
  #   value whose shape no pattern recognizes (Twilio auth tokens,
  #   hand-rolled HMAC keys, etc.) is still redacted when it contains a
  #   stored credential. Pass `nil` (or a `CredentialIndex#empty?`
  #   index) to skip the substring layer.
  def initialize(disabled_patterns: [], secret_index: nil)
    disabled = Array(disabled_patterns).to_set(&:to_sym)
    @active_patterns = PATTERNS.except(*disabled)
    @secret_index = usable_index(secret_index)
  end

  # Scan a value (String, Hash, Array, or any other object) for credentials.
  #
  # Strings are gsub'd against every active pattern. Hashes (keys and
  # values) and Array elements are walked recursively; non-string scalars
  # (Integer, Float, true/false, nil) pass through untouched.
  #
  # @param value [Object]
  # @return [Array(Object, Hash{Symbol=>Integer})] two-tuple of the scanned
  #   value and a per-pattern match count. Count entries are only present
  #   for patterns that fired — callers should treat a missing key as zero.
  def scan(value)
    counts = {}
    scanned = walk(value, counts)
    [scanned, counts]
  end

  private

  # Collapse "no index" and "empty index" to nil so the scan path only
  # needs a single nil check.
  def usable_index(index)
    return nil if index.respond_to?(:empty?) && index.empty?

    index
  end

  # Recursive dispatch on the value's type; returns a scanned copy.
  def walk(value, counts)
    case value
    when String then scan_string(value, counts)
    when Hash then walk_hash(value, counts)
    when Array then value.map { |item| walk(item, counts) }
    else value
    end
  end

  # Walk a Hash, scanning both keys and values for credentials.
  #
  # Keys are scanned by coercing them to String, running the scan, then
  # restoring the original key type — a Symbol key that carries a
  # credential-shaped string is emitted as a Symbol after redaction
  # (e.g. `:"[REDACTED]"`). String keys stay Strings. Non-String,
  # non-Symbol keys (Integer, etc.) pass through unchanged.
  def walk_hash(hash, counts)
    hash.each_with_object({}) do |(key, val), out|
      scanned_key = case key
                    when String then scan_string(key, counts)
                    when Symbol then scan_string(key.to_s, counts).to_sym
                    else key
                    end
      out[scanned_key] = walk(val, counts)
    end
  end

  # Run the three passes over one string, in order: the credential index,
  # the encoded-form scan, then every active literal pattern.
  def scan_string(str, counts)
    result = redact_indexed_secrets(str, counts)
    result = scan_encoded_forms(result, counts)
    @active_patterns.inject(result) do |acc, (name, pattern)|
      acc.gsub(pattern) do
        counts[name] = (counts[name] || 0) + 1
        REDACTED
      end
    end
  end

  # Best-effort redaction of credentials that slip past the literal-pattern
  # pass because they are URL-encoded (`sk%5Ftest%5F…`) or base64-wrapped
  # (`c2tfdGVzdF8…`). For each candidate substring we try the decoded form
  # against every active pattern; if any pattern matches, we redact the
  # encoded substring in the original string. This is a defense-in-depth
  # layer — the literal patterns remain the primary guarantee and the
  # decoded scan runs only over substrings whose shape suggests encoding.
  def scan_encoded_forms(str, counts)
    str = scan_url_encoded(str, counts) if str.include?('%')
    scan_base64(str, counts)
  end

  URL_ENCODED_CANDIDATE = %r{[A-Za-z0-9%_.~+/=-]{24,}}
  # No trailing `\b` anchor — `\b` requires a word/non-word transition
  # and fails at end-of-string after a `=` padding char. Use a negative
  # look-ahead to stop at the next alphanumeric character instead.
  BASE64_CANDIDATE = %r{\b[A-Za-z0-9+/]{40,}={0,2}(?![A-Za-z0-9+/])}
  private_constant :URL_ENCODED_CANDIDATE, :BASE64_CANDIDATE

  # Redact URL-encoded runs whose decoded form matches an active pattern.
  def scan_url_encoded(str, counts)
    str.gsub(URL_ENCODED_CANDIDATE) do |candidate|
      next candidate unless candidate.include?('%')

      redact_decoded(candidate, safely_unescape(candidate), counts)
    end
  end

  # Redact base64 runs whose decoded form matches an active pattern.
  def scan_base64(str, counts)
    str.gsub(BASE64_CANDIDATE) do |candidate|
      decoded = safely_base64_decode(candidate)
      decoded ? redact_decoded(candidate, decoded, counts) : candidate
    end
  end

  # Shared tail of both encoded scans: returns {REDACTED} (and bumps the
  # matching pattern's counter) when `decoded` looks like a credential,
  # otherwise returns the encoded `candidate` untouched.
  def redact_decoded(candidate, decoded, counts)
    credential_name = first_matching_pattern(decoded)
    return candidate unless credential_name

    counts[credential_name] = (counts[credential_name] || 0) + 1
    REDACTED
  end

  # @return [Symbol, nil] name of the first active pattern matching
  #   `value`, in {PATTERNS} order, or nil when none match.
  def first_matching_pattern(value)
    @active_patterns.each do |name, pattern|
      return name if pattern.match?(value)
    end
    nil
  end

  # Percent-decode `value`, always returning a string that is safe to
  # match against a Regexp.
  #
  # Percent-decoding arbitrary bytes (e.g. Latin-1 `%E9`) can yield a
  # string that is not valid in its encoding, and regexp matching raises
  # ArgumentError on such strings — which would abort the whole scan.
  # Invalid byte sequences are therefore scrubbed before the result is
  # returned, keeping any credential next to them detectable.
  def safely_unescape(value)
    decoded = CGI.unescape(value)
    decoded.valid_encoding? ? decoded : decoded.scrub
  rescue ArgumentError
    value
  end

  # Strict-decode `value` as base64.
  #
  # @return [String, nil] the decoded text, or nil when the input is not
  #   valid base64, does not decode to valid UTF-8, or is not mostly
  #   printable.
  def safely_base64_decode(value)
    decoded = Base64.strict_decode64(value)
    decoded.force_encoding(Encoding::UTF_8)
    return nil unless decoded.valid_encoding?
    # CPU short-circuit: credential strings are printable ASCII. If
    # the decoded bytes are mostly non-printable (likely a binary
    # hash / random bytes / image), skip the pattern scan entirely
    # rather than running every active pattern over binary data.
    return nil unless mostly_printable?(decoded)

    decoded
  rescue ArgumentError
    nil
  end

  PRINTABLE_RATIO_THRESHOLD = 0.85
  private_constant :PRINTABLE_RATIO_THRESHOLD

  # @return [Boolean] true when at least {PRINTABLE_RATIO_THRESHOLD} of the
  #   characters are printable ASCII, tab or newline. Empty input is false.
  def mostly_printable?(str)
    return false if str.empty?

    # ASCII printable (0x20-0x7e) + tab/newline; the threshold matches
    # base64 encodings of typical credential shapes (sk_live_..., AKIA...,
    # ghp_..., etc.) which are 100% printable.
    printable = str.each_char.count { |c| c.match?(/[\t\n\x20-\x7e]/) }
    printable.to_f / str.length >= PRINTABLE_RATIO_THRESHOLD
  end

  # Substring-redact against the credential index (when one is set) and
  # record the number of index markers in the result under {INDEX_HIT}.
  def redact_indexed_secrets(str, counts)
    return str unless @secret_index&.match?(str)

    redacted = @secret_index.redact(str)
    counts[INDEX_HIT] = (counts[INDEX_HIT] || 0) + redacted.scan(CredentialIndex::REDACTED).size
    redacted
  end
end
|
|
301
|
+
end
|
|
302
|
+
end
|