woods 1.1.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +186 -0
  3. data/README.md +20 -8
  4. data/exe/woods-console +51 -6
  5. data/exe/woods-console-mcp +24 -4
  6. data/exe/woods-mcp +30 -7
  7. data/exe/woods-mcp-http +47 -6
  8. data/lib/generators/woods/install_generator.rb +13 -4
  9. data/lib/generators/woods/templates/woods.rb.tt +155 -0
  10. data/lib/tasks/woods.rake +69 -50
  11. data/lib/woods/builder.rb +174 -9
  12. data/lib/woods/cache/cache_middleware.rb +360 -31
  13. data/lib/woods/chunking/semantic_chunker.rb +334 -7
  14. data/lib/woods/console/adapters/job_adapter.rb +10 -4
  15. data/lib/woods/console/audit_logger.rb +76 -4
  16. data/lib/woods/console/bridge.rb +48 -15
  17. data/lib/woods/console/bridge_protocol.rb +44 -0
  18. data/lib/woods/console/confirmation.rb +3 -4
  19. data/lib/woods/console/console_response_renderer.rb +56 -18
  20. data/lib/woods/console/credential_index.rb +201 -0
  21. data/lib/woods/console/credential_scanner.rb +302 -0
  22. data/lib/woods/console/dispatch_pipeline.rb +138 -0
  23. data/lib/woods/console/embedded_executor.rb +682 -35
  24. data/lib/woods/console/eval_guard.rb +319 -0
  25. data/lib/woods/console/model_validator.rb +1 -3
  26. data/lib/woods/console/rack_middleware.rb +185 -29
  27. data/lib/woods/console/redactor.rb +161 -0
  28. data/lib/woods/console/response_context.rb +127 -0
  29. data/lib/woods/console/safe_context.rb +220 -23
  30. data/lib/woods/console/scope_predicate_parser.rb +131 -0
  31. data/lib/woods/console/server.rb +417 -486
  32. data/lib/woods/console/sql_noise_stripper.rb +87 -0
  33. data/lib/woods/console/sql_table_scanner.rb +213 -0
  34. data/lib/woods/console/sql_validator.rb +81 -31
  35. data/lib/woods/console/table_gate.rb +93 -0
  36. data/lib/woods/console/tool_specs.rb +552 -0
  37. data/lib/woods/console/tools/tier1.rb +3 -3
  38. data/lib/woods/console/tools/tier4.rb +7 -1
  39. data/lib/woods/dependency_graph.rb +66 -7
  40. data/lib/woods/embedding/indexer.rb +190 -6
  41. data/lib/woods/embedding/openai.rb +40 -4
  42. data/lib/woods/embedding/provider.rb +104 -8
  43. data/lib/woods/embedding/text_preparer.rb +23 -3
  44. data/lib/woods/embedding/token_counter.rb +133 -0
  45. data/lib/woods/evaluation/baseline_runner.rb +20 -2
  46. data/lib/woods/evaluation/metrics.rb +4 -1
  47. data/lib/woods/extracted_unit.rb +1 -0
  48. data/lib/woods/extractor.rb +7 -1
  49. data/lib/woods/extractors/controller_extractor.rb +6 -0
  50. data/lib/woods/extractors/mailer_extractor.rb +16 -2
  51. data/lib/woods/extractors/model_extractor.rb +6 -1
  52. data/lib/woods/extractors/phlex_extractor.rb +13 -4
  53. data/lib/woods/extractors/rails_source_extractor.rb +2 -0
  54. data/lib/woods/extractors/route_helper_resolver.rb +130 -0
  55. data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
  56. data/lib/woods/extractors/view_component_extractor.rb +12 -1
  57. data/lib/woods/extractors/view_engines/base.rb +141 -0
  58. data/lib/woods/extractors/view_engines/erb.rb +145 -0
  59. data/lib/woods/extractors/view_template_extractor.rb +92 -133
  60. data/lib/woods/flow_assembler.rb +23 -15
  61. data/lib/woods/flow_precomputer.rb +21 -2
  62. data/lib/woods/graph_analyzer.rb +210 -0
  63. data/lib/woods/index_artifact.rb +173 -0
  64. data/lib/woods/mcp/bearer_auth.rb +45 -0
  65. data/lib/woods/mcp/bootstrap_state.rb +94 -0
  66. data/lib/woods/mcp/bootstrapper.rb +337 -16
  67. data/lib/woods/mcp/config_resolver.rb +288 -0
  68. data/lib/woods/mcp/errors.rb +134 -0
  69. data/lib/woods/mcp/index_reader.rb +265 -30
  70. data/lib/woods/mcp/origin_guard.rb +132 -0
  71. data/lib/woods/mcp/provider_probe.rb +166 -0
  72. data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
  73. data/lib/woods/mcp/renderers/markdown_renderer.rb +100 -3
  74. data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
  75. data/lib/woods/mcp/server.rb +771 -137
  76. data/lib/woods/model_name_cache.rb +78 -2
  77. data/lib/woods/notion/client.rb +25 -2
  78. data/lib/woods/notion/mappers/model_mapper.rb +36 -2
  79. data/lib/woods/railtie.rb +55 -15
  80. data/lib/woods/resilience/circuit_breaker.rb +9 -2
  81. data/lib/woods/resilience/retryable_provider.rb +40 -3
  82. data/lib/woods/resolved_config.rb +299 -0
  83. data/lib/woods/retrieval/context_assembler.rb +112 -5
  84. data/lib/woods/retrieval/query_classifier.rb +1 -1
  85. data/lib/woods/retrieval/ranker.rb +55 -6
  86. data/lib/woods/retrieval/search_executor.rb +42 -13
  87. data/lib/woods/retriever.rb +330 -24
  88. data/lib/woods/session_tracer/middleware.rb +35 -1
  89. data/lib/woods/storage/graph_store.rb +39 -0
  90. data/lib/woods/storage/inapplicable_backend.rb +14 -0
  91. data/lib/woods/storage/metadata_store.rb +129 -1
  92. data/lib/woods/storage/pgvector.rb +70 -8
  93. data/lib/woods/storage/qdrant.rb +196 -5
  94. data/lib/woods/storage/snapshotter/metadata.rb +172 -0
  95. data/lib/woods/storage/snapshotter/vector.rb +238 -0
  96. data/lib/woods/storage/snapshotter.rb +24 -0
  97. data/lib/woods/storage/vector_store.rb +184 -35
  98. data/lib/woods/tasks.rb +85 -0
  99. data/lib/woods/temporal/snapshot_store.rb +49 -1
  100. data/lib/woods/token_utils.rb +44 -5
  101. data/lib/woods/unblocked/client.rb +163 -0
  102. data/lib/woods/unblocked/document_builder.rb +326 -0
  103. data/lib/woods/unblocked/exporter.rb +201 -0
  104. data/lib/woods/unblocked/rate_limiter.rb +94 -0
  105. data/lib/woods/util/host_guard.rb +61 -0
  106. data/lib/woods/version.rb +1 -1
  107. data/lib/woods.rb +130 -6
  108. metadata +73 -4
@@ -52,9 +52,8 @@ module Woods
52
52
  # @param tool [String] Tool name
53
53
  # @param description [String] Human-readable description of the action
54
54
  # @param params [Hash] Tool parameters
55
- # @return [true] if confirmed
56
55
  # @raise [ConfirmationDeniedError] if denied
57
- def request_confirmation(tool:, description:, params:) # rubocop:disable Naming/PredicateMethod
56
+ def request_confirmation(tool:, description:, params:)
58
57
  approved = evaluate(tool: tool, description: description, params: params)
59
58
 
60
59
  @history << {
@@ -65,9 +64,9 @@ module Woods
65
64
  timestamp: Time.now.utc.iso8601
66
65
  }
67
66
 
68
- raise ConfirmationDeniedError, "Confirmation denied for #{tool}: #{description}" unless approved
67
+ return if approved
69
68
 
70
- true
69
+ raise ConfirmationDeniedError, "Confirmation denied for #{tool}: #{description}"
71
70
  end
72
71
 
73
72
  private
@@ -9,11 +9,19 @@ module Woods
9
9
  #
10
10
  # Auto-detects:
11
11
  # - Array<Hash> → Markdown tables
12
- # - Single Hash → Key-value bullet lists
12
+ # - Single Hash → Key-value bullet lists (Array values recurse into tables;
13
+ # `rows` / `values` paired with a sibling `columns` render as positional
14
+ # Markdown tables so sql/query/pluck output isn't collapsed)
13
15
  # - Simple Array → Bullet list
14
16
  # - Scalars → Plain text
15
17
  #
16
18
  class ConsoleResponseRenderer < MCP::ToolResponseRenderer
19
+ # Keys that carry positional row data in tool responses. When any of
20
+ # these appears alongside a `columns` array we render a proper table
21
+ # using the columns as headers instead of dumping bracketed arrays.
22
+ POSITIONAL_ROW_KEYS = %w[rows values].freeze
23
+ private_constant :POSITIONAL_ROW_KEYS
24
+
17
25
  # Smart default: auto-detect data shape and render accordingly.
18
26
  #
19
27
  # @param data [Object] The bridge response result
@@ -43,26 +51,56 @@ module Woods
43
51
 
44
52
  def render_table(rows)
45
53
  keys = rows.first.keys
46
- lines = []
47
- lines << "| #{keys.join(' | ')} |"
48
- lines << "| #{keys.map { '---' }.join(' | ')} |"
49
- rows.each do |row|
50
- lines << "| #{keys.map { |k| row[k] }.join(' | ')} |"
51
- end
52
- lines.join("\n")
54
+ header_row(keys) + rows.map { |row| row_line(keys.map { |k| row[k] }) }.join
53
55
  end
54
56
 
55
57
  def render_hash(data)
56
- data.map do |key, value|
57
- case value
58
- when Hash
59
- "**#{key}:**\n" + value.map { |k, v| " - #{k}: #{v}" }.join("\n")
60
- when Array
61
- "**#{key}:** #{value.size} items"
62
- else
63
- "**#{key}:** #{value}"
64
- end
65
- end.join("\n")
58
+ columns = stringify_array(data['columns'] || data[:columns])
59
+ data.map { |key, value| render_hash_entry(key, value, columns) }.join("\n")
60
+ end
61
+
62
# Render a single key/value pair of a Hash response as Markdown.
#
# Hash values become an indented bullet list under a bold label, Array
# values are delegated to {#render_hash_array_value}, and anything else
# is interpolated inline after the label.
#
# @param key [Object] entry key, interpolated into the bold label
# @param value [Object] entry value
# @param sibling_columns [Array<String>, nil] stringified `columns` entry
#   of the enclosing hash, forwarded for Array values
# @return [String]
def render_hash_entry(key, value, sibling_columns)
  label = "**#{key}:**"
  if value.is_a?(Hash)
    bullets = value.map { |k, v| " - #{k}: #{v}" }
    "#{label}\n" + bullets.join("\n")
  elsif value.is_a?(Array)
    "#{label}\n" + render_hash_array_value(key, value, sibling_columns)
  else
    "#{label} #{value}"
  end
end
72
+
73
# Pick the renderer for an Array value nested inside a Hash response.
#
# A positional table is used only when the key is one of
# POSITIONAL_ROW_KEYS and a non-empty sibling `columns` list was given;
# every other array goes through the generic array renderer.
#
# @param key [Object] key the array was stored under
# @param value [Array] the array to render
# @param sibling_columns [Array<String>, nil] header names, if present
# @return [String]
def render_hash_array_value(key, value, sibling_columns)
  tabular = POSITIONAL_ROW_KEYS.include?(key.to_s) && sibling_columns && !sibling_columns.empty?
  tabular ? render_positional_table(value, sibling_columns) : render_array(value)
end
80
+
81
# Render positional row data as a Markdown table headed by `columns`.
#
# Rows may be arrays (one cell per element) or bare scalars (treated as
# a one-cell row). An empty row list renders the `_(empty)_` placeholder
# instead of a header-only table.
#
# @param rows [Array<Array, Object>] positional rows
# @param columns [Array<String>] header labels
# @return [String]
def render_positional_table(rows, columns)
  return '_(empty)_' if rows.empty?

  rows.reduce(header_row(columns)) do |table, row|
    table + row_line(positional_row_values(row))
  end
end
89
+
90
# Normalize one positional row to an array of cell values.
#
# @param row [Array, Object] an array row, or a bare scalar
# @return [Array] the row itself when it is an Array, otherwise a
#   one-element array wrapping it
def positional_row_values(row)
  return row if row.is_a?(Array)

  [row]
end
93
+
94
# Build the two header lines of a Markdown table: the column titles and
# the `---` divider, each terminated by a newline.
#
# @param keys [Array<#to_s>] column titles in display order
# @return [String]
def header_row(keys)
  titles = keys.join(' | ')
  divider = Array.new(keys.size, '---').join(' | ')
  "| #{titles} |\n| #{divider} |\n"
end
97
+
98
# Format one Markdown table row from an ordered list of cell values.
#
# Every cell is stringified on its own before joining. `Array#join`
# recurses into nested arrays, so joining the raw values would explode an
# Array-valued cell into several cells and shift each column after it.
# Pipes are escaped and line breaks are collapsed to a single space
# because either one would otherwise end the cell or the row early.
#
# @param values [Array<Object>] cell values in column order
# @return [String] the row, terminated by a newline
def row_line(values)
  cells = values.map do |value|
    # Block form keeps the replacement literal (no backslash interpretation).
    value.to_s.gsub('|') { '\\|' }.gsub(/\r\n|\r|\n/, ' ')
  end
  "| #{cells.join(' | ')} |\n"
end
101
+
102
# Coerce an Array's elements to Strings.
#
# @param value [Object] candidate array
# @return [Array<String>, nil] stringified elements, or nil when `value`
#   is not an Array
def stringify_array(value)
  return nil unless value.is_a?(Array)

  value.map(&:to_s)
end
67
105
  end
68
106
 
@@ -0,0 +1,201 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'set'
4
+
5
+ # @see Woods
6
+ module Woods
7
+ class Error < StandardError; end unless defined?(Woods::Error)
8
+
9
+ module Console
10
+ # Boot-time index of every string leaf in `Rails.application.credentials`.
11
+ #
12
+ # The pattern-based {CredentialScanner} catches *known credential shapes*
13
+ # (Stripe `sk_live_…`, AWS `AKIA…`, etc.). It cannot catch a value whose
14
+ # shape is unknown — a hand-rolled HMAC secret, a Twilio auth token, a
15
+ # third-party webhook signing key. This index closes that gap by
16
+ # remembering the host app's *actual* credential values and substring-
17
+ # matching them in every Console MCP response, so a row whose value
18
+ # exactly matches a stored credential is redacted regardless of column
19
+ # name or value shape.
20
+ #
21
+ # The index is built once at server boot — walking encrypted credentials
22
+ # on every request would be both expensive and unsafe (it requires the
23
+ # master key). Each string leaf with length ≥ {MIN_LENGTH} is added to a
24
+ # frozen Set, and a pre-compiled `Regexp.union` is held for one-pass
25
+ # `gsub` substitution.
26
+ #
27
+ # ### Multi-DB / sharded caveat
28
+ # The index reflects credentials available to the *Rails process* that
29
+ # boots the Console MCP server. A separate database that holds its own
30
+ # secrets (e.g., a vendored CMS app sharing the same Rails host) is not
31
+ # in scope. Use Layer 3 (`console_redacted_columns` /
32
+ # `console_redacted_key_values`) for those.
33
+ #
34
+ # ### Missing master key
35
+ # In environments without `config/master.key` (CI, fresh checkouts) the
36
+ # `Rails.application.credentials.config` call raises
37
+ # `ActiveSupport::EncryptedConfiguration::MissingKeyError` or
38
+ # `ActiveSupport::MessageEncryptor::InvalidMessage`. {.build} catches
39
+ # both *by name* (no constant reference, so the class load order is
40
+ # irrelevant) and returns an empty index — the server still boots, the
41
+ # other defense layers still apply.
42
+ #
43
+ # @example
44
+ # index = CredentialIndex.build(rails_app: Rails.application)
45
+ # index.match?("sk_live_actual_secret_value") # => true
46
+ # index.redact("token: sk_live_actual_secret_value")
47
+ # # => "token: [REDACTED:credential]"
48
+ #
49
class CredentialIndex
  # Captured when this file is loaded; used as the reference point for the
  # credentials-file mtime check. Frozen to prevent accidental mutation.
  PROCESS_START = Time.now.freeze

  # Strings shorter than this are never indexed. Short values (environment
  # names, hostnames, version strings) would cause too many false hits.
  MIN_LENGTH = 12

  # Marker substituted for every indexed secret. Distinct from the pattern
  # scanner's `[REDACTED]` so readers can tell which layer fired.
  REDACTED = '[REDACTED:credential]'

  # Exception class names treated as "credentials cannot be decrypted".
  # Compared by `Class#name`, so none of these constants has to be loaded.
  MISSING_KEY_ERRORS = %w[
    ActiveSupport::EncryptedConfiguration::MissingKeyError
    ActiveSupport::EncryptedFile::MissingKeyError
    ActiveSupport::MessageEncryptor::InvalidMessage
  ].freeze

  class << self
    # Build an index from an application's credentials.
    #
    # The index is a snapshot: secrets added to the credentials store after
    # this call are not picked up until a new index is built.
    #
    # @param rails_app [#credentials] object whose `credentials.config`
    #   returns a nested Hash/Array/String structure
    # @return [CredentialIndex] populated index, or an empty index when the
    #   raised error's class name is listed in MISSING_KEY_ERRORS
    # @raise [StandardError] any other error from reading the credentials
    def build(rails_app:)
      new(secrets: collect_secrets(rails_app))
    rescue StandardError => e
      raise unless missing_key_error?(e)

      new(secrets: [])
    end

    # Warn once per credentials file whose mtime is newer than
    # `process_start`, i.e. the file changed after the reference time.
    # Paths that do not exist are skipped silently.
    #
    # @param credentials_files [Array<String>] paths to check
    # @param process_start [Time] reference time to compare mtimes against
    # @param logger [#warn] receives `warn(event, **kwargs)`
    # @return [void]
    def warn_if_credentials_rotated(credentials_files:, process_start:, logger:)
      credentials_files.each do |path|
        next unless File.exist?(path)

        mtime = File.mtime(path)
        next unless mtime > process_start

        logger.warn(
          'console.credential_index.stale',
          credentials_file: path,
          file_mtime: mtime.iso8601,
          process_start: process_start.iso8601,
          hint: 'Credentials file was modified after process start — ' \
                'restart the MCP process or call ' \
                'Woods::Console::Server.rebuild_credential_index to ' \
                'pick up rotated secrets.'
        )
      end
    end

    private

    # Gather every String leaf of the credentials tree that is long enough.
    def collect_secrets(rails_app)
      [].tap { |collected| walk(rails_app.credentials.config, collected) }
    end

    # Depth-first walk over Hash values and Array elements. Hash keys and
    # non-String leaves are ignored.
    def walk(node, collected)
      case node
      when Hash then node.each_value { |child| walk(child, collected) }
      when Array then node.each { |child| walk(child, collected) }
      when String
        collected << node if node.length >= MIN_LENGTH
      end
    end

    def missing_key_error?(error)
      MISSING_KEY_ERRORS.include?(error.class.name)
    end
  end

  # @return [Set<String>] frozen set of secret strings (length >= MIN_LENGTH).
  attr_reader :secrets

  # @return [Regexp, nil] precompiled union of every secret, or nil when
  #   the index is empty.
  attr_reader :pattern

  # @param secrets [Enumerable<String>] candidate secret strings.
  #   Duplicates are collapsed; non-Strings and strings shorter than
  #   {MIN_LENGTH} are dropped.
  def initialize(secrets:)
    usable = Array(secrets).select { |s| s.is_a?(String) && s.length >= MIN_LENGTH }
    @secrets = usable.to_set.freeze
    @pattern = compile_pattern(@secrets)
  end

  # @return [Boolean] true when the index holds no secrets.
  def empty?
    @secrets.empty?
  end

  # @param str [String]
  # @return [Boolean] true when any indexed secret appears as a substring.
  #   Always false for non-String input or an empty index.
  def match?(str)
    return false if empty? || !str.is_a?(String)

    @pattern.match?(str)
  end

  # Replace every indexed-secret substring in `str` with {REDACTED}.
  #
  # @param str [String]
  # @return [String] redacted copy; the input itself is returned when the
  #   index is empty, `str` is not a String, or no secret appears.
  def redact(str)
    return str if empty? || !str.is_a?(String) || !@pattern.match?(str)

    str.gsub(@pattern, REDACTED)
  end

  private

  # Compile the alternation longest-first. Regexp alternation takes the
  # first alternative that matches at a position, so if a shorter secret
  # is a prefix of a longer one and is listed first, only the prefix is
  # replaced and the remainder of the longer secret stays visible.
  def compile_pattern(secrets)
    return nil if secrets.empty?

    Regexp.union(secrets.sort_by { |secret| -secret.length })
  end
end
200
+ end
201
+ end
@@ -0,0 +1,302 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'base64'
4
+ require 'cgi'
5
+ require 'set'
6
+
7
+ require_relative 'credential_index'
8
+
9
+ # @see Woods
10
+ module Woods
11
+ class Error < StandardError; end unless defined?(Woods::Error)
12
+
13
+ module Console
14
+ # Content-shape credential scanner for Console MCP responses.
15
+ #
16
+ # Walks a serialized response tree (strings, nested Hash, nested Array)
17
+ # and replaces substrings that match known credential formats with
18
+ # `[REDACTED]`. Pattern matching is high-specificity (word-boundary
19
+ # anchored, minimum-length bounded) so false positives against UUIDs,
20
+ # email addresses, and short identifiers stay rare.
21
+ #
22
+ # This is Layer 2 of the defense-in-depth stack — it runs AFTER the
23
+ # operator-configured column and EAV redaction layers so it catches
24
+ # credentials those layers missed (newly-added EAV keys, secrets stored
25
+ # in JSONB columns, associated records pulled via nested serialization).
26
+ #
27
+ # @example
28
+ # scanner = CredentialScanner.new
29
+ # value, counts = scanner.scan('token is sk_test_4eC39HqLyjWDarjtT1zdp7dc')
30
+ # value # => "token is [REDACTED]"
31
+ # counts # => { stripe_secret_key: 1 }
32
+ #
33
class CredentialScanner # rubocop:disable Metrics/ClassLength
  # Marker substituted for every pattern-matched credential.
  REDACTED = '[REDACTED]'

  # Counter key used for hits from the secret index, kept distinct from
  # the pattern names so the two layers can be told apart.
  INDEX_HIT = :credential_index

  # Credential shapes, keyed by counter name.
  #
  # Order matters: patterns are applied in insertion order, so a more
  # specific shape must come before a broader one that would also match
  # it (e.g. `anthropic_api_key` before `openai_api_key`).
  PATTERNS = {
    stripe_secret_key: /\b(?:sk|rk)_(?:live|test)_[A-Za-z0-9]{24,}\b/,
    stripe_publishable_key: /\bpk_(?:live|test)_[A-Za-z0-9]{24,}\b/,
    stripe_webhook_secret: /\bwhsec_[A-Za-z0-9]{24,}\b/,
    # Stripe Connect account IDs are not secret but identify the connected
    # merchant, so they are redacted as well.
    stripe_connect_account_id: /\bacct_[A-Za-z0-9]{16,}\b/,
    # Klaviyo private keys use a bare `pk_` prefix with no live/test infix.
    # Listed after stripe_publishable_key so Stripe values count as Stripe.
    klaviyo_private_key: /\bpk_[A-Za-z0-9]{34}\b/,
    aws_access_key_id: /\b(?:AKIA|ASIA)[0-9A-Z]{16}\b/,
    github_fine_grained_pat: /\bgithub_pat_[A-Za-z0-9_]{82}\b/,
    github_token: /\bgh[pousr]_[A-Za-z0-9]{36,}\b/,
    google_oauth_token: /\bya29\.[A-Za-z0-9_-]{20,}\b/,
    jwt_token: /\beyJ[A-Za-z0-9_-]+\.eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\b/,
    pem_private_key_block: /-----BEGIN [A-Z ]*PRIVATE KEY-----[\s\S]*?-----END [A-Z ]*PRIVATE KEY-----/,
    slack_token: /\bxox[abpr]-[A-Za-z0-9-]{10,}\b/,
    sendgrid_api_key: /\bSG\.[A-Za-z0-9_-]{22}\.[A-Za-z0-9_-]{43}\b/,
    mailgun_api_key: /\bkey-[a-f0-9]{32}\b/,
    # Covers `sk-ant-api03-…` / `sk-ant-admin01-…` and the form without the
    # `api|admin` infix. The length floor keeps a bare `sk-ant-` prefix in
    # logs or docs from matching.
    anthropic_api_key: /\bsk-ant-(?:(?:api|admin)\d{2}-)?[A-Za-z0-9_-]{80,}\b/,
    openai_api_key: %r{\bsk-(?:proj-)?[A-Za-z0-9/_-]{40,}\b},
    # Prefix list includes refresh tokens (`shprt_`) and user-access
    # tokens (`shpua_`).
    shopify_access_token: /\bshp(?:at|ca|ss|pa|rt|ua)_[a-f0-9]{32}\b/,
    square_access_token: /\bsq0[a-z]{3}-[A-Za-z0-9_-]{22,}\b/,
    paypal_access_token: /\baccess_token\$(?:production|sandbox)\$[A-Za-z0-9]+\$[a-f0-9]+\b/,
    # `00D` + 12 characters (a 15-character org id), `!`, then the payload.
    salesforce_access_token: /\b00D[A-Za-z0-9]{12}![A-Za-z0-9._]{80,250}\b/,
    launchdarkly_sdk_key: /\bsdk-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b/,
    launchdarkly_mobile_key: /\bmob-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b/,
    hubspot_private_app_token: Regexp.new(
      '\bpat-(?:na1|na2|eu1|eu2|ap1)-' \
      '[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b'
    ),
    brevo_api_key: /\bxkeysib-[a-f0-9]{64}-[A-Za-z0-9]{16}\b/,
    brevo_smtp_key: /\bxsmtpsib-[a-f0-9]{64}-[A-Za-z0-9]{16}\b/,
    kit_api_key: /\bkit_[A-Za-z0-9]{20,}\b/,
    twilio_account_sid: /\bAC[0-9a-fA-F]{32}\b/,
    twilio_api_key_sid: /\bSK[0-9a-fA-F]{32}\b/,
    twilio_verify_service_sid: /\bVA[0-9a-fA-F]{32}\b/,
    # Connection URLs with an embedded `user:password@`. The whole URL is
    # redacted rather than just the password field.
    database_url_with_password: Regexp.new(
      '\b(?:postgres|postgresql|mysql|mysql2|mongodb|mongodb\+srv|amqp|amqps|redis|rediss|' \
      'clickhouse|cockroachdb|mariadb)://[^\s:@/]+:[^\s@/]+@\S+'
    )
  }.freeze

  # Substrings that may be URL-encoded: 24+ characters drawn from the URL
  # character set. Only candidates that actually contain `%` are decoded.
  URL_ENCODED_CANDIDATE = %r{[A-Za-z0-9%_.~+/=-]{24,}}
  # Substrings that may be base64: 40+ alphabet characters plus optional
  # padding. A negative look-ahead is used instead of a trailing `\b`
  # because `\b` does not hold at end-of-string after a `=` padding char.
  BASE64_CANDIDATE = %r{\b[A-Za-z0-9+/]{40,}={0,2}(?![A-Za-z0-9+/])}
  # Minimum share of printable characters for a base64-decoded payload to
  # be worth running through the patterns.
  PRINTABLE_RATIO_THRESHOLD = 0.85
  private_constant :URL_ENCODED_CANDIDATE, :BASE64_CANDIDATE, :PRINTABLE_RATIO_THRESHOLD

  # @return [Array<Symbol>] every pattern name the scanner knows about.
  def self.patterns
    PATTERNS.keys
  end

  # @param disabled_patterns [Array<Symbol, String>] pattern names to skip.
  #   Strings are coerced to Symbols.
  # @param secret_index [#match?, #redact, nil] optional index of literal
  #   secret values. When present, every string is run through it before
  #   the pattern pass. `nil`, or an index whose `empty?` is true, disables
  #   this layer.
  def initialize(disabled_patterns: [], secret_index: nil)
    disabled = Array(disabled_patterns).to_set(&:to_sym)
    @active_patterns = PATTERNS.except(*disabled)
    @secret_index = normalize_index(secret_index)
  end

  # Swap in a different secret index for subsequent scans.
  #
  # The replacement is a single instance-variable assignment of an
  # already-built object, so a scan uses either the old or the new index.
  #
  # @param new_index [#match?, #redact, nil] the replacement index; `nil`
  #   or an empty index disables the substring layer
  # @return [void]
  def replace_index!(new_index)
    @secret_index = normalize_index(new_index)
  end

  # Scan a value for credentials and return a redacted copy.
  #
  # Strings are checked against the secret index (if any), then against
  # URL-encoded and base64-encoded forms, then against every active
  # pattern. Hashes and Arrays are walked recursively. String and Symbol
  # hash keys are scanned too; if two keys redact to the same marker the
  # later entry overwrites the earlier one. All other objects (Integer,
  # Float, true/false, nil, …) pass through untouched.
  #
  # @param value [Object]
  # @return [Array(Object, Hash{Symbol=>Integer})] the scanned value and a
  #   per-layer match count. Only layers that fired have an entry, so
  #   callers should treat a missing key as zero.
  def scan(value)
    counts = {}
    [walk(value, counts), counts]
  end

  private

  # An index that reports itself empty is stored as nil so the substring
  # layer is skipped entirely.
  def normalize_index(index)
    index.respond_to?(:empty?) && index.empty? ? nil : index
  end

  def walk(value, counts)
    case value
    when String then scan_string(value, counts)
    when Hash then walk_hash(value, counts)
    when Array then value.map { |item| walk(item, counts) }
    else value
    end
  end

  # Rebuild a Hash with scanned keys and values. Symbol keys are scanned
  # as Strings and converted back to Symbols; keys of any other
  # non-String type are kept as they are.
  def walk_hash(hash, counts)
    hash.each_with_object({}) do |(key, val), out|
      scanned_key = case key
                    when String then scan_string(key, counts)
                    when Symbol then scan_string(key.to_s, counts).to_sym
                    else key
                    end
      out[scanned_key] = walk(val, counts)
    end
  end

  # Run one string through all three layers in order: secret index,
  # encoded forms, literal patterns.
  def scan_string(str, counts)
    text = redact_indexed_secrets(str, counts)
    text = scan_encoded_forms(text, counts)
    @active_patterns.reduce(text) do |acc, (name, pattern)|
      acc.gsub(pattern) do
        counts[name] = (counts[name] || 0) + 1
        REDACTED
      end
    end
  end

  # Best-effort pass for credentials hidden behind URL-encoding or base64.
  # A candidate substring is decoded and the decoded text is tested
  # against the active patterns; on a hit the encoded substring itself is
  # replaced. The URL pass is skipped when the string has no `%`.
  def scan_encoded_forms(str, counts)
    str = scan_url_encoded(str, counts) if str.include?('%')
    scan_base64(str, counts)
  end

  def scan_url_encoded(str, counts)
    str.gsub(URL_ENCODED_CANDIDATE) do |candidate|
      next candidate unless candidate.include?('%')

      redact_decoded_match(candidate, safely_unescape(candidate), counts)
    end
  end

  def scan_base64(str, counts)
    str.gsub(BASE64_CANDIDATE) do |candidate|
      decoded = safely_base64_decode(candidate)
      decoded ? redact_decoded_match(candidate, decoded, counts) : candidate
    end
  end

  # Return REDACTED (and bump the matching counter) when `decoded` matches
  # an active pattern; otherwise return the encoded candidate unchanged.
  def redact_decoded_match(candidate, decoded, counts)
    name = first_matching_pattern(decoded)
    return candidate unless name

    counts[name] = (counts[name] || 0) + 1
    REDACTED
  end

  # @return [Symbol, nil] name of the first active pattern matching `value`.
  def first_matching_pattern(value)
    match = @active_patterns.find { |_name, pattern| pattern.match?(value) }
    match&.first
  end

  # Percent-decode `value`. Escapes such as `%FF` can decode to bytes that
  # are not valid in the string's encoding, and matching a pattern
  # against such a string raises ArgumentError, so invalid bytes are
  # scrubbed before the result is handed to the patterns.
  def safely_unescape(value)
    decoded = CGI.unescape(value)
    decoded.valid_encoding? ? decoded : decoded.scrub
  rescue ArgumentError
    value
  end

  # Strictly base64-decode `value`.
  #
  # @return [String, nil] decoded UTF-8 text, or nil when the input is not
  #   valid base64, the bytes are not valid UTF-8, or the payload is mostly
  #   non-printable (which skips the pattern scan for binary data)
  def safely_base64_decode(value)
    decoded = Base64.strict_decode64(value)
    decoded.force_encoding(Encoding::UTF_8)
    return nil unless decoded.valid_encoding?
    return nil unless mostly_printable?(decoded)

    decoded
  rescue ArgumentError
    nil
  end

  # True when at least PRINTABLE_RATIO_THRESHOLD of the characters are
  # printable ASCII (0x20-0x7e), tab, or newline.
  def mostly_printable?(str)
    return false if str.empty?

    printable = str.each_char.count { |c| c.match?(/[\t\n\x20-\x7e]/) }
    printable.to_f / str.length >= PRINTABLE_RATIO_THRESHOLD
  end

  # Apply the secret index, if one is configured and it matches.
  #
  # The hit count is the number of markers the index *added*. Markers that
  # were already in the input (for example output that was redacted
  # earlier) are not counted again. At least one hit is recorded because
  # `match?` already confirmed a secret was present.
  def redact_indexed_secrets(str, counts)
    return str unless @secret_index&.match?(str)

    marker = CredentialIndex::REDACTED
    redacted = @secret_index.redact(str)
    added = redacted.scan(marker).size - str.scan(marker).size
    counts[INDEX_HIT] = (counts[INDEX_HIT] || 0) + [added, 1].max
    redacted
  end
end
301
+ end
302
+ end