woods 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +169 -0
  3. data/README.md +20 -8
  4. data/exe/woods-console +51 -6
  5. data/exe/woods-console-mcp +24 -4
  6. data/exe/woods-mcp +30 -7
  7. data/exe/woods-mcp-http +47 -6
  8. data/lib/generators/woods/install_generator.rb +13 -4
  9. data/lib/generators/woods/templates/woods.rb.tt +155 -0
  10. data/lib/tasks/woods.rake +15 -50
  11. data/lib/woods/builder.rb +174 -9
  12. data/lib/woods/cache/cache_middleware.rb +360 -31
  13. data/lib/woods/chunking/semantic_chunker.rb +334 -7
  14. data/lib/woods/console/adapters/job_adapter.rb +10 -4
  15. data/lib/woods/console/audit_logger.rb +76 -4
  16. data/lib/woods/console/bridge.rb +48 -15
  17. data/lib/woods/console/bridge_protocol.rb +44 -0
  18. data/lib/woods/console/confirmation.rb +3 -4
  19. data/lib/woods/console/console_response_renderer.rb +56 -18
  20. data/lib/woods/console/credential_index.rb +201 -0
  21. data/lib/woods/console/credential_scanner.rb +302 -0
  22. data/lib/woods/console/dispatch_pipeline.rb +138 -0
  23. data/lib/woods/console/embedded_executor.rb +682 -35
  24. data/lib/woods/console/eval_guard.rb +319 -0
  25. data/lib/woods/console/model_validator.rb +1 -3
  26. data/lib/woods/console/rack_middleware.rb +185 -29
  27. data/lib/woods/console/redactor.rb +161 -0
  28. data/lib/woods/console/response_context.rb +127 -0
  29. data/lib/woods/console/safe_context.rb +220 -23
  30. data/lib/woods/console/scope_predicate_parser.rb +131 -0
  31. data/lib/woods/console/server.rb +417 -486
  32. data/lib/woods/console/sql_noise_stripper.rb +87 -0
  33. data/lib/woods/console/sql_table_scanner.rb +213 -0
  34. data/lib/woods/console/sql_validator.rb +81 -31
  35. data/lib/woods/console/table_gate.rb +93 -0
  36. data/lib/woods/console/tool_specs.rb +552 -0
  37. data/lib/woods/console/tools/tier1.rb +3 -3
  38. data/lib/woods/console/tools/tier4.rb +7 -1
  39. data/lib/woods/dependency_graph.rb +66 -7
  40. data/lib/woods/embedding/indexer.rb +190 -6
  41. data/lib/woods/embedding/openai.rb +40 -4
  42. data/lib/woods/embedding/provider.rb +104 -8
  43. data/lib/woods/embedding/text_preparer.rb +23 -3
  44. data/lib/woods/embedding/token_counter.rb +133 -0
  45. data/lib/woods/evaluation/baseline_runner.rb +20 -2
  46. data/lib/woods/evaluation/metrics.rb +4 -1
  47. data/lib/woods/extracted_unit.rb +1 -0
  48. data/lib/woods/extractor.rb +7 -1
  49. data/lib/woods/extractors/controller_extractor.rb +6 -0
  50. data/lib/woods/extractors/mailer_extractor.rb +16 -2
  51. data/lib/woods/extractors/model_extractor.rb +6 -1
  52. data/lib/woods/extractors/phlex_extractor.rb +13 -4
  53. data/lib/woods/extractors/rails_source_extractor.rb +2 -0
  54. data/lib/woods/extractors/route_helper_resolver.rb +130 -0
  55. data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
  56. data/lib/woods/extractors/view_component_extractor.rb +12 -1
  57. data/lib/woods/extractors/view_engines/base.rb +141 -0
  58. data/lib/woods/extractors/view_engines/erb.rb +145 -0
  59. data/lib/woods/extractors/view_template_extractor.rb +92 -133
  60. data/lib/woods/flow_assembler.rb +23 -15
  61. data/lib/woods/flow_precomputer.rb +21 -2
  62. data/lib/woods/graph_analyzer.rb +3 -4
  63. data/lib/woods/index_artifact.rb +173 -0
  64. data/lib/woods/mcp/bearer_auth.rb +45 -0
  65. data/lib/woods/mcp/bootstrap_state.rb +94 -0
  66. data/lib/woods/mcp/bootstrapper.rb +337 -16
  67. data/lib/woods/mcp/config_resolver.rb +288 -0
  68. data/lib/woods/mcp/errors.rb +134 -0
  69. data/lib/woods/mcp/index_reader.rb +265 -30
  70. data/lib/woods/mcp/origin_guard.rb +132 -0
  71. data/lib/woods/mcp/provider_probe.rb +166 -0
  72. data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
  73. data/lib/woods/mcp/renderers/markdown_renderer.rb +39 -3
  74. data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
  75. data/lib/woods/mcp/server.rb +737 -137
  76. data/lib/woods/model_name_cache.rb +78 -2
  77. data/lib/woods/notion/client.rb +25 -2
  78. data/lib/woods/notion/mappers/model_mapper.rb +36 -2
  79. data/lib/woods/railtie.rb +55 -15
  80. data/lib/woods/resilience/circuit_breaker.rb +9 -2
  81. data/lib/woods/resilience/retryable_provider.rb +40 -3
  82. data/lib/woods/resolved_config.rb +299 -0
  83. data/lib/woods/retrieval/context_assembler.rb +112 -5
  84. data/lib/woods/retrieval/query_classifier.rb +1 -1
  85. data/lib/woods/retrieval/ranker.rb +55 -6
  86. data/lib/woods/retrieval/search_executor.rb +42 -13
  87. data/lib/woods/retriever.rb +330 -24
  88. data/lib/woods/session_tracer/middleware.rb +35 -1
  89. data/lib/woods/storage/graph_store.rb +39 -0
  90. data/lib/woods/storage/inapplicable_backend.rb +14 -0
  91. data/lib/woods/storage/metadata_store.rb +129 -1
  92. data/lib/woods/storage/pgvector.rb +70 -8
  93. data/lib/woods/storage/qdrant.rb +196 -5
  94. data/lib/woods/storage/snapshotter/metadata.rb +172 -0
  95. data/lib/woods/storage/snapshotter/vector.rb +238 -0
  96. data/lib/woods/storage/snapshotter.rb +24 -0
  97. data/lib/woods/storage/vector_store.rb +184 -35
  98. data/lib/woods/tasks.rb +85 -0
  99. data/lib/woods/temporal/snapshot_store.rb +49 -1
  100. data/lib/woods/token_utils.rb +44 -5
  101. data/lib/woods/unblocked/client.rb +1 -1
  102. data/lib/woods/unblocked/document_builder.rb +35 -10
  103. data/lib/woods/unblocked/exporter.rb +1 -1
  104. data/lib/woods/util/host_guard.rb +61 -0
  105. data/lib/woods/version.rb +1 -1
  106. data/lib/woods.rb +126 -6
  107. metadata +69 -4
@@ -7,12 +7,25 @@ module Woods
7
7
  # Avoids O(n*m) per-extractor iteration of ActiveRecord::Base.descendants.
8
8
  # Invalidated per extraction run (call .reset! before a new run).
9
9
  #
10
+ # Provides two resolution layers:
11
+ # 1. {.model_names_regex} — whole-word match against every fully-qualified
12
+ # model name. Catches `User`, `Library::Book`, and `"Library::Book"`
13
+ # (as a string literal) because `\b` treats `:` and `"` as boundaries.
14
+ # 2. {.resolve_short_name} — when source references the bare inner name
15
+ # (e.g. `Book.new` inside `module Library`), resolve it back to its
16
+ # fully-qualified owner when the short name is unambiguous. Needed
17
+ # because the cache holds `Library::Book` but the source writes
18
+ # `Book` after a `module Library` opens.
19
+ #
10
20
  # @example
11
21
  # Woods::ModelNameCache.model_names
12
- # # => ["User", "Order", "Product", ...]
22
+ # # => ["User", "Library::Book", ...]
13
23
  #
14
24
  # Woods::ModelNameCache.model_names_regex
15
- # # => /\b(?:User|Order|Product|...)\b/
25
+ # # => /\b(?:User|Library::Book|...)\b/
26
+ #
27
+ # Woods::ModelNameCache.resolve_short_name("Book")
28
+ # # => "Library::Book" (or nil when ambiguous)
16
29
  #
17
30
  module ModelNameCache
18
31
  class << self
@@ -26,10 +39,40 @@ module Woods
26
39
  @model_names_regex ||= build_regex
27
40
  end
28
41
 
42
+ # Short-name → fully-qualified owner mapping. Ambiguous short names
43
+ # (two different models sharing the same inner name) map to nil so
44
+ # callers can detect the collision and skip the edge rather than
45
+ # guess.
46
+ #
47
+ # @return [Hash{String => String, nil}]
48
+ def short_name_map
49
+ @short_name_map ||= build_short_name_map
50
+ end
51
+
52
+ # Resolve a bare short name (e.g. `Book`) to its fully-qualified
53
+ # owner (`Library::Book`) when unambiguous. Returns nil otherwise.
54
+ #
55
+ # @param short [String]
56
+ # @return [String, nil]
57
+ def resolve_short_name(short)
58
+ short_name_map[short.to_s]
59
+ end
60
+
61
+ # Regex matching bare short names of namespaced models. Used by the
62
+ # dependency scanner to surface references like `Book.new`
63
+ # inside the `Library` module, which the full-name regex misses.
64
+ #
65
+ # @return [Regexp]
66
+ def short_names_regex
67
+ @short_names_regex ||= build_short_names_regex
68
+ end
69
+
29
70
  # Clear cache (call at the start of each extraction run)
30
71
  def reset!
31
72
  @model_names = nil
32
73
  @model_names_regex = nil
74
+ @short_name_map = nil
75
+ @short_names_regex = nil
33
76
  end
34
77
 
35
78
  private
@@ -46,6 +89,39 @@ module Woods
46
89
 
47
90
  /\b(?:#{names.map { |n| Regexp.escape(n) }.join('|')})\b/
48
91
  end
92
+
93
+ # Build short-name → full-name mapping. A short name that appears on
94
+ # multiple fully-qualified models resolves to nil so ambiguity bubbles
95
+ # up (instead of silently picking one). Bare top-level names
96
+ # (no `::`) map to themselves.
97
+ def build_short_name_map
98
+ map = {}
99
+ model_names.each do |full|
100
+ short = full.split('::').last
101
+ map[short] = if map.key?(short) && map[short] != full
102
+ nil # mark ambiguous
103
+ else
104
+ full
105
+ end
106
+ end
107
+ map
108
+ end
109
+
110
+ def build_short_names_regex
111
+ unambiguous = short_name_map.select { |short, full| full && short != full }.keys
112
+ return /(?!)/ if unambiguous.empty?
113
+
114
+ # Match the short name only when:
115
+ # - NOT preceded by `::`, `.`, or another word char (avoids
116
+ # double-counting the full-name hit + rejects `RareBook`).
117
+ # - Followed by a recognisable constant-use context: method call
118
+ # (`.` / `(`), namespace (`::`), list boundary (`,` / `)` / `]`), assignment (`=`, not `==`),
119
+ # or end-of-line. This filters out mentions inside sentences
120
+ # (" ... update Book later") and inside string literals
121
+ # that lack a follow-up method call (`"Book"` alone).
122
+ names = unambiguous.map { |n| Regexp.escape(n) }.join('|')
123
+ /(?<![:.\w])(?:#{names})\b(?=\s*(?:\.|::|\(|,|\)|\]|=(?!=)|$))/
124
+ end
49
125
  end
50
126
  end
51
127
  end
@@ -146,6 +146,11 @@ module Woods
146
146
 
147
147
  # Execute HTTP with rate limiting and network error retry.
148
148
  #
149
+ # Any message from an underlying network error is run through
150
+ # {#redact_token} before being re-raised — a malformed reflected
151
+ # URL or request dump from the stdlib must not leak the bearer
152
+ # token into logs or backtraces.
153
+ #
149
154
  # @return [Net::HTTPResponse]
150
155
  # @raise [Woods::Error] on persistent network failures
151
156
  def execute_with_retry(method, path, body)
@@ -154,7 +159,10 @@ module Woods
154
159
  @rate_limiter.throttle { execute_http(method, path, body) }
155
160
  rescue Net::OpenTimeout, Net::ReadTimeout, Errno::ECONNRESET, Errno::ECONNREFUSED => e
156
161
  attempts += 1
157
- raise Woods::Error, "Network error after #{attempts} retries: #{e.message}" if attempts >= MAX_RETRIES
162
+ if attempts >= MAX_RETRIES
163
+ raise Woods::Error,
164
+ "Network error after #{attempts} retries: #{redact_token(e.message)}"
165
+ end
158
166
 
159
167
  sleep(2**attempts)
160
168
  retry
@@ -162,6 +170,9 @@ module Woods
162
170
  end
163
171
 
164
172
  # Raise a descriptive error from a non-success Notion response.
173
+ # The response body is scrubbed before being formatted into the
174
+ # exception — if the Notion API ever echoes back a header (or a
175
+ # proxy does), the bearer token must not surface here.
165
176
  #
166
177
  # @raise [Woods::Error]
167
178
  def raise_api_error(response)
@@ -171,7 +182,19 @@ module Woods
171
182
  { 'message' => "Unparseable response body: #{response.body&.slice(0, 200)}" }
172
183
  end
173
184
  message = parsed['message'] || 'Unknown error'
174
- raise Woods::Error, "Notion API error #{response.code}: #{message}"
185
+ raise Woods::Error,
186
+ "Notion API error #{response.code}: #{redact_token(message)}"
187
+ end
188
+
189
+ # Replace every occurrence of the bearer token with `[REDACTED]`.
190
+ # Defense in depth — no exception message emitted by this client
191
+ # should carry the secret even if a future code path embeds the
192
+ # request headers verbatim.
193
+ def redact_token(message)
194
+ return message if message.nil? || message.empty?
195
+ return message if @api_token.nil? || @api_token.empty?
196
+
197
+ message.to_s.gsub(@api_token, '[REDACTED]')
175
198
  end
176
199
 
177
200
  # Perform the raw HTTP request.
@@ -15,7 +15,7 @@ module Woods
15
15
  # properties = mapper.map(unit_data)
16
16
  # client.create_page(database_id: db_id, properties: properties)
17
17
  #
18
- class ModelMapper
18
+ class ModelMapper # rubocop:disable Metrics/ClassLength
19
19
  include Shared
20
20
 
21
21
  # Map a model unit to Notion Data Models page properties.
@@ -66,6 +66,16 @@ module Woods
66
66
  metadata['column_count'] || (metadata['columns'] || []).size
67
67
  end
68
68
 
69
+ # Extract the leading comment block from a model file, redacting
70
+ # any credential-shaped content before shipping it to Notion.
71
+ #
72
+ # Model header comments occasionally contain sample API keys,
73
+ # integration URLs with embedded passwords, or TODO references to
74
+ # internal secrets. Without redaction those land verbatim in a
75
+ # third-party SaaS database. This uses the same {CredentialScanner}
76
+ # that protects the Console MCP so Notion export inherits the same
77
+ # defenses.
78
+ #
69
79
  # @return [String]
70
80
  def extract_description(source_code)
71
81
  return '' unless source_code
@@ -80,7 +90,31 @@ module Woods
80
90
  end
81
91
  end
82
92
 
83
- comment_lines.any? ? comment_lines.join(' ').strip : ''
93
+ return '' if comment_lines.empty?
94
+
95
+ raw = comment_lines.join(' ').strip
96
+ redact_credentials(raw)
97
+ end
98
+
99
+ def redact_credentials(text)
100
+ return text if text.empty?
101
+
102
+ # CredentialScanner#scan returns `[redacted_value, match_counts]`.
103
+ # Unpack the tuple — returning the whole Array would serialize to
104
+ # Notion as a stringified `["text...", {}]` blob.
105
+ redacted, _counts = scanner.scan(text)
106
+ redacted
107
+ rescue StandardError
108
+ # Scanner construction or scan failure — fail closed: return an
109
+ # empty description rather than risk leaking anything.
110
+ ''
111
+ end
112
+
113
+ def scanner
114
+ @scanner ||= begin
115
+ require 'woods/console/credential_scanner'
116
+ Woods::Console::CredentialScanner.new
117
+ end
84
118
  end
85
119
 
86
120
  # @return [String]
data/lib/woods/railtie.rb CHANGED
@@ -11,28 +11,68 @@ module Woods
11
11
 
12
12
  initializer 'woods.session_tracer' do |app|
13
13
  config = Woods.configuration
14
- if config.session_tracer_enabled
15
- require 'woods/session_tracer/middleware'
16
-
17
- app.middleware.use(
18
- Woods::SessionTracer::Middleware,
19
- store: config.session_store,
20
- session_id_proc: config.session_id_proc,
21
- exclude_paths: config.session_exclude_paths
22
- )
14
+ next unless config.session_tracer_enabled
15
+
16
+ if defined?(Rails) && Rails.env.production? && !config.session_tracer_allow_production
17
+ msg = '[Woods] session tracer disabled in production; ' \
18
+ 'set `session_tracer_allow_production = true` to opt in.'
19
+ if defined?(Rails.logger) && Rails.logger
20
+ Rails.logger.warn(msg)
21
+ else
22
+ warn msg
23
+ end
24
+ next
23
25
  end
26
+
27
+ require 'woods/session_tracer/middleware'
28
+
29
+ app.middleware.use(
30
+ Woods::SessionTracer::Middleware,
31
+ store: config.session_store,
32
+ session_id_proc: config.session_id_proc,
33
+ exclude_paths: config.session_exclude_paths
34
+ )
24
35
  end
25
36
 
26
37
  initializer 'woods.console_mcp' do |app|
27
38
  config = Woods.configuration
28
- if config.console_mcp_enabled
29
- require 'woods/console/rack_middleware'
39
+ next unless config.console_mcp_enabled
40
+
41
+ require 'woods/console/rack_middleware'
42
+ require 'woods/mcp/bearer_auth'
43
+ require 'woods/mcp/origin_guard'
30
44
 
31
- app.middleware.use(
32
- Woods::Console::RackMiddleware,
33
- path: config.console_mcp_path
34
- )
45
+ token = config.console_mcp_token
46
+ production = defined?(Rails) && Rails.env.production?
47
+ token_missing = token.nil? || token.to_s.empty?
48
+
49
+ if token_missing
50
+ msg = '[Woods Console] console_mcp_token is not set — Console MCP is a high-privilege ' \
51
+ 'endpoint that runs SQL and model introspection against the live database. ' \
52
+ 'Set Woods.configuration.console_mcp_token (or WOODS_CONSOLE_MCP_TOKEN env var) ' \
53
+ 'to a 32+ character random string.'
54
+ raise Woods::ConfigurationError, msg if production
55
+
56
+ # Non-prod without a token: refuse to wire the middleware at all.
57
+ # Earlier iterations fell through and installed the RackMiddleware
58
+ # with ZERO auth/origin guard in front of it — a binding on 0.0.0.0
59
+ # (common in devcontainers/docker-compose) would expose an
60
+ # unauthenticated SQL-bearing endpoint to anything that can reach the host.
61
+ # Fail-closed: warn and skip.
62
+ warn "#{msg} Refusing to mount the Console MCP middleware until a token is configured."
63
+ next
35
64
  end
65
+
66
+ # Origin guard first — rejects cross-origin POSTs before any auth cost.
67
+ # BearerAuth next — requires `Authorization: Bearer <token>` on every request.
68
+ app.middleware.use(Woods::MCP::OriginGuard, allowed_origins: Array(config.console_mcp_allowed_origins))
69
+ app.middleware.use(Woods::MCP::BearerAuth, token: token)
70
+
71
+ app.middleware.use(
72
+ Woods::Console::RackMiddleware,
73
+ path: config.console_mcp_path,
74
+ embedded_read_tools: config.console_embedded_read_tools
75
+ )
36
76
  end
37
77
  end
38
78
  end
@@ -56,7 +56,7 @@ module Woods
56
56
  @mutex.synchronize do
57
57
  case @state
58
58
  when :open
59
- unless Time.now - @last_failure_time >= @reset_timeout
59
+ unless monotonic_now - @last_failure_time >= @reset_timeout
60
60
  raise CircuitOpenError, "Circuit breaker is open (#{@failure_count} failures)"
61
61
  end
62
62
 
@@ -81,10 +81,17 @@ module Woods
81
81
 
82
82
  private
83
83
 
84
+ # Monotonic clock reading — unaffected by wall-clock jumps (NTP steps, manual resets).
85
+ #
86
+ # @return [Float] seconds from an unspecified epoch.
87
+ def monotonic_now
88
+ Process.clock_gettime(Process::CLOCK_MONOTONIC)
89
+ end
90
+
84
91
  # Record a failure and potentially open the circuit.
85
92
  def record_failure
86
93
  @failure_count += 1
87
- @last_failure_time = Time.now
94
+ @last_failure_time = monotonic_now
88
95
  @state = :open if @failure_count >= @threshold
89
96
  end
90
97
 
@@ -69,29 +69,66 @@ module Woods
69
69
  @provider.model_name
70
70
  end
71
71
 
72
+ # Delegate the per-provider input cap. The retry wrapper does not
73
+ # change the provider's budget, so just hand through whatever the
74
+ # inner provider reports. Without this, `respond_to?` returns true
75
+ # via Interface but the call raises NotImplementedError.
76
+ #
77
+ # @return [Integer, nil]
78
+ def max_input_tokens
79
+ return @provider.max_input_tokens if @provider.respond_to?(:max_input_tokens)
80
+
81
+ nil
82
+ end
83
+
84
+ # Maximum backoff delay in seconds. Without a cap, attempts 8+ sleep
85
+ # longer than most service-level timeouts (>25s) and compound retry
86
+ # storms across correlated workers.
87
+ MAX_BACKOFF_SECONDS = 30.0
88
+
89
+ # Base multiplier for exponential backoff. Delay is roughly
90
+ # `BACKOFF_BASE * 2**attempt` with full jitter applied on top.
91
+ BACKOFF_BASE = 0.1
92
+
72
93
  private
73
94
 
74
- # Execute a block with retry logic and exponential backoff.
95
+ # Execute a block with retry logic, exponential backoff, and jitter.
96
+ #
97
+ # Argument errors surface immediately (non-retryable — they indicate
98
+ # a programming mistake or invalid input, not a transient failure).
75
99
  #
76
100
  # @yield The block to execute
77
101
  # @return [Object] The return value of the block
78
102
  # @raise [CircuitOpenError] immediately without retrying
103
+ # @raise [ArgumentError] immediately without retrying
79
104
  # @raise [StandardError] the last error if all retries are exhausted
80
105
  def with_retries
81
106
  attempt = 0
82
107
  begin
83
108
  attempt += 1
84
109
  yield
85
- rescue CircuitOpenError
110
+ rescue CircuitOpenError, ArgumentError
86
111
  raise
87
112
  rescue StandardError => e
88
113
  raise e if attempt > @max_retries
89
114
 
90
- sleep((2**attempt) * 0.1)
115
+ sleep(backoff_seconds(attempt))
91
116
  retry
92
117
  end
93
118
  end
94
119
 
120
+ # Full-jitter exponential backoff with a hard cap. See "Exponential
121
+ # Backoff and Jitter", AWS Architecture Blog (Marc Brooker, 2015):
122
+ # a uniformly random delay in [0, base*2**attempt] de-correlates
123
+ # competing retry waves.
124
+ #
125
+ # @param attempt [Integer] 1-based attempt counter
126
+ # @return [Float] seconds to sleep before the next retry
127
+ def backoff_seconds(attempt)
128
+ ceiling = [BACKOFF_BASE * (2**attempt), MAX_BACKOFF_SECONDS].min
129
+ rand * ceiling
130
+ end
131
+
95
132
  # Route a call through the circuit breaker if one is configured.
96
133
  #
97
134
  # @yield The block to execute