woods 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +169 -0
- data/README.md +20 -8
- data/exe/woods-console +51 -6
- data/exe/woods-console-mcp +24 -4
- data/exe/woods-mcp +30 -7
- data/exe/woods-mcp-http +47 -6
- data/lib/generators/woods/install_generator.rb +13 -4
- data/lib/generators/woods/templates/woods.rb.tt +155 -0
- data/lib/tasks/woods.rake +15 -50
- data/lib/woods/builder.rb +174 -9
- data/lib/woods/cache/cache_middleware.rb +360 -31
- data/lib/woods/chunking/semantic_chunker.rb +334 -7
- data/lib/woods/console/adapters/job_adapter.rb +10 -4
- data/lib/woods/console/audit_logger.rb +76 -4
- data/lib/woods/console/bridge.rb +48 -15
- data/lib/woods/console/bridge_protocol.rb +44 -0
- data/lib/woods/console/confirmation.rb +3 -4
- data/lib/woods/console/console_response_renderer.rb +56 -18
- data/lib/woods/console/credential_index.rb +201 -0
- data/lib/woods/console/credential_scanner.rb +302 -0
- data/lib/woods/console/dispatch_pipeline.rb +138 -0
- data/lib/woods/console/embedded_executor.rb +682 -35
- data/lib/woods/console/eval_guard.rb +319 -0
- data/lib/woods/console/model_validator.rb +1 -3
- data/lib/woods/console/rack_middleware.rb +185 -29
- data/lib/woods/console/redactor.rb +161 -0
- data/lib/woods/console/response_context.rb +127 -0
- data/lib/woods/console/safe_context.rb +220 -23
- data/lib/woods/console/scope_predicate_parser.rb +131 -0
- data/lib/woods/console/server.rb +417 -486
- data/lib/woods/console/sql_noise_stripper.rb +87 -0
- data/lib/woods/console/sql_table_scanner.rb +213 -0
- data/lib/woods/console/sql_validator.rb +81 -31
- data/lib/woods/console/table_gate.rb +93 -0
- data/lib/woods/console/tool_specs.rb +552 -0
- data/lib/woods/console/tools/tier1.rb +3 -3
- data/lib/woods/console/tools/tier4.rb +7 -1
- data/lib/woods/dependency_graph.rb +66 -7
- data/lib/woods/embedding/indexer.rb +190 -6
- data/lib/woods/embedding/openai.rb +40 -4
- data/lib/woods/embedding/provider.rb +104 -8
- data/lib/woods/embedding/text_preparer.rb +23 -3
- data/lib/woods/embedding/token_counter.rb +133 -0
- data/lib/woods/evaluation/baseline_runner.rb +20 -2
- data/lib/woods/evaluation/metrics.rb +4 -1
- data/lib/woods/extracted_unit.rb +1 -0
- data/lib/woods/extractor.rb +7 -1
- data/lib/woods/extractors/controller_extractor.rb +6 -0
- data/lib/woods/extractors/mailer_extractor.rb +16 -2
- data/lib/woods/extractors/model_extractor.rb +6 -1
- data/lib/woods/extractors/phlex_extractor.rb +13 -4
- data/lib/woods/extractors/rails_source_extractor.rb +2 -0
- data/lib/woods/extractors/route_helper_resolver.rb +130 -0
- data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
- data/lib/woods/extractors/view_component_extractor.rb +12 -1
- data/lib/woods/extractors/view_engines/base.rb +141 -0
- data/lib/woods/extractors/view_engines/erb.rb +145 -0
- data/lib/woods/extractors/view_template_extractor.rb +92 -133
- data/lib/woods/flow_assembler.rb +23 -15
- data/lib/woods/flow_precomputer.rb +21 -2
- data/lib/woods/graph_analyzer.rb +3 -4
- data/lib/woods/index_artifact.rb +173 -0
- data/lib/woods/mcp/bearer_auth.rb +45 -0
- data/lib/woods/mcp/bootstrap_state.rb +94 -0
- data/lib/woods/mcp/bootstrapper.rb +337 -16
- data/lib/woods/mcp/config_resolver.rb +288 -0
- data/lib/woods/mcp/errors.rb +134 -0
- data/lib/woods/mcp/index_reader.rb +265 -30
- data/lib/woods/mcp/origin_guard.rb +132 -0
- data/lib/woods/mcp/provider_probe.rb +166 -0
- data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
- data/lib/woods/mcp/renderers/markdown_renderer.rb +39 -3
- data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
- data/lib/woods/mcp/server.rb +737 -137
- data/lib/woods/model_name_cache.rb +78 -2
- data/lib/woods/notion/client.rb +25 -2
- data/lib/woods/notion/mappers/model_mapper.rb +36 -2
- data/lib/woods/railtie.rb +55 -15
- data/lib/woods/resilience/circuit_breaker.rb +9 -2
- data/lib/woods/resilience/retryable_provider.rb +40 -3
- data/lib/woods/resolved_config.rb +299 -0
- data/lib/woods/retrieval/context_assembler.rb +112 -5
- data/lib/woods/retrieval/query_classifier.rb +1 -1
- data/lib/woods/retrieval/ranker.rb +55 -6
- data/lib/woods/retrieval/search_executor.rb +42 -13
- data/lib/woods/retriever.rb +330 -24
- data/lib/woods/session_tracer/middleware.rb +35 -1
- data/lib/woods/storage/graph_store.rb +39 -0
- data/lib/woods/storage/inapplicable_backend.rb +14 -0
- data/lib/woods/storage/metadata_store.rb +129 -1
- data/lib/woods/storage/pgvector.rb +70 -8
- data/lib/woods/storage/qdrant.rb +196 -5
- data/lib/woods/storage/snapshotter/metadata.rb +172 -0
- data/lib/woods/storage/snapshotter/vector.rb +238 -0
- data/lib/woods/storage/snapshotter.rb +24 -0
- data/lib/woods/storage/vector_store.rb +184 -35
- data/lib/woods/tasks.rb +85 -0
- data/lib/woods/temporal/snapshot_store.rb +49 -1
- data/lib/woods/token_utils.rb +44 -5
- data/lib/woods/unblocked/client.rb +1 -1
- data/lib/woods/unblocked/document_builder.rb +35 -10
- data/lib/woods/unblocked/exporter.rb +1 -1
- data/lib/woods/util/host_guard.rb +61 -0
- data/lib/woods/version.rb +1 -1
- data/lib/woods.rb +126 -6
- metadata +69 -4
|
@@ -5,20 +5,29 @@ require 'rails/generators/active_record'
|
|
|
5
5
|
|
|
6
6
|
module Woods
|
|
7
7
|
module Generators
|
|
8
|
-
# Rails generator that
|
|
8
|
+
# Rails generator that installs Woods into a Rails application.
|
|
9
9
|
#
|
|
10
10
|
# Usage:
|
|
11
11
|
# rails generate woods:install
|
|
12
12
|
#
|
|
13
|
-
# Creates
|
|
14
|
-
#
|
|
13
|
+
# Creates:
|
|
14
|
+
# config/initializers/woods.rb — annotated configuration file
|
|
15
|
+
# db/migrate/<ts>_create_woods_tables.rb — migration for Woods tables
|
|
16
|
+
#
|
|
17
|
+
# The migration creates woods_units, woods_edges, and woods_embeddings
|
|
18
|
+
# tables. Works with PostgreSQL, MySQL, and SQLite.
|
|
15
19
|
#
|
|
16
20
|
class InstallGenerator < Rails::Generators::Base
|
|
17
21
|
include ActiveRecord::Generators::Migration
|
|
18
22
|
|
|
19
23
|
source_root File.expand_path('templates', __dir__)
|
|
20
24
|
|
|
21
|
-
desc 'Creates a migration for Woods tables
|
|
25
|
+
desc 'Creates a Woods initializer and migration for Woods tables'
|
|
26
|
+
|
|
27
|
+
# @return [void]
|
|
28
|
+
def create_initializer_file
|
|
29
|
+
template 'woods.rb.tt', 'config/initializers/woods.rb'
|
|
30
|
+
end
|
|
22
31
|
|
|
23
32
|
# @return [void]
|
|
24
33
|
def create_migration_file
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Woods configuration
|
|
4
|
+
# Full reference: https://github.com/bigcartel/woods/blob/main/docs/CONFIGURATION_REFERENCE.md
|
|
5
|
+
#
|
|
6
|
+
# Quick-start presets (uncomment one instead of the full block below):
|
|
7
|
+
# Woods.configure_with_preset(:local) # in-memory + Ollama, no external services
|
|
8
|
+
# Woods.configure_with_preset(:postgresql) # pgvector + OpenAI (PostgreSQL required)
|
|
9
|
+
# Woods.configure_with_preset(:production) # Qdrant + OpenAI (production-scale)
|
|
10
|
+
#
|
|
11
|
+
# Presets accept a block for overrides:
|
|
12
|
+
# Woods.configure_with_preset(:local) { |c| c.max_context_tokens = 16_000 }
|
|
13
|
+
|
|
14
|
+
Woods.configure do |config|
|
|
15
|
+
# ── Core ────────────────────────────────────────────────────────────────
|
|
16
|
+
|
|
17
|
+
# Directory where extracted JSON is written.
|
|
18
|
+
# Default: Rails.root.join('tmp/woods')
|
|
19
|
+
config.output_dir = Rails.root.join('tmp/woods')
|
|
20
|
+
|
|
21
|
+
# Maximum tokens returned in a retrieval context window.
|
|
22
|
+
# config.max_context_tokens = 8_000
|
|
23
|
+
|
|
24
|
+
# Minimum vector similarity score (0.0–1.0) for retrieval results.
|
|
25
|
+
# config.similarity_threshold = 0.7
|
|
26
|
+
|
|
27
|
+
# Output format for retrieval: :claude, :markdown, :plain, :json
|
|
28
|
+
# config.context_format = :markdown
|
|
29
|
+
|
|
30
|
+
# Pretty-print extracted JSON (disable in CI to save disk space).
|
|
31
|
+
# config.pretty_json = true
|
|
32
|
+
|
|
33
|
+
# ── Extractors ──────────────────────────────────────────────────────────
|
|
34
|
+
|
|
35
|
+
# Enabled extractors. Default set covers the most common Rails layers.
|
|
36
|
+
# See CONFIGURATION_REFERENCE.md for the full list of available symbols.
|
|
37
|
+
# config.extractors = %i[
|
|
38
|
+
# models controllers services components view_components
|
|
39
|
+
# jobs mailers graphql serializers managers policies validators
|
|
40
|
+
# rails_source
|
|
41
|
+
# ]
|
|
42
|
+
|
|
43
|
+
# Include Rails / gem source in the index (increases extraction time).
|
|
44
|
+
# config.include_framework_sources = true
|
|
45
|
+
|
|
46
|
+
# Enable parallel extraction (experimental — may conflict with some apps).
|
|
47
|
+
# config.concurrent_extraction = false
|
|
48
|
+
|
|
49
|
+
# ── Embedding ───────────────────────────────────────────────────────────
|
|
50
|
+
|
|
51
|
+
# Embedding provider: :openai or :ollama
|
|
52
|
+
# config.embedding_provider = :openai
|
|
53
|
+
# config.embedding_model = 'text-embedding-3-small'
|
|
54
|
+
# config.embedding_options = { api_key: ENV['OPENAI_API_KEY'] }
|
|
55
|
+
|
|
56
|
+
# Ollama (local, no API key needed). `num_ctx` is auto-selected per model
|
|
57
|
+
# (nomic-embed-text → 2048, bge-m3 → 8192). Install `gem "tokenizers"` for
|
|
58
|
+
# exact BERT WordPiece token counting on dense Ruby source. See
|
|
59
|
+
# docs/EMBEDDING_MODELS.md for the full model comparison.
|
|
60
|
+
# config.embedding_provider = :ollama
|
|
61
|
+
# config.embedding_options = {
|
|
62
|
+
# model: 'nomic-embed-text',
|
|
63
|
+
# host: ENV.fetch('OLLAMA_URL', 'http://localhost:11434')
|
|
64
|
+
# }
|
|
65
|
+
|
|
66
|
+
# ── Storage ─────────────────────────────────────────────────────────────
|
|
67
|
+
|
|
68
|
+
# Vector store: :in_memory, :pgvector (PostgreSQL), :qdrant
|
|
69
|
+
# config.vector_store = :in_memory
|
|
70
|
+
|
|
71
|
+
# pgvector — run `rails generate woods:pgvector && rails db:migrate` first.
|
|
72
|
+
# config.vector_store = :pgvector
|
|
73
|
+
# config.vector_store_options = {
|
|
74
|
+
# connection: ActiveRecord::Base.connection,
|
|
75
|
+
# dimensions: 1_536
|
|
76
|
+
# }
|
|
77
|
+
|
|
78
|
+
# Qdrant:
|
|
79
|
+
# config.vector_store = :qdrant
|
|
80
|
+
# config.vector_store_options = {
|
|
81
|
+
# url: ENV.fetch('QDRANT_URL', 'http://localhost:6333'),
|
|
82
|
+
# collection: 'woods',
|
|
83
|
+
# dimensions: 1_536
|
|
84
|
+
# }
|
|
85
|
+
|
|
86
|
+
# Metadata store: :in_memory, :sqlite
|
|
87
|
+
# config.metadata_store = :in_memory
|
|
88
|
+
# config.metadata_store_options = {
|
|
89
|
+
# database: Rails.root.join('tmp/woods/metadata.sqlite3').to_s
|
|
90
|
+
# }
|
|
91
|
+
|
|
92
|
+
# ── Pipeline ────────────────────────────────────────────────────────────
|
|
93
|
+
|
|
94
|
+
# Pre-compute per-action request flow maps during extraction (slow).
|
|
95
|
+
# config.precompute_flows = false
|
|
96
|
+
|
|
97
|
+
# Extract link_to / redirect_to / form_action navigation edges.
|
|
98
|
+
# config.extract_navigation_edges = true
|
|
99
|
+
|
|
100
|
+
# Temporal snapshots — requires migrations 004+005.
|
|
101
|
+
# config.enable_snapshots = false
|
|
102
|
+
|
|
103
|
+
# ── Console MCP ─────────────────────────────────────────────────────────
|
|
104
|
+
#
|
|
105
|
+
# The Console MCP server lets AI tools query your live Rails app.
|
|
106
|
+
# It is DISABLED by default. Enable only after reviewing the security
|
|
107
|
+
# documentation in docs/CONSOLE_MCP_SETUP.md.
|
|
108
|
+
#
|
|
109
|
+
# Defense layers (all active by default when the server is on):
|
|
110
|
+
# Layer 1 — SqlValidator: rejects DML/DDL before any DB interaction.
|
|
111
|
+
# Layer 2 — SafeContext: wraps every request in a rolled-back transaction;
|
|
112
|
+
# writes are silently discarded even if Layer 1 is bypassed.
|
|
113
|
+
# Layer 3 — Column redaction: credential columns are replaced with
|
|
114
|
+
# [REDACTED] in every tool response.
|
|
115
|
+
|
|
116
|
+
# config.console_mcp_enabled = false
|
|
117
|
+
# config.console_mcp_path = '/mcp/console'
|
|
118
|
+
|
|
119
|
+
# Credential-column redaction (Layer 3).
|
|
120
|
+
# Starts from a safe default list (passwords, tokens, secrets).
|
|
121
|
+
# Extend: Woods::DEFAULT_CONSOLE_REDACTED_COLUMNS + %w[my_secret_col]
|
|
122
|
+
# Override entirely to remove a default:
|
|
123
|
+
# config.console_redacted_columns = %w[password token api_key]
|
|
124
|
+
|
|
125
|
+
# Key-value pairs where the value should be redacted (e.g., env var names).
|
|
126
|
+
# config.console_redacted_key_values = []
|
|
127
|
+
|
|
128
|
+
# Tables completely blocked from queries.
|
|
129
|
+
# config.console_blocked_tables = []
|
|
130
|
+
|
|
131
|
+
# Disable specific scanner patterns (rare — prefer blocked_tables).
|
|
132
|
+
# config.console_disabled_scanner_patterns = []
|
|
133
|
+
|
|
134
|
+
# Allow the AI console to execute Ruby eval (off by default; very dangerous).
|
|
135
|
+
# config.console_unsafe_eval_enabled = false
|
|
136
|
+
|
|
137
|
+
# Expose SQL/query read tools inside embedded console (adds read-only DB
|
|
138
|
+
# access via the rake task or Docker exec path; SqlValidator still applies).
|
|
139
|
+
# config.console_embedded_read_tools = false
|
|
140
|
+
|
|
141
|
+
# ── Caching ─────────────────────────────────────────────────────────────
|
|
142
|
+
|
|
143
|
+
# Cache embedding and retrieval responses to reduce API cost.
|
|
144
|
+
# config.cache_enabled = false
|
|
145
|
+
# config.cache_store = :redis # :redis, :solid_cache, :memory
|
|
146
|
+
# config.cache_options = { redis: Redis.new(url: ENV['REDIS_URL']) }
|
|
147
|
+
|
|
148
|
+
# ── Notion Export ───────────────────────────────────────────────────────
|
|
149
|
+
|
|
150
|
+
# config.notion_api_token = ENV['NOTION_API_TOKEN']
|
|
151
|
+
# config.notion_database_ids = {
|
|
152
|
+
# data_models: 'your-database-id',
|
|
153
|
+
# columns: 'your-database-id'
|
|
154
|
+
# }
|
|
155
|
+
end
|
data/lib/tasks/woods.rake
CHANGED
|
@@ -354,33 +354,11 @@ namespace :woods do
|
|
|
354
354
|
desc 'Embed all extracted units'
|
|
355
355
|
task embed: :environment do
|
|
356
356
|
require 'woods'
|
|
357
|
-
require 'woods/
|
|
358
|
-
require 'woods/embedding/text_preparer'
|
|
359
|
-
require 'woods/embedding/provider'
|
|
360
|
-
require 'woods/storage/vector_store'
|
|
361
|
-
|
|
362
|
-
config = Woods.configuration
|
|
363
|
-
output_dir = ENV.fetch('WOODS_OUTPUT', config.output_dir)
|
|
364
|
-
|
|
365
|
-
provider = Woods::Embedding::Provider::Ollama.new
|
|
366
|
-
text_preparer = Woods::Embedding::TextPreparer.new
|
|
367
|
-
vector_store = Woods::Storage::VectorStore::InMemory.new
|
|
368
|
-
|
|
369
|
-
indexer = Woods::Embedding::Indexer.new(
|
|
370
|
-
provider: provider,
|
|
371
|
-
text_preparer: text_preparer,
|
|
372
|
-
vector_store: vector_store,
|
|
373
|
-
output_dir: output_dir
|
|
374
|
-
)
|
|
357
|
+
require 'woods/tasks'
|
|
375
358
|
|
|
359
|
+
indexer = Woods::Tasks.build_embed_indexer
|
|
376
360
|
puts 'Embedding all extracted units...'
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
puts
|
|
380
|
-
puts 'Embedding complete!'
|
|
381
|
-
puts " Processed: #{stats[:processed]}"
|
|
382
|
-
puts " Skipped: #{stats[:skipped]}"
|
|
383
|
-
puts " Errors: #{stats[:errors]}"
|
|
361
|
+
Woods::Tasks.print_embed_stats(indexer.index_all, mode: :full)
|
|
384
362
|
end
|
|
385
363
|
|
|
386
364
|
desc 'Nest the data — embed all units (alias for embed)'
|
|
@@ -389,33 +367,11 @@ namespace :woods do
|
|
|
389
367
|
desc 'Embed changed units only (incremental)'
|
|
390
368
|
task embed_incremental: :environment do
|
|
391
369
|
require 'woods'
|
|
392
|
-
require 'woods/
|
|
393
|
-
require 'woods/embedding/text_preparer'
|
|
394
|
-
require 'woods/embedding/provider'
|
|
395
|
-
require 'woods/storage/vector_store'
|
|
396
|
-
|
|
397
|
-
config = Woods.configuration
|
|
398
|
-
output_dir = ENV.fetch('WOODS_OUTPUT', config.output_dir)
|
|
399
|
-
|
|
400
|
-
provider = Woods::Embedding::Provider::Ollama.new
|
|
401
|
-
text_preparer = Woods::Embedding::TextPreparer.new
|
|
402
|
-
vector_store = Woods::Storage::VectorStore::InMemory.new
|
|
403
|
-
|
|
404
|
-
indexer = Woods::Embedding::Indexer.new(
|
|
405
|
-
provider: provider,
|
|
406
|
-
text_preparer: text_preparer,
|
|
407
|
-
vector_store: vector_store,
|
|
408
|
-
output_dir: output_dir
|
|
409
|
-
)
|
|
370
|
+
require 'woods/tasks'
|
|
410
371
|
|
|
372
|
+
indexer = Woods::Tasks.build_embed_indexer
|
|
411
373
|
puts 'Embedding changed units (incremental)...'
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
puts
|
|
415
|
-
puts 'Incremental embedding complete!'
|
|
416
|
-
puts " Processed: #{stats[:processed]}"
|
|
417
|
-
puts " Skipped: #{stats[:skipped]}"
|
|
418
|
-
puts " Errors: #{stats[:errors]}"
|
|
374
|
+
Woods::Tasks.print_embed_stats(indexer.index_incremental, mode: :incremental)
|
|
419
375
|
end
|
|
420
376
|
|
|
421
377
|
desc 'Hone the blade — incremental embedding (alias for embed_incremental)'
|
|
@@ -672,4 +628,13 @@ namespace :woods do
|
|
|
672
628
|
|
|
673
629
|
desc 'Relay findings to Unblocked (alias for unblocked_sync)'
|
|
674
630
|
task relay: :unblocked_sync
|
|
631
|
+
|
|
632
|
+
desc 'Generate a random bearer token for woods-mcp-http (WOODS_MCP_HTTP_TOKEN)'
|
|
633
|
+
task :generate_token do
|
|
634
|
+
require 'securerandom'
|
|
635
|
+
token = SecureRandom.hex(32)
|
|
636
|
+
puts token
|
|
637
|
+
warn 'Set WOODS_MCP_HTTP_TOKEN to this value in the environment where woods-mcp-http runs,'
|
|
638
|
+
warn 'and send it as `Authorization: Bearer <token>` from clients.'
|
|
639
|
+
end
|
|
675
640
|
end
|
data/lib/woods/builder.rb
CHANGED
|
@@ -8,6 +8,10 @@ require_relative 'storage/metadata_store'
|
|
|
8
8
|
require_relative 'storage/graph_store'
|
|
9
9
|
require_relative 'embedding/provider'
|
|
10
10
|
require_relative 'embedding/openai'
|
|
11
|
+
require_relative 'embedding/text_preparer'
|
|
12
|
+
require_relative 'embedding/token_counter'
|
|
13
|
+
require_relative 'token_utils'
|
|
14
|
+
require_relative 'chunking/semantic_chunker'
|
|
11
15
|
|
|
12
16
|
module Woods
|
|
13
17
|
# Builder reads a {Configuration} and instantiates the appropriate adapters,
|
|
@@ -29,9 +33,13 @@ module Woods
|
|
|
29
33
|
class Builder # rubocop:disable Metrics/ClassLength
|
|
30
34
|
# Named presets mapping to default adapter types.
|
|
31
35
|
#
|
|
32
|
-
# :local
|
|
33
|
-
# :
|
|
34
|
-
#
|
|
36
|
+
# :local — fully local, no external services required (requires sqlite3 gem)
|
|
37
|
+
# :shared_filesystem — Shape 2: rake embed → separate MCP server reads from disk.
|
|
38
|
+
# All stores in-memory + persisted to output_dir via the
|
|
39
|
+
# Snapshotter. No sqlite3 gem needed. Requires output_dir set
|
|
40
|
+
# AND readable by both the embed process and the MCP server.
|
|
41
|
+
# :postgresql — pgvector for vectors, OpenAI for embeddings
|
|
42
|
+
# :production — Qdrant for vectors, OpenAI for embeddings
|
|
35
43
|
PRESETS = {
|
|
36
44
|
local: {
|
|
37
45
|
vector_store: :in_memory,
|
|
@@ -39,6 +47,12 @@ module Woods
|
|
|
39
47
|
graph_store: :in_memory,
|
|
40
48
|
embedding_provider: :ollama
|
|
41
49
|
},
|
|
50
|
+
shared_filesystem: {
|
|
51
|
+
vector_store: :in_memory,
|
|
52
|
+
metadata_store: :in_memory,
|
|
53
|
+
graph_store: :in_memory,
|
|
54
|
+
embedding_provider: :ollama
|
|
55
|
+
},
|
|
42
56
|
postgresql: {
|
|
43
57
|
vector_store: :pgvector,
|
|
44
58
|
metadata_store: :sqlite,
|
|
@@ -78,17 +92,30 @@ module Woods
|
|
|
78
92
|
# {Cache::CachedEmbeddingProvider} and the retriever is wrapped with
|
|
79
93
|
# {Cache::CachedRetriever} for transparent caching of expensive operations.
|
|
80
94
|
#
|
|
95
|
+
# Callers that need stores pre-populated from a dump (the Shape-2
|
|
96
|
+
# MCP-serve path) can inject them via +vector_store:+ / +metadata_store:+.
|
|
97
|
+
# Without these, fresh empty stores are constructed from config. This
|
|
98
|
+
# is how the Bootstrapper hydrates from `Snapshotter.load_or_empty`
|
|
99
|
+
# without Builder needing to know the Snapshotter exists.
|
|
100
|
+
#
|
|
101
|
+
# @param vector_store [Storage::VectorStore::Interface, nil]
|
|
102
|
+
# @param metadata_store [Storage::MetadataStore::Interface, nil]
|
|
103
|
+
# @param graph_store [Storage::GraphStore::Interface, nil] Pre-populated
|
|
104
|
+
# graph store. Without this, the retriever gets a fresh empty graph,
|
|
105
|
+
# which silently degrades +:hybrid+ retrieval (graph expansion returns
|
|
106
|
+
# no candidates). The Bootstrapper hydrates from +dependency_graph.json+
|
|
107
|
+
# on disk and passes the populated store here.
|
|
81
108
|
# @return [Retriever, Cache::CachedRetriever] A fully wired retriever
|
|
82
|
-
def build_retriever
|
|
109
|
+
def build_retriever(vector_store: nil, metadata_store: nil, graph_store: nil)
|
|
83
110
|
provider = build_embedding_provider
|
|
84
111
|
cache = build_cache_store
|
|
85
112
|
|
|
86
113
|
provider = wrap_with_embedding_cache(provider, cache) if cache
|
|
87
114
|
|
|
88
115
|
retriever = Retriever.new(
|
|
89
|
-
vector_store: build_vector_store,
|
|
90
|
-
metadata_store: build_metadata_store,
|
|
91
|
-
graph_store: build_graph_store,
|
|
116
|
+
vector_store: vector_store || build_vector_store,
|
|
117
|
+
metadata_store: metadata_store || build_metadata_store,
|
|
118
|
+
graph_store: graph_store || build_graph_store,
|
|
92
119
|
embedding_provider: provider
|
|
93
120
|
)
|
|
94
121
|
|
|
@@ -110,18 +137,154 @@ module Woods
|
|
|
110
137
|
|
|
111
138
|
# Instantiate the embedding provider specified by the configuration.
|
|
112
139
|
#
|
|
140
|
+
# Strips `embedding_options` keys that belong to the ResolvedConfig layer
|
|
141
|
+
# (like `:dimension`) before splatting into the provider's constructor —
|
|
142
|
+
# those keys are useful for the Snapshotter's schema header but
|
|
143
|
+
# aren't part of the provider's API.
|
|
144
|
+
#
|
|
113
145
|
# @return [Embedding::Provider::Interface] Embedding provider instance
|
|
114
146
|
# @raise [ArgumentError] if the configured type is not recognized
|
|
115
147
|
def build_embedding_provider
|
|
148
|
+
opts = provider_kwargs
|
|
116
149
|
case @config.embedding_provider
|
|
117
|
-
when :openai then Embedding::Provider::OpenAI.new(**
|
|
118
|
-
when :ollama then Embedding::Provider::Ollama.new(**
|
|
150
|
+
when :openai then Embedding::Provider::OpenAI.new(**opts)
|
|
151
|
+
when :ollama then Embedding::Provider::Ollama.new(**opts)
|
|
119
152
|
else raise ArgumentError, "Unknown embedding_provider: #{@config.embedding_provider}"
|
|
120
153
|
end
|
|
121
154
|
end
|
|
122
155
|
|
|
156
|
+
# Kwargs accepted by embedding provider constructors — everything in
|
|
157
|
+
# `embedding_options` except metadata fields that live there for
|
|
158
|
+
# ResolvedConfig bookkeeping.
|
|
159
|
+
SNAPSHOT_ONLY_KEYS = %i[dimension].freeze
|
|
160
|
+
private_constant :SNAPSHOT_ONLY_KEYS
|
|
161
|
+
|
|
162
|
+
def provider_kwargs
|
|
163
|
+
opts = (@config.embedding_options || {}).transform_keys(&:to_sym)
|
|
164
|
+
SNAPSHOT_ONLY_KEYS.each { |k| opts.delete(k) }
|
|
165
|
+
opts
|
|
166
|
+
end
|
|
167
|
+
private :provider_kwargs
|
|
168
|
+
|
|
169
|
+
# Build a {Embedding::TextPreparer} calibrated to a given provider.
|
|
170
|
+
#
|
|
171
|
+
# OpenAI embedders use tiktoken (cl100k_base) — 4.0 chars/token is a
|
|
172
|
+
# good conservative average. Ollama BERT/WordPiece tokenizers
|
|
173
|
+
# (nomic-embed-text, bge-*) run much hotter on dense Ruby/Rails
|
|
174
|
+
# source — long CamelCase constants, docstrings, callback DSLs, and
|
|
175
|
+
# heavy symbol use all sit below 2.0 chars/token in practice.
|
|
176
|
+
# Empirically, a 16 KB chunk of `ActionMailer::Base` still blows the
|
|
177
|
+
# 8192-token budget at 2.0 chars/token, so we budget at 1.5 to stay
|
|
178
|
+
# clear of tokenizer surprises even on the densest Rails internals.
|
|
179
|
+
#
|
|
180
|
+
# `max_tokens` tracks the provider's actual input budget when it
|
|
181
|
+
# reports one, falling back to the TextPreparer default otherwise.
|
|
182
|
+
#
|
|
183
|
+
# @param provider [Embedding::Provider::Interface]
|
|
184
|
+
# @return [Embedding::TextPreparer]
|
|
185
|
+
def build_text_preparer(provider)
|
|
186
|
+
chars_per_token = chars_per_token_for(provider)
|
|
187
|
+
budget = provider.respond_to?(:max_input_tokens) ? provider.max_input_tokens : nil
|
|
188
|
+
max_tokens = budget || Embedding::TextPreparer::DEFAULT_MAX_TOKENS
|
|
189
|
+
|
|
190
|
+
Embedding::TextPreparer.new(max_tokens: max_tokens, chars_per_token: chars_per_token)
|
|
191
|
+
end
|
|
192
|
+
|
|
193
|
+
# Build a {Chunking::SemanticChunker} sized to a given provider.
|
|
194
|
+
#
|
|
195
|
+
# `max_chars` is derived from the provider's input budget and the
|
|
196
|
+
# matching chars-per-token ratio, minus the context-prefix
|
|
197
|
+
# allowance the Indexer accounts for separately. Units that exceed
|
|
198
|
+
# this ceiling get sliced so no single chunk can blow the provider's
|
|
199
|
+
# input cap.
|
|
200
|
+
#
|
|
201
|
+
# For Ollama (and other BERT/WordPiece-backed models), char-based
|
|
202
|
+
# estimation is unreliable — CamelCase, `::` separators, and symbol
|
|
203
|
+
# literals tokenize much denser than chars/token averages suggest.
|
|
204
|
+
# When the optional `tokenizers` gem is installed, pass a
|
|
205
|
+
# {Embedding::TokenCounter} and `max_tokens` so the chunker can
|
|
206
|
+
# verify every slice with the real tokenizer and re-split any piece
|
|
207
|
+
# that still exceeds `num_ctx`. See docs/EMBEDDING_MODELS.md.
|
|
208
|
+
#
|
|
209
|
+
# Ollama v0.13.5+ stopped honouring `truncate: true` on `/api/embed`
|
|
210
|
+
# (ollama/ollama#14186), so any chunk that exceeds `num_ctx` returns
|
|
211
|
+
# a 400 rather than being silently truncated. Exact client-side
|
|
212
|
+
# sizing is the only reliable path until the regression is fixed
|
|
213
|
+
# upstream.
|
|
214
|
+
#
|
|
215
|
+
# @param provider [Embedding::Provider::Interface]
|
|
216
|
+
# @return [Chunking::SemanticChunker]
|
|
217
|
+
def build_chunker(provider)
|
|
218
|
+
budget = provider.respond_to?(:max_input_tokens) ? provider.max_input_tokens : nil
|
|
219
|
+
max_chars = ((budget * chars_per_token_for(provider)).floor - CHUNKER_PREFIX_ALLOWANCE if budget)
|
|
220
|
+
|
|
221
|
+
# Guard against a budget so small that the prefix allowance leaves
|
|
222
|
+
# no room for content. Without this, SemanticChunker#slice_by_lines
|
|
223
|
+
# passes a negative repeat count to String#scan, which returns []
|
|
224
|
+
# — every chunk becomes empty and is silently dropped, producing
|
|
225
|
+
# zero embeddings with no error. Surface the misconfiguration loudly.
|
|
226
|
+
raise ArgumentError, chunker_budget_message(provider, budget) if max_chars && max_chars <= 0
|
|
227
|
+
|
|
228
|
+
token_counter = token_counter_for(provider)
|
|
229
|
+
max_tokens = token_counter && budget ? budget - PREFIX_TOKEN_ALLOWANCE : nil
|
|
230
|
+
|
|
231
|
+
Chunking::SemanticChunker.new(
|
|
232
|
+
max_chars: max_chars,
|
|
233
|
+
token_counter: token_counter,
|
|
234
|
+
max_tokens: max_tokens
|
|
235
|
+
)
|
|
236
|
+
end
|
|
237
|
+
|
|
238
|
+
# Character allowance reserved for the TextPreparer context prefix
|
|
239
|
+
# ([type] id / namespace / file / deps) — kept in sync with the
|
|
240
|
+
# Indexer's own PREFIX_CHAR_ALLOWANCE constant.
|
|
241
|
+
CHUNKER_PREFIX_ALLOWANCE = 512
|
|
242
|
+
private_constant :CHUNKER_PREFIX_ALLOWANCE
|
|
243
|
+
|
|
244
|
+
# Token-side sibling of {CHUNKER_PREFIX_ALLOWANCE}. Reserved for the
|
|
245
|
+
# TextPreparer prefix when tokenizer-driven sizing is active — a bit
|
|
246
|
+
# generous to cover long file paths and dep lists.
|
|
247
|
+
PREFIX_TOKEN_ALLOWANCE = 256
|
|
248
|
+
private_constant :PREFIX_TOKEN_ALLOWANCE
|
|
249
|
+
|
|
123
250
|
private
|
|
124
251
|
|
|
252
|
+
# Return a TokenCounter for providers that benefit from exact token
|
|
253
|
+
# counting. OpenAI's tiktoken ratios are already stable at 4.0
|
|
254
|
+
# chars/token on code, so it doesn't need this.
|
|
255
|
+
#
|
|
256
|
+
# @param provider [Embedding::Provider::Interface]
|
|
257
|
+
# @return [Embedding::TokenCounter, nil]
|
|
258
|
+
def token_counter_for(provider)
|
|
259
|
+
return unless provider.is_a?(Embedding::Provider::Ollama)
|
|
260
|
+
|
|
261
|
+
Embedding::TokenCounter.new
|
|
262
|
+
end
|
|
263
|
+
|
|
264
|
+
# Tokenizer-calibrated chars/token ratio for the given provider.
|
|
265
|
+
# Delegates to {Woods::TokenUtils.chars_per_token_for} — the single
|
|
266
|
+
# source of truth — after reducing the provider instance to a symbol.
|
|
267
|
+
#
|
|
268
|
+
# @param provider [Embedding::Provider::Interface]
|
|
269
|
+
# @return [Float]
|
|
270
|
+
def chars_per_token_for(provider)
|
|
271
|
+
symbol = case provider
|
|
272
|
+
when Embedding::Provider::Ollama then :ollama
|
|
273
|
+
else :openai
|
|
274
|
+
end
|
|
275
|
+
TokenUtils.chars_per_token_for(symbol)
|
|
276
|
+
end
|
|
277
|
+
|
|
278
|
+
# Diagnostic for the build_chunker budget guard.
|
|
279
|
+
def chunker_budget_message(provider, budget)
|
|
280
|
+
"embedding model '#{provider.respond_to?(:model) ? provider.model : provider.class}' " \
|
|
281
|
+
"reports a max_input_tokens of #{budget}, which leaves no room for " \
|
|
282
|
+
"the chunk prefix (#{CHUNKER_PREFIX_ALLOWANCE} chars). Configure a " \
|
|
283
|
+
'model with a larger native context, or set num_ctx explicitly.'
|
|
284
|
+
end
|
|
285
|
+
|
|
286
|
+
public
|
|
287
|
+
|
|
125
288
|
# Instantiate the metadata store adapter specified by the configuration.
|
|
126
289
|
#
|
|
127
290
|
# @return [Storage::MetadataStore::Interface] Metadata store adapter instance
|
|
@@ -145,6 +308,8 @@ module Woods
|
|
|
145
308
|
end
|
|
146
309
|
end
|
|
147
310
|
|
|
311
|
+
private
|
|
312
|
+
|
|
148
313
|
# Build a cache store from configuration, or nil if caching is disabled.
|
|
149
314
|
#
|
|
150
315
|
# @return [Cache::CacheStore, nil]
|