woods 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +169 -0
  3. data/README.md +20 -8
  4. data/exe/woods-console +51 -6
  5. data/exe/woods-console-mcp +24 -4
  6. data/exe/woods-mcp +30 -7
  7. data/exe/woods-mcp-http +47 -6
  8. data/lib/generators/woods/install_generator.rb +13 -4
  9. data/lib/generators/woods/templates/woods.rb.tt +155 -0
  10. data/lib/tasks/woods.rake +15 -50
  11. data/lib/woods/builder.rb +174 -9
  12. data/lib/woods/cache/cache_middleware.rb +360 -31
  13. data/lib/woods/chunking/semantic_chunker.rb +334 -7
  14. data/lib/woods/console/adapters/job_adapter.rb +10 -4
  15. data/lib/woods/console/audit_logger.rb +76 -4
  16. data/lib/woods/console/bridge.rb +48 -15
  17. data/lib/woods/console/bridge_protocol.rb +44 -0
  18. data/lib/woods/console/confirmation.rb +3 -4
  19. data/lib/woods/console/console_response_renderer.rb +56 -18
  20. data/lib/woods/console/credential_index.rb +201 -0
  21. data/lib/woods/console/credential_scanner.rb +302 -0
  22. data/lib/woods/console/dispatch_pipeline.rb +138 -0
  23. data/lib/woods/console/embedded_executor.rb +682 -35
  24. data/lib/woods/console/eval_guard.rb +319 -0
  25. data/lib/woods/console/model_validator.rb +1 -3
  26. data/lib/woods/console/rack_middleware.rb +185 -29
  27. data/lib/woods/console/redactor.rb +161 -0
  28. data/lib/woods/console/response_context.rb +127 -0
  29. data/lib/woods/console/safe_context.rb +220 -23
  30. data/lib/woods/console/scope_predicate_parser.rb +131 -0
  31. data/lib/woods/console/server.rb +417 -486
  32. data/lib/woods/console/sql_noise_stripper.rb +87 -0
  33. data/lib/woods/console/sql_table_scanner.rb +213 -0
  34. data/lib/woods/console/sql_validator.rb +81 -31
  35. data/lib/woods/console/table_gate.rb +93 -0
  36. data/lib/woods/console/tool_specs.rb +552 -0
  37. data/lib/woods/console/tools/tier1.rb +3 -3
  38. data/lib/woods/console/tools/tier4.rb +7 -1
  39. data/lib/woods/dependency_graph.rb +66 -7
  40. data/lib/woods/embedding/indexer.rb +190 -6
  41. data/lib/woods/embedding/openai.rb +40 -4
  42. data/lib/woods/embedding/provider.rb +104 -8
  43. data/lib/woods/embedding/text_preparer.rb +23 -3
  44. data/lib/woods/embedding/token_counter.rb +133 -0
  45. data/lib/woods/evaluation/baseline_runner.rb +20 -2
  46. data/lib/woods/evaluation/metrics.rb +4 -1
  47. data/lib/woods/extracted_unit.rb +1 -0
  48. data/lib/woods/extractor.rb +7 -1
  49. data/lib/woods/extractors/controller_extractor.rb +6 -0
  50. data/lib/woods/extractors/mailer_extractor.rb +16 -2
  51. data/lib/woods/extractors/model_extractor.rb +6 -1
  52. data/lib/woods/extractors/phlex_extractor.rb +13 -4
  53. data/lib/woods/extractors/rails_source_extractor.rb +2 -0
  54. data/lib/woods/extractors/route_helper_resolver.rb +130 -0
  55. data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
  56. data/lib/woods/extractors/view_component_extractor.rb +12 -1
  57. data/lib/woods/extractors/view_engines/base.rb +141 -0
  58. data/lib/woods/extractors/view_engines/erb.rb +145 -0
  59. data/lib/woods/extractors/view_template_extractor.rb +92 -133
  60. data/lib/woods/flow_assembler.rb +23 -15
  61. data/lib/woods/flow_precomputer.rb +21 -2
  62. data/lib/woods/graph_analyzer.rb +3 -4
  63. data/lib/woods/index_artifact.rb +173 -0
  64. data/lib/woods/mcp/bearer_auth.rb +45 -0
  65. data/lib/woods/mcp/bootstrap_state.rb +94 -0
  66. data/lib/woods/mcp/bootstrapper.rb +337 -16
  67. data/lib/woods/mcp/config_resolver.rb +288 -0
  68. data/lib/woods/mcp/errors.rb +134 -0
  69. data/lib/woods/mcp/index_reader.rb +265 -30
  70. data/lib/woods/mcp/origin_guard.rb +132 -0
  71. data/lib/woods/mcp/provider_probe.rb +166 -0
  72. data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
  73. data/lib/woods/mcp/renderers/markdown_renderer.rb +39 -3
  74. data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
  75. data/lib/woods/mcp/server.rb +737 -137
  76. data/lib/woods/model_name_cache.rb +78 -2
  77. data/lib/woods/notion/client.rb +25 -2
  78. data/lib/woods/notion/mappers/model_mapper.rb +36 -2
  79. data/lib/woods/railtie.rb +55 -15
  80. data/lib/woods/resilience/circuit_breaker.rb +9 -2
  81. data/lib/woods/resilience/retryable_provider.rb +40 -3
  82. data/lib/woods/resolved_config.rb +299 -0
  83. data/lib/woods/retrieval/context_assembler.rb +112 -5
  84. data/lib/woods/retrieval/query_classifier.rb +1 -1
  85. data/lib/woods/retrieval/ranker.rb +55 -6
  86. data/lib/woods/retrieval/search_executor.rb +42 -13
  87. data/lib/woods/retriever.rb +330 -24
  88. data/lib/woods/session_tracer/middleware.rb +35 -1
  89. data/lib/woods/storage/graph_store.rb +39 -0
  90. data/lib/woods/storage/inapplicable_backend.rb +14 -0
  91. data/lib/woods/storage/metadata_store.rb +129 -1
  92. data/lib/woods/storage/pgvector.rb +70 -8
  93. data/lib/woods/storage/qdrant.rb +196 -5
  94. data/lib/woods/storage/snapshotter/metadata.rb +172 -0
  95. data/lib/woods/storage/snapshotter/vector.rb +238 -0
  96. data/lib/woods/storage/snapshotter.rb +24 -0
  97. data/lib/woods/storage/vector_store.rb +184 -35
  98. data/lib/woods/tasks.rb +85 -0
  99. data/lib/woods/temporal/snapshot_store.rb +49 -1
  100. data/lib/woods/token_utils.rb +44 -5
  101. data/lib/woods/unblocked/client.rb +1 -1
  102. data/lib/woods/unblocked/document_builder.rb +35 -10
  103. data/lib/woods/unblocked/exporter.rb +1 -1
  104. data/lib/woods/util/host_guard.rb +61 -0
  105. data/lib/woods/version.rb +1 -1
  106. data/lib/woods.rb +126 -6
  107. metadata +69 -4
@@ -5,20 +5,29 @@ require 'rails/generators/active_record'
5
5
 
6
6
  module Woods
7
7
  module Generators
8
- # Rails generator that creates a migration for Woods tables.
8
+ # Rails generator that installs Woods into a Rails application.
9
9
  #
10
10
  # Usage:
11
11
  # rails generate woods:install
12
12
  #
13
- # Creates a migration with woods_units, woods_edges, and
14
- # woods_embeddings tables. Works with PostgreSQL, MySQL, and SQLite.
13
+ # Creates:
14
+ # config/initializers/woods.rb — annotated configuration file
15
+ # db/migrate/<ts>_create_woods_tables.rb — migration for Woods tables
16
+ #
17
+ # The migration creates woods_units, woods_edges, and woods_embeddings
18
+ # tables. Works with PostgreSQL, MySQL, and SQLite.
15
19
  #
16
20
  class InstallGenerator < Rails::Generators::Base
17
21
  include ActiveRecord::Generators::Migration
18
22
 
19
23
  source_root File.expand_path('templates', __dir__)
20
24
 
21
- desc 'Creates a migration for Woods tables (units, edges, embeddings)'
25
+ desc 'Creates a Woods initializer and migration for Woods tables'
26
+
27
+ # @return [void]
28
+ def create_initializer_file
29
+ template 'woods.rb.tt', 'config/initializers/woods.rb'
30
+ end
22
31
 
23
32
  # @return [void]
24
33
  def create_migration_file
@@ -0,0 +1,155 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Woods configuration
4
+ # Full reference: https://github.com/bigcartel/woods/blob/main/docs/CONFIGURATION_REFERENCE.md
5
+ #
6
+ # Quick-start presets (uncomment one instead of the full block below):
7
+ # Woods.configure_with_preset(:local) # in-memory + Ollama, no external services
8
+ # Woods.configure_with_preset(:postgresql) # pgvector + OpenAI (PostgreSQL required)
9
+ # Woods.configure_with_preset(:production) # Qdrant + OpenAI (production-scale)
10
+ #
11
+ # Presets accept a block for overrides:
12
+ # Woods.configure_with_preset(:local) { |c| c.max_context_tokens = 16_000 }
13
+
14
+ Woods.configure do |config|
15
+ # ── Core ────────────────────────────────────────────────────────────────
16
+
17
+ # Directory where extracted JSON is written.
18
+ # Default: Rails.root.join('tmp/woods')
19
+ config.output_dir = Rails.root.join('tmp/woods')
20
+
21
+ # Maximum tokens returned in a retrieval context window.
22
+ # config.max_context_tokens = 8_000
23
+
24
+ # Minimum vector similarity score (0.0–1.0) for retrieval results.
25
+ # config.similarity_threshold = 0.7
26
+
27
+ # Output format for retrieval: :claude, :markdown, :plain, :json
28
+ # config.context_format = :markdown
29
+
30
+ # Pretty-print extracted JSON (disable in CI to save disk space).
31
+ # config.pretty_json = true
32
+
33
+ # ── Extractors ──────────────────────────────────────────────────────────
34
+
35
+ # Enabled extractors. Default set covers the most common Rails layers.
36
+ # See CONFIGURATION_REFERENCE.md for the full list of available symbols.
37
+ # config.extractors = %i[
38
+ # models controllers services components view_components
39
+ # jobs mailers graphql serializers managers policies validators
40
+ # rails_source
41
+ # ]
42
+
43
+ # Include Rails / gem source in the index (increases extraction time).
44
+ # config.include_framework_sources = true
45
+
46
+ # Enable parallel extraction (experimental — may conflict with some apps).
47
+ # config.concurrent_extraction = false
48
+
49
+ # ── Embedding ───────────────────────────────────────────────────────────
50
+
51
+ # Embedding provider: :openai or :ollama
52
+ # config.embedding_provider = :openai
53
+ # config.embedding_model = 'text-embedding-3-small'
54
+ # config.embedding_options = { api_key: ENV['OPENAI_API_KEY'] }
55
+
56
+ # Ollama (local, no API key needed). `num_ctx` is auto-selected per model
57
+ # (nomic-embed-text → 2048, bge-m3 → 8192). Install `gem "tokenizers"` for
58
+ # exact BERT WordPiece token counting on dense Ruby source. See
59
+ # docs/EMBEDDING_MODELS.md for the full model comparison.
60
+ # config.embedding_provider = :ollama
61
+ # config.embedding_options = {
62
+ # model: 'nomic-embed-text',
63
+ # host: ENV.fetch('OLLAMA_URL', 'http://localhost:11434')
64
+ # }
65
+
66
+ # ── Storage ─────────────────────────────────────────────────────────────
67
+
68
+ # Vector store: :in_memory, :pgvector (PostgreSQL), :qdrant
69
+ # config.vector_store = :in_memory
70
+
71
+ # pgvector — run `rails generate woods:pgvector && rails db:migrate` first.
72
+ # config.vector_store = :pgvector
73
+ # config.vector_store_options = {
74
+ # connection: ActiveRecord::Base.connection,
75
+ # dimensions: 1_536
76
+ # }
77
+
78
+ # Qdrant:
79
+ # config.vector_store = :qdrant
80
+ # config.vector_store_options = {
81
+ # url: ENV.fetch('QDRANT_URL', 'http://localhost:6333'),
82
+ # collection: 'woods',
83
+ # dimensions: 1_536
84
+ # }
85
+
86
+ # Metadata store: :in_memory, :sqlite
87
+ # config.metadata_store = :in_memory
88
+ # config.metadata_store_options = {
89
+ # database: Rails.root.join('tmp/woods/metadata.sqlite3').to_s
90
+ # }
91
+
92
+ # ── Pipeline ────────────────────────────────────────────────────────────
93
+
94
+ # Pre-compute per-action request flow maps during extraction (slow).
95
+ # config.precompute_flows = false
96
+
97
+ # Extract link_to / redirect_to / form_action navigation edges.
98
+ # config.extract_navigation_edges = true
99
+
100
+ # Temporal snapshots — requires migrations 004+005.
101
+ # config.enable_snapshots = false
102
+
103
+ # ── Console MCP ─────────────────────────────────────────────────────────
104
+ #
105
+ # The Console MCP server lets AI tools query your live Rails app.
106
+ # It is DISABLED by default. Enable only after reviewing the security
107
+ # documentation in docs/CONSOLE_MCP_SETUP.md.
108
+ #
109
+ # Defense layers (all active by default when the server is on):
110
+ # Layer 1 — SqlValidator: rejects DML/DDL before any DB interaction.
111
+ # Layer 2 — SafeContext: wraps every request in a rolled-back transaction;
112
+ # writes are silently discarded even if Layer 1 is bypassed.
113
+ # Layer 3 — Column redaction: credential columns are replaced with
114
+ # [REDACTED] in every tool response.
115
+
116
+ # config.console_mcp_enabled = false
117
+ # config.console_mcp_path = '/mcp/console'
118
+
119
+ # Credential-column redaction (Layer 3).
120
+ # Starts from a safe default list (passwords, tokens, secrets).
121
+ # Extend: Woods::DEFAULT_CONSOLE_REDACTED_COLUMNS + %w[my_secret_col]
122
+ # Override entirely to remove a default:
123
+ # config.console_redacted_columns = %w[password token api_key]
124
+
125
+ # Key-value pairs where the value should be redacted (e.g., env var names).
126
+ # config.console_redacted_key_values = []
127
+
128
+ # Tables completely blocked from queries.
129
+ # config.console_blocked_tables = []
130
+
131
+ # Disable specific scanner patterns (rare — prefer blocked_tables).
132
+ # config.console_disabled_scanner_patterns = []
133
+
134
+ # Allow the AI console to execute Ruby eval (off by default; very dangerous).
135
+ # config.console_unsafe_eval_enabled = false
136
+
137
+ # Expose SQL/query read tools inside embedded console (adds read-only DB
138
+ # access via the rake task or Docker exec path; SqlValidator still applies).
139
+ # config.console_embedded_read_tools = false
140
+
141
+ # ── Caching ─────────────────────────────────────────────────────────────
142
+
143
+ # Cache embedding and retrieval responses to reduce API cost.
144
+ # config.cache_enabled = false
145
+ # config.cache_store = :redis # :redis, :solid_cache, :memory
146
+ # config.cache_options = { redis: Redis.new(url: ENV['REDIS_URL']) }
147
+
148
+ # ── Notion Export ───────────────────────────────────────────────────────
149
+
150
+ # config.notion_api_token = ENV['NOTION_API_TOKEN']
151
+ # config.notion_database_ids = {
152
+ # data_models: 'your-database-id',
153
+ # columns: 'your-database-id'
154
+ # }
155
+ end
data/lib/tasks/woods.rake CHANGED
@@ -354,33 +354,11 @@ namespace :woods do
354
354
  desc 'Embed all extracted units'
355
355
  task embed: :environment do
356
356
  require 'woods'
357
- require 'woods/embedding/indexer'
358
- require 'woods/embedding/text_preparer'
359
- require 'woods/embedding/provider'
360
- require 'woods/storage/vector_store'
361
-
362
- config = Woods.configuration
363
- output_dir = ENV.fetch('WOODS_OUTPUT', config.output_dir)
364
-
365
- provider = Woods::Embedding::Provider::Ollama.new
366
- text_preparer = Woods::Embedding::TextPreparer.new
367
- vector_store = Woods::Storage::VectorStore::InMemory.new
368
-
369
- indexer = Woods::Embedding::Indexer.new(
370
- provider: provider,
371
- text_preparer: text_preparer,
372
- vector_store: vector_store,
373
- output_dir: output_dir
374
- )
357
+ require 'woods/tasks'
375
358
 
359
+ indexer = Woods::Tasks.build_embed_indexer
376
360
  puts 'Embedding all extracted units...'
377
- stats = indexer.index_all
378
-
379
- puts
380
- puts 'Embedding complete!'
381
- puts " Processed: #{stats[:processed]}"
382
- puts " Skipped: #{stats[:skipped]}"
383
- puts " Errors: #{stats[:errors]}"
361
+ Woods::Tasks.print_embed_stats(indexer.index_all, mode: :full)
384
362
  end
385
363
 
386
364
  desc 'Nest the data — embed all units (alias for embed)'
@@ -389,33 +367,11 @@ namespace :woods do
389
367
  desc 'Embed changed units only (incremental)'
390
368
  task embed_incremental: :environment do
391
369
  require 'woods'
392
- require 'woods/embedding/indexer'
393
- require 'woods/embedding/text_preparer'
394
- require 'woods/embedding/provider'
395
- require 'woods/storage/vector_store'
396
-
397
- config = Woods.configuration
398
- output_dir = ENV.fetch('WOODS_OUTPUT', config.output_dir)
399
-
400
- provider = Woods::Embedding::Provider::Ollama.new
401
- text_preparer = Woods::Embedding::TextPreparer.new
402
- vector_store = Woods::Storage::VectorStore::InMemory.new
403
-
404
- indexer = Woods::Embedding::Indexer.new(
405
- provider: provider,
406
- text_preparer: text_preparer,
407
- vector_store: vector_store,
408
- output_dir: output_dir
409
- )
370
+ require 'woods/tasks'
410
371
 
372
+ indexer = Woods::Tasks.build_embed_indexer
411
373
  puts 'Embedding changed units (incremental)...'
412
- stats = indexer.index_incremental
413
-
414
- puts
415
- puts 'Incremental embedding complete!'
416
- puts " Processed: #{stats[:processed]}"
417
- puts " Skipped: #{stats[:skipped]}"
418
- puts " Errors: #{stats[:errors]}"
374
+ Woods::Tasks.print_embed_stats(indexer.index_incremental, mode: :incremental)
419
375
  end
420
376
 
421
377
  desc 'Hone the blade — incremental embedding (alias for embed_incremental)'
@@ -672,4 +628,13 @@ namespace :woods do
672
628
 
673
629
  desc 'Relay findings to Unblocked (alias for unblocked_sync)'
674
630
  task relay: :unblocked_sync
631
+
632
+ desc 'Generate a random bearer token for woods-mcp-http (WOODS_MCP_HTTP_TOKEN)'
633
+ task :generate_token do
634
+ require 'securerandom'
635
+ token = SecureRandom.hex(32)
636
+ puts token
637
+ warn 'Set WOODS_MCP_HTTP_TOKEN to this value in the environment where woods-mcp-http runs,'
638
+ warn 'and send it as `Authorization: Bearer <token>` from clients.'
639
+ end
675
640
  end
data/lib/woods/builder.rb CHANGED
@@ -8,6 +8,10 @@ require_relative 'storage/metadata_store'
8
8
  require_relative 'storage/graph_store'
9
9
  require_relative 'embedding/provider'
10
10
  require_relative 'embedding/openai'
11
+ require_relative 'embedding/text_preparer'
12
+ require_relative 'embedding/token_counter'
13
+ require_relative 'token_utils'
14
+ require_relative 'chunking/semantic_chunker'
11
15
 
12
16
  module Woods
13
17
  # Builder reads a {Configuration} and instantiates the appropriate adapters,
@@ -29,9 +33,13 @@ module Woods
29
33
  class Builder # rubocop:disable Metrics/ClassLength
30
34
  # Named presets mapping to default adapter types.
31
35
  #
32
- # :local — fully local, no external services required
33
- # :postgresql — pgvector for vectors, OpenAI for embeddings
34
- # :production — Qdrant for vectors, OpenAI for embeddings
36
+ # :local — fully local, no external services required (requires sqlite3 gem)
37
+ # :shared_filesystem — Shape 2: rake embed → separate MCP server reads from disk.
38
+ # All stores in-memory + persisted to output_dir via the
39
+ # Snapshotter. No sqlite3 gem needed. Requires output_dir set
40
+ # AND readable by both the embed process and the MCP server.
41
+ # :postgresql — pgvector for vectors, OpenAI for embeddings
42
+ # :production — Qdrant for vectors, OpenAI for embeddings
35
43
  PRESETS = {
36
44
  local: {
37
45
  vector_store: :in_memory,
@@ -39,6 +47,12 @@ module Woods
39
47
  graph_store: :in_memory,
40
48
  embedding_provider: :ollama
41
49
  },
50
+ shared_filesystem: {
51
+ vector_store: :in_memory,
52
+ metadata_store: :in_memory,
53
+ graph_store: :in_memory,
54
+ embedding_provider: :ollama
55
+ },
42
56
  postgresql: {
43
57
  vector_store: :pgvector,
44
58
  metadata_store: :sqlite,
@@ -78,17 +92,30 @@ module Woods
78
92
  # {Cache::CachedEmbeddingProvider} and the retriever is wrapped with
79
93
  # {Cache::CachedRetriever} for transparent caching of expensive operations.
80
94
  #
95
+ # Callers that need stores pre-populated from a dump (the Shape-2
96
+ # MCP-serve path) can inject them via +vector_store:+ / +metadata_store:+.
97
+ # Without these, fresh empty stores are constructed from config. This
98
+ # is how the Bootstrapper hydrates from `Snapshotter.load_or_empty`
99
+ # without Builder needing to know the Snapshotter exists.
100
+ #
101
+ # @param vector_store [Storage::VectorStore::Interface, nil]
102
+ # @param metadata_store [Storage::MetadataStore::Interface, nil]
103
+ # @param graph_store [Storage::GraphStore::Interface, nil] Pre-populated
104
+ # graph store. Without this, the retriever gets a fresh empty graph,
105
+ # which silently degrades +:hybrid+ retrieval (graph expansion returns
106
+ # no candidates). The Bootstrapper hydrates from +dependency_graph.json+
107
+ # on disk and passes the populated store here.
81
108
  # @return [Retriever, Cache::CachedRetriever] A fully wired retriever
82
- def build_retriever
109
+ def build_retriever(vector_store: nil, metadata_store: nil, graph_store: nil)
83
110
  provider = build_embedding_provider
84
111
  cache = build_cache_store
85
112
 
86
113
  provider = wrap_with_embedding_cache(provider, cache) if cache
87
114
 
88
115
  retriever = Retriever.new(
89
- vector_store: build_vector_store,
90
- metadata_store: build_metadata_store,
91
- graph_store: build_graph_store,
116
+ vector_store: vector_store || build_vector_store,
117
+ metadata_store: metadata_store || build_metadata_store,
118
+ graph_store: graph_store || build_graph_store,
92
119
  embedding_provider: provider
93
120
  )
94
121
 
@@ -110,18 +137,154 @@ module Woods
110
137
 
111
138
  # Instantiate the embedding provider specified by the configuration.
112
139
  #
140
+ # Strips `embedding_options` keys that belong to the ResolvedConfig layer
141
+ # (like `:dimension`) before splatting into the provider's constructor —
142
+ # those keys are useful for the Snapshotter's schema header but
143
+ # aren't part of the provider's API.
144
+ #
113
145
  # @return [Embedding::Provider::Interface] Embedding provider instance
114
146
  # @raise [ArgumentError] if the configured type is not recognized
115
147
  def build_embedding_provider
148
+ opts = provider_kwargs
116
149
  case @config.embedding_provider
117
- when :openai then Embedding::Provider::OpenAI.new(**(@config.embedding_options || {}))
118
- when :ollama then Embedding::Provider::Ollama.new(**(@config.embedding_options || {}))
150
+ when :openai then Embedding::Provider::OpenAI.new(**opts)
151
+ when :ollama then Embedding::Provider::Ollama.new(**opts)
119
152
  else raise ArgumentError, "Unknown embedding_provider: #{@config.embedding_provider}"
120
153
  end
121
154
  end
122
155
 
156
+ # Kwargs accepted by embedding provider constructors — everything in
157
+ # `embedding_options` except metadata fields that live there for
158
+ # ResolvedConfig bookkeeping.
159
+ SNAPSHOT_ONLY_KEYS = %i[dimension].freeze
160
+ private_constant :SNAPSHOT_ONLY_KEYS
161
+
162
+ def provider_kwargs
163
+ opts = (@config.embedding_options || {}).transform_keys(&:to_sym)
164
+ SNAPSHOT_ONLY_KEYS.each { |k| opts.delete(k) }
165
+ opts
166
+ end
167
+ private :provider_kwargs
168
+
169
+ # Build a {Embedding::TextPreparer} calibrated to a given provider.
170
+ #
171
+ # OpenAI embedders use tiktoken (cl100k_base) — 4.0 chars/token is a
172
+ # good conservative average. Ollama BERT/WordPiece tokenizers
173
+ # (nomic-embed-text, bge-*) run much hotter on dense Ruby/Rails
174
+ # source — long CamelCase constants, docstrings, callback DSLs, and
175
+ # heavy symbol use all sit below 2.0 chars/token in practice.
176
+ # Empirically, a 16 KB chunk of `ActionMailer::Base` still blows the
177
+ # 8192-token budget at 2.0 chars/token, so we budget at 1.5 to stay
178
+ # clear of tokenizer surprises even on the densest Rails internals.
179
+ #
180
+ # `max_tokens` tracks the provider's actual input budget when it
181
+ # reports one, falling back to the TextPreparer default otherwise.
182
+ #
183
+ # @param provider [Embedding::Provider::Interface]
184
+ # @return [Embedding::TextPreparer]
185
+ def build_text_preparer(provider)
186
+ chars_per_token = chars_per_token_for(provider)
187
+ budget = provider.respond_to?(:max_input_tokens) ? provider.max_input_tokens : nil
188
+ max_tokens = budget || Embedding::TextPreparer::DEFAULT_MAX_TOKENS
189
+
190
+ Embedding::TextPreparer.new(max_tokens: max_tokens, chars_per_token: chars_per_token)
191
+ end
192
+
193
+ # Build a {Chunking::SemanticChunker} sized to a given provider.
194
+ #
195
+ # `max_chars` is derived from the provider's input budget and the
196
+ # matching chars-per-token ratio, minus the context-prefix
197
+ # allowance the Indexer accounts for separately. Units that exceed
198
+ # this ceiling get sliced so no single chunk can blow the provider's
199
+ # input cap.
200
+ #
201
+ # For Ollama (and other BERT/WordPiece-backed models), char-based
202
+ # estimation is unreliable — CamelCase, `::` separators, and symbol
203
+ # literals tokenize much denser than chars/token averages suggest.
204
+ # When the optional `tokenizers` gem is installed, pass a
205
+ # {Embedding::TokenCounter} and `max_tokens` so the chunker can
206
+ # verify every slice with the real tokenizer and re-split any piece
207
+ # that still exceeds `num_ctx`. See docs/EMBEDDING_MODELS.md.
208
+ #
209
+ # Ollama v0.13.5+ stopped honouring `truncate: true` on `/api/embed`
210
+ # (ollama/ollama#14186), so any chunk that exceeds `num_ctx` returns
211
+ # a 400 rather than being silently truncated. Exact client-side
212
+ # sizing is the only reliable path until the regression is fixed
213
+ # upstream.
214
+ #
215
+ # @param provider [Embedding::Provider::Interface]
216
+ # @return [Chunking::SemanticChunker]
217
+ def build_chunker(provider)
218
+ budget = provider.respond_to?(:max_input_tokens) ? provider.max_input_tokens : nil
219
+ max_chars = ((budget * chars_per_token_for(provider)).floor - CHUNKER_PREFIX_ALLOWANCE if budget)
220
+
221
+ # Guard against a budget so small that the prefix allowance leaves
222
+ # no room for content. Without this, SemanticChunker#slice_by_lines
223
+ # passes a negative repeat count to String#scan, which returns []
224
+ # — every chunk becomes empty and is silently dropped, producing
225
+ # zero embeddings with no error. Surface the misconfiguration loudly.
226
+ raise ArgumentError, chunker_budget_message(provider, budget) if max_chars && max_chars <= 0
227
+
228
+ token_counter = token_counter_for(provider)
229
+ max_tokens = token_counter && budget ? budget - PREFIX_TOKEN_ALLOWANCE : nil
230
+
231
+ Chunking::SemanticChunker.new(
232
+ max_chars: max_chars,
233
+ token_counter: token_counter,
234
+ max_tokens: max_tokens
235
+ )
236
+ end
237
+
238
+ # Character allowance reserved for the TextPreparer context prefix
239
+ # ([type] id / namespace / file / deps) — kept in sync with the
240
+ # Indexer's own PREFIX_CHAR_ALLOWANCE constant.
241
+ CHUNKER_PREFIX_ALLOWANCE = 512
242
+ private_constant :CHUNKER_PREFIX_ALLOWANCE
243
+
244
+ # Token-side sibling of {CHUNKER_PREFIX_ALLOWANCE}. Reserved for the
245
+ # TextPreparer prefix when tokenizer-driven sizing is active — a bit
246
+ # generous to cover long file paths and dep lists.
247
+ PREFIX_TOKEN_ALLOWANCE = 256
248
+ private_constant :PREFIX_TOKEN_ALLOWANCE
249
+
123
250
  private
124
251
 
252
+ # Return a TokenCounter for providers that benefit from exact token
253
+ # counting. OpenAI's tiktoken ratios are already stable at 4.0
254
+ # chars/token on code, so it doesn't need this.
255
+ #
256
+ # @param provider [Embedding::Provider::Interface]
257
+ # @return [Embedding::TokenCounter, nil]
258
+ def token_counter_for(provider)
259
+ return unless provider.is_a?(Embedding::Provider::Ollama)
260
+
261
+ Embedding::TokenCounter.new
262
+ end
263
+
264
+ # Tokenizer-calibrated chars/token ratio for the given provider.
265
+ # Delegates to {Woods::TokenUtils.chars_per_token_for} — the single
266
+ # source of truth — after reducing the provider instance to a symbol.
267
+ #
268
+ # @param provider [Embedding::Provider::Interface]
269
+ # @return [Float]
270
+ def chars_per_token_for(provider)
271
+ symbol = case provider
272
+ when Embedding::Provider::Ollama then :ollama
273
+ else :openai
274
+ end
275
+ TokenUtils.chars_per_token_for(symbol)
276
+ end
277
+
278
+ # Diagnostic for the build_chunker budget guard.
279
+ def chunker_budget_message(provider, budget)
280
+ "embedding model '#{provider.respond_to?(:model) ? provider.model : provider.class}' " \
281
+ "reports a max_input_tokens of #{budget}, which leaves no room for " \
282
+ "the chunk prefix (#{CHUNKER_PREFIX_ALLOWANCE} chars). Configure a " \
283
+ 'model with a larger native context, or set num_ctx explicitly.'
284
+ end
285
+
286
+ public
287
+
125
288
  # Instantiate the metadata store adapter specified by the configuration.
126
289
  #
127
290
  # @return [Storage::MetadataStore::Interface] Metadata store adapter instance
@@ -145,6 +308,8 @@ module Woods
145
308
  end
146
309
  end
147
310
 
311
+ private
312
+
148
313
  # Build a cache store from configuration, or nil if caching is disabled.
149
314
  #
150
315
  # @return [Cache::CacheStore, nil]