woods 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +89 -0
- data/CODE_OF_CONDUCT.md +83 -0
- data/CONTRIBUTING.md +65 -0
- data/LICENSE.txt +21 -0
- data/README.md +406 -0
- data/exe/woods-console +59 -0
- data/exe/woods-console-mcp +22 -0
- data/exe/woods-mcp +34 -0
- data/exe/woods-mcp-http +37 -0
- data/exe/woods-mcp-start +58 -0
- data/lib/generators/woods/install_generator.rb +32 -0
- data/lib/generators/woods/pgvector_generator.rb +37 -0
- data/lib/generators/woods/templates/add_pgvector_to_woods.rb.erb +15 -0
- data/lib/generators/woods/templates/create_woods_tables.rb.erb +43 -0
- data/lib/tasks/woods.rake +621 -0
- data/lib/tasks/woods_evaluation.rake +115 -0
- data/lib/woods/ast/call_site_extractor.rb +106 -0
- data/lib/woods/ast/method_extractor.rb +71 -0
- data/lib/woods/ast/node.rb +116 -0
- data/lib/woods/ast/parser.rb +614 -0
- data/lib/woods/ast.rb +6 -0
- data/lib/woods/builder.rb +200 -0
- data/lib/woods/cache/cache_middleware.rb +199 -0
- data/lib/woods/cache/cache_store.rb +264 -0
- data/lib/woods/cache/redis_cache_store.rb +116 -0
- data/lib/woods/cache/solid_cache_store.rb +111 -0
- data/lib/woods/chunking/chunk.rb +84 -0
- data/lib/woods/chunking/semantic_chunker.rb +295 -0
- data/lib/woods/console/adapters/cache_adapter.rb +58 -0
- data/lib/woods/console/adapters/good_job_adapter.rb +33 -0
- data/lib/woods/console/adapters/job_adapter.rb +68 -0
- data/lib/woods/console/adapters/sidekiq_adapter.rb +33 -0
- data/lib/woods/console/adapters/solid_queue_adapter.rb +33 -0
- data/lib/woods/console/audit_logger.rb +75 -0
- data/lib/woods/console/bridge.rb +177 -0
- data/lib/woods/console/confirmation.rb +90 -0
- data/lib/woods/console/connection_manager.rb +173 -0
- data/lib/woods/console/console_response_renderer.rb +74 -0
- data/lib/woods/console/embedded_executor.rb +373 -0
- data/lib/woods/console/model_validator.rb +81 -0
- data/lib/woods/console/rack_middleware.rb +87 -0
- data/lib/woods/console/safe_context.rb +82 -0
- data/lib/woods/console/server.rb +612 -0
- data/lib/woods/console/sql_validator.rb +172 -0
- data/lib/woods/console/tools/tier1.rb +118 -0
- data/lib/woods/console/tools/tier2.rb +117 -0
- data/lib/woods/console/tools/tier3.rb +110 -0
- data/lib/woods/console/tools/tier4.rb +79 -0
- data/lib/woods/coordination/pipeline_lock.rb +109 -0
- data/lib/woods/cost_model/embedding_cost.rb +88 -0
- data/lib/woods/cost_model/estimator.rb +128 -0
- data/lib/woods/cost_model/provider_pricing.rb +67 -0
- data/lib/woods/cost_model/storage_cost.rb +52 -0
- data/lib/woods/cost_model.rb +22 -0
- data/lib/woods/db/migrations/001_create_units.rb +38 -0
- data/lib/woods/db/migrations/002_create_edges.rb +35 -0
- data/lib/woods/db/migrations/003_create_embeddings.rb +37 -0
- data/lib/woods/db/migrations/004_create_snapshots.rb +45 -0
- data/lib/woods/db/migrations/005_create_snapshot_units.rb +40 -0
- data/lib/woods/db/migrations/006_rename_tables.rb +34 -0
- data/lib/woods/db/migrator.rb +73 -0
- data/lib/woods/db/schema_version.rb +73 -0
- data/lib/woods/dependency_graph.rb +236 -0
- data/lib/woods/embedding/indexer.rb +140 -0
- data/lib/woods/embedding/openai.rb +126 -0
- data/lib/woods/embedding/provider.rb +162 -0
- data/lib/woods/embedding/text_preparer.rb +112 -0
- data/lib/woods/evaluation/baseline_runner.rb +115 -0
- data/lib/woods/evaluation/evaluator.rb +139 -0
- data/lib/woods/evaluation/metrics.rb +79 -0
- data/lib/woods/evaluation/query_set.rb +148 -0
- data/lib/woods/evaluation/report_generator.rb +90 -0
- data/lib/woods/extracted_unit.rb +145 -0
- data/lib/woods/extractor.rb +1028 -0
- data/lib/woods/extractors/action_cable_extractor.rb +201 -0
- data/lib/woods/extractors/ast_source_extraction.rb +46 -0
- data/lib/woods/extractors/behavioral_profile.rb +309 -0
- data/lib/woods/extractors/caching_extractor.rb +261 -0
- data/lib/woods/extractors/callback_analyzer.rb +246 -0
- data/lib/woods/extractors/concern_extractor.rb +292 -0
- data/lib/woods/extractors/configuration_extractor.rb +219 -0
- data/lib/woods/extractors/controller_extractor.rb +404 -0
- data/lib/woods/extractors/database_view_extractor.rb +278 -0
- data/lib/woods/extractors/decorator_extractor.rb +253 -0
- data/lib/woods/extractors/engine_extractor.rb +223 -0
- data/lib/woods/extractors/event_extractor.rb +211 -0
- data/lib/woods/extractors/factory_extractor.rb +289 -0
- data/lib/woods/extractors/graphql_extractor.rb +892 -0
- data/lib/woods/extractors/i18n_extractor.rb +117 -0
- data/lib/woods/extractors/job_extractor.rb +374 -0
- data/lib/woods/extractors/lib_extractor.rb +218 -0
- data/lib/woods/extractors/mailer_extractor.rb +269 -0
- data/lib/woods/extractors/manager_extractor.rb +188 -0
- data/lib/woods/extractors/middleware_extractor.rb +133 -0
- data/lib/woods/extractors/migration_extractor.rb +469 -0
- data/lib/woods/extractors/model_extractor.rb +988 -0
- data/lib/woods/extractors/phlex_extractor.rb +252 -0
- data/lib/woods/extractors/policy_extractor.rb +191 -0
- data/lib/woods/extractors/poro_extractor.rb +229 -0
- data/lib/woods/extractors/pundit_extractor.rb +223 -0
- data/lib/woods/extractors/rails_source_extractor.rb +473 -0
- data/lib/woods/extractors/rake_task_extractor.rb +343 -0
- data/lib/woods/extractors/route_extractor.rb +181 -0
- data/lib/woods/extractors/scheduled_job_extractor.rb +331 -0
- data/lib/woods/extractors/serializer_extractor.rb +339 -0
- data/lib/woods/extractors/service_extractor.rb +217 -0
- data/lib/woods/extractors/shared_dependency_scanner.rb +91 -0
- data/lib/woods/extractors/shared_utility_methods.rb +281 -0
- data/lib/woods/extractors/state_machine_extractor.rb +398 -0
- data/lib/woods/extractors/test_mapping_extractor.rb +225 -0
- data/lib/woods/extractors/validator_extractor.rb +211 -0
- data/lib/woods/extractors/view_component_extractor.rb +311 -0
- data/lib/woods/extractors/view_template_extractor.rb +261 -0
- data/lib/woods/feedback/gap_detector.rb +89 -0
- data/lib/woods/feedback/store.rb +119 -0
- data/lib/woods/filename_utils.rb +32 -0
- data/lib/woods/flow_analysis/operation_extractor.rb +206 -0
- data/lib/woods/flow_analysis/response_code_mapper.rb +154 -0
- data/lib/woods/flow_assembler.rb +290 -0
- data/lib/woods/flow_document.rb +191 -0
- data/lib/woods/flow_precomputer.rb +102 -0
- data/lib/woods/formatting/base.rb +30 -0
- data/lib/woods/formatting/claude_adapter.rb +98 -0
- data/lib/woods/formatting/generic_adapter.rb +56 -0
- data/lib/woods/formatting/gpt_adapter.rb +64 -0
- data/lib/woods/formatting/human_adapter.rb +78 -0
- data/lib/woods/graph_analyzer.rb +374 -0
- data/lib/woods/mcp/bootstrapper.rb +96 -0
- data/lib/woods/mcp/index_reader.rb +394 -0
- data/lib/woods/mcp/renderers/claude_renderer.rb +81 -0
- data/lib/woods/mcp/renderers/json_renderer.rb +17 -0
- data/lib/woods/mcp/renderers/markdown_renderer.rb +353 -0
- data/lib/woods/mcp/renderers/plain_renderer.rb +240 -0
- data/lib/woods/mcp/server.rb +962 -0
- data/lib/woods/mcp/tool_response_renderer.rb +85 -0
- data/lib/woods/model_name_cache.rb +51 -0
- data/lib/woods/notion/client.rb +217 -0
- data/lib/woods/notion/exporter.rb +219 -0
- data/lib/woods/notion/mapper.rb +40 -0
- data/lib/woods/notion/mappers/column_mapper.rb +57 -0
- data/lib/woods/notion/mappers/migration_mapper.rb +39 -0
- data/lib/woods/notion/mappers/model_mapper.rb +161 -0
- data/lib/woods/notion/mappers/shared.rb +22 -0
- data/lib/woods/notion/rate_limiter.rb +68 -0
- data/lib/woods/observability/health_check.rb +79 -0
- data/lib/woods/observability/instrumentation.rb +34 -0
- data/lib/woods/observability/structured_logger.rb +57 -0
- data/lib/woods/operator/error_escalator.rb +81 -0
- data/lib/woods/operator/pipeline_guard.rb +92 -0
- data/lib/woods/operator/status_reporter.rb +80 -0
- data/lib/woods/railtie.rb +38 -0
- data/lib/woods/resilience/circuit_breaker.rb +99 -0
- data/lib/woods/resilience/index_validator.rb +167 -0
- data/lib/woods/resilience/retryable_provider.rb +108 -0
- data/lib/woods/retrieval/context_assembler.rb +261 -0
- data/lib/woods/retrieval/query_classifier.rb +133 -0
- data/lib/woods/retrieval/ranker.rb +277 -0
- data/lib/woods/retrieval/search_executor.rb +316 -0
- data/lib/woods/retriever.rb +152 -0
- data/lib/woods/ruby_analyzer/class_analyzer.rb +170 -0
- data/lib/woods/ruby_analyzer/dataflow_analyzer.rb +77 -0
- data/lib/woods/ruby_analyzer/fqn_builder.rb +18 -0
- data/lib/woods/ruby_analyzer/mermaid_renderer.rb +280 -0
- data/lib/woods/ruby_analyzer/method_analyzer.rb +143 -0
- data/lib/woods/ruby_analyzer/trace_enricher.rb +143 -0
- data/lib/woods/ruby_analyzer.rb +87 -0
- data/lib/woods/session_tracer/file_store.rb +104 -0
- data/lib/woods/session_tracer/middleware.rb +143 -0
- data/lib/woods/session_tracer/redis_store.rb +106 -0
- data/lib/woods/session_tracer/session_flow_assembler.rb +254 -0
- data/lib/woods/session_tracer/session_flow_document.rb +223 -0
- data/lib/woods/session_tracer/solid_cache_store.rb +139 -0
- data/lib/woods/session_tracer/store.rb +81 -0
- data/lib/woods/storage/graph_store.rb +120 -0
- data/lib/woods/storage/metadata_store.rb +196 -0
- data/lib/woods/storage/pgvector.rb +195 -0
- data/lib/woods/storage/qdrant.rb +205 -0
- data/lib/woods/storage/vector_store.rb +167 -0
- data/lib/woods/temporal/json_snapshot_store.rb +245 -0
- data/lib/woods/temporal/snapshot_store.rb +345 -0
- data/lib/woods/token_utils.rb +19 -0
- data/lib/woods/version.rb +5 -0
- data/lib/woods.rb +246 -0
- metadata +270 -0
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'retriever'
|
|
4
|
+
require_relative 'storage/vector_store'
|
|
5
|
+
require_relative 'storage/pgvector'
|
|
6
|
+
require_relative 'storage/qdrant'
|
|
7
|
+
require_relative 'storage/metadata_store'
|
|
8
|
+
require_relative 'storage/graph_store'
|
|
9
|
+
require_relative 'embedding/provider'
|
|
10
|
+
require_relative 'embedding/openai'
|
|
11
|
+
|
|
12
|
+
module Woods
|
|
13
|
+
# Builder reads a {Configuration} and instantiates the appropriate adapters,
|
|
14
|
+
# returning a fully wired {Retriever} ready for use.
|
|
15
|
+
#
|
|
16
|
+
# Named presets are provided for common deployment scenarios. All presets can
|
|
17
|
+
# be further customized with a block passed to {Woods.configure_with_preset}.
|
|
18
|
+
#
|
|
19
|
+
# @example Using a preset
|
|
20
|
+
# Woods.configure_with_preset(:local)
|
|
21
|
+
# result = Woods.retrieve("How does the User model work?")
|
|
22
|
+
#
|
|
23
|
+
# @example Using a preset with block customization
|
|
24
|
+
# Woods.configure_with_preset(:production) do |config|
|
|
25
|
+
# config.embedding_options = { api_key: ENV['OPENAI_API_KEY'] }
|
|
26
|
+
# config.vector_store_options = { url: ENV['QDRANT_URL'], collection: 'myapp' }
|
|
27
|
+
# end
|
|
28
|
+
#
|
|
29
|
+
class Builder # rubocop:disable Metrics/ClassLength
  # Named presets mapping to default adapter types.
  #
  #   :local      — fully local, no external services required
  #   :postgresql — pgvector for vectors, OpenAI for embeddings
  #   :production — Qdrant for vectors, OpenAI for embeddings
  PRESETS = {
    local: {
      vector_store: :in_memory,
      metadata_store: :sqlite,
      graph_store: :in_memory,
      embedding_provider: :ollama
    },
    postgresql: {
      vector_store: :pgvector,
      metadata_store: :sqlite,
      graph_store: :in_memory,
      embedding_provider: :openai
    },
    production: {
      vector_store: :qdrant,
      metadata_store: :sqlite,
      graph_store: :in_memory,
      embedding_provider: :openai
    }
  }.freeze

  # Build a {Configuration} populated with the named preset's adapter types.
  #
  # @param name [Symbol, String] Preset name — one of :local, :postgresql, or
  #   :production. A String (e.g. "local") is converted to a Symbol first.
  # @return [Configuration] A new Configuration with preset values applied
  # @raise [ArgumentError] if the preset name is not recognized
  def self.preset_config(name)
    # Normalize so that "local" and :local resolve to the same preset; anything
    # that cannot become a Symbol (e.g. nil) falls through to the error below.
    preset_name = name.respond_to?(:to_sym) ? name.to_sym : name
    preset = PRESETS.fetch(preset_name) do
      raise ArgumentError, "Unknown preset: #{name}. Valid: #{PRESETS.keys.join(', ')}"
    end

    config = Configuration.new
    preset.each { |attribute, value| config.public_send(:"#{attribute}=", value) }
    config
  end

  # @param config [Configuration] Configuration to read adapter types from
  def initialize(config = Woods.configuration)
    @config = config
  end

  # Build a {Retriever} wired with adapters from the configuration.
  #
  # When `cache_enabled` is true, the embedding provider is wrapped with
  # {Cache::CachedEmbeddingProvider} and the retriever is wrapped with
  # {Cache::CachedRetriever} for transparent caching of expensive operations.
  #
  # @return [Retriever, Cache::CachedRetriever] A fully wired retriever
  def build_retriever
    provider = build_embedding_provider
    cache = build_cache_store

    provider = wrap_with_embedding_cache(provider, cache) if cache

    retriever = Retriever.new(
      vector_store: build_vector_store,
      metadata_store: build_metadata_store,
      graph_store: build_graph_store,
      embedding_provider: provider
    )

    cache ? wrap_with_retriever_cache(retriever, cache) : retriever
  end

  # Instantiate the vector store adapter specified by the configuration.
  #
  # @return [Storage::VectorStore::Interface] Vector store adapter instance
  # @raise [ArgumentError] if the configured type is not recognized
  def build_vector_store
    case @config.vector_store
    when :in_memory then Storage::VectorStore::InMemory.new
    when :pgvector then Storage::VectorStore::Pgvector.new(**(@config.vector_store_options || {}))
    when :qdrant then Storage::VectorStore::Qdrant.new(**(@config.vector_store_options || {}))
    else raise ArgumentError, "Unknown vector_store: #{@config.vector_store}"
    end
  end

  # Instantiate the embedding provider specified by the configuration.
  #
  # @return [Embedding::Provider::Interface] Embedding provider instance
  # @raise [ArgumentError] if the configured type is not recognized
  def build_embedding_provider
    case @config.embedding_provider
    when :openai then Embedding::Provider::OpenAI.new(**(@config.embedding_options || {}))
    when :ollama then Embedding::Provider::Ollama.new(**(@config.embedding_options || {}))
    else raise ArgumentError, "Unknown embedding_provider: #{@config.embedding_provider}"
    end
  end

  private

  # Instantiate the metadata store adapter specified by the configuration.
  #
  # @return [Storage::MetadataStore::Interface] Metadata store adapter instance
  # @raise [ArgumentError] if the configured type is not recognized
  def build_metadata_store
    case @config.metadata_store
    when :in_memory then Storage::MetadataStore::InMemory.new
    when :sqlite then Storage::MetadataStore::SQLite.new(**(@config.metadata_store_options || {}))
    else raise ArgumentError, "Unknown metadata_store: #{@config.metadata_store}"
    end
  end

  # Instantiate the graph store adapter specified by the configuration.
  #
  # @return [Storage::GraphStore::Interface] Graph store adapter instance
  # @raise [ArgumentError] if the configured type is not recognized
  def build_graph_store
    case @config.graph_store
    when :in_memory then Storage::GraphStore::Memory.new
    else raise ArgumentError, "Unknown graph_store: #{@config.graph_store}"
    end
  end

  # Build a cache store from configuration, or nil if caching is disabled.
  #
  # `cache_store` may be one of the symbols :memory, :redis, :solid_cache, or
  # an already constructed {Cache::CacheStore} instance, which is used as-is.
  #
  # @return [Cache::CacheStore, nil]
  # @raise [ArgumentError] if the configured cache store is not recognized
  # @raise [KeyError] if :redis / :solid_cache is selected without its client option
  def build_cache_store
    return nil unless @config.cache_enabled

    opts = @config.cache_options || {}

    case @config.cache_store
    when :memory
      Cache::InMemory.new(max_entries: opts.fetch(:max_entries, 500))
    when :redis
      # Loaded lazily so the Redis backend is only required when selected.
      require_relative 'cache/redis_cache_store'
      Cache::RedisCacheStore.new(redis: opts.fetch(:redis), default_ttl: opts[:default_ttl])
    when :solid_cache
      # Loaded lazily so the SolidCache backend is only required when selected.
      require_relative 'cache/solid_cache_store'
      Cache::SolidCacheStore.new(cache: opts.fetch(:cache), default_ttl: opts[:default_ttl])
    when Cache::CacheStore
      @config.cache_store
    else
      raise ArgumentError, "Unknown cache_store: #{@config.cache_store}"
    end
  end

  # Wrap an embedding provider with caching.
  #
  # @param provider [Embedding::Provider::Interface]
  # @param cache [Cache::CacheStore]
  # @return [Cache::CachedEmbeddingProvider]
  def wrap_with_embedding_cache(provider, cache)
    Cache::CachedEmbeddingProvider.new(
      provider: provider,
      cache_store: cache,
      ttl: cache_ttls.fetch(:embeddings, Cache::DEFAULT_TTLS[:embeddings])
    )
  end

  # Wrap a retriever with caching.
  #
  # @param retriever [Retriever]
  # @param cache [Cache::CacheStore]
  # @return [Cache::CachedRetriever]
  def wrap_with_retriever_cache(retriever, cache)
    Cache::CachedRetriever.new(
      retriever: retriever,
      cache_store: cache,
      context_ttl: cache_ttls.fetch(:context, Cache::DEFAULT_TTLS[:context])
    )
  end

  # Per-domain TTL overrides taken from `cache_options[:ttl]`.
  #
  # Returns an empty Hash when `cache_options` is nil, when it has no :ttl
  # key, or when :ttl is explicitly nil, so callers can always `fetch` on it.
  #
  # @return [Hash]
  def cache_ttls
    (@config.cache_options || {})[:ttl] || {}
  end
end
|
|
200
|
+
end
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'digest'
|
|
4
|
+
require_relative 'cache_store'
|
|
5
|
+
|
|
6
|
+
module Woods
|
|
7
|
+
module Cache
|
|
8
|
+
# Decorator that wraps an embedding provider with cache-through logic.
|
|
9
|
+
#
|
|
10
|
+
# Implements the same {Embedding::Provider::Interface} so it can be
|
|
11
|
+
# injected transparently in place of the real provider. On cache hit,
|
|
12
|
+
# the expensive API call (OpenAI, Ollama) is skipped entirely.
|
|
13
|
+
#
|
|
14
|
+
# @example
|
|
15
|
+
# real_provider = Embedding::Provider::OpenAI.new(api_key: key)
|
|
16
|
+
# cached = CachedEmbeddingProvider.new(provider: real_provider, cache_store: store)
|
|
17
|
+
# cached.embed("How does User work?") # API call + cache write
|
|
18
|
+
# cached.embed("How does User work?") # cache hit, no API call
|
|
19
|
+
#
|
|
20
|
+
class CachedEmbeddingProvider
  include Embedding::Provider::Interface

  # @param provider [Embedding::Provider::Interface] The real embedding provider
  # @param cache_store [CacheStore] Cache backend instance
  # @param ttl [Integer] TTL for cached embeddings in seconds
  def initialize(provider:, cache_store:, ttl: DEFAULT_TTLS[:embeddings])
    @provider = provider
    @cache_store = cache_store
    @ttl = ttl
  end

  # Embed a single text, returning a cached vector when available.
  #
  # @param text [String] Text to embed
  # @return [Array<Float>] Embedding vector
  def embed(text)
    @cache_store.fetch(embedding_key(text), ttl: @ttl) { @provider.embed(text) }
  end

  # Embed a batch of texts, using cached vectors for any previously seen texts.
  #
  # Only texts that are not already cached are sent to the real provider, and
  # each distinct uncached text is sent once even if it appears several times
  # in the batch. Results are merged back in original order.
  #
  # @param texts [Array<String>] Texts to embed
  # @return [Array<Array<Float>>] Embedding vectors (same order as input)
  def embed_batch(texts)
    results, misses, miss_indices = partition_cached(texts)
    return results if misses.empty?

    # De-duplicate so a repeated text costs one provider embedding, not many.
    unique_misses = misses.uniq
    fresh = unique_misses.zip(@provider.embed_batch(unique_misses)).to_h

    miss_indices.each_with_index do |result_idx, miss_idx|
      results[result_idx] = fresh[misses[miss_idx]]
    end
    fresh.each { |text, vector| cache_vector(text, vector) }

    results
  end

  # Delegate dimensions to the underlying provider.
  #
  # @return [Integer]
  def dimensions
    @provider.dimensions
  end

  # Delegate model_name to the underlying provider.
  #
  # @return [String]
  def model_name
    @provider.model_name
  end

  private

  # Split texts into cached hits and uncached misses.
  #
  # @param texts [Array<String>]
  # @return [Array(Array, Array<String>, Array<Integer>)] results array with
  #   hits filled in (nil at miss positions), the missed texts, and the
  #   positions of those misses within `texts`
  def partition_cached(texts)
    results = Array.new(texts.size)
    misses = []
    miss_indices = []

    texts.each_with_index do |text, idx|
      cached = @cache_store.read(embedding_key(text))
      if cached
        results[idx] = cached
      else
        misses << text
        miss_indices << idx
      end
    end

    [results, misses, miss_indices]
  end

  # Store a freshly computed vector; a failing cache must not fail the batch.
  #
  # @param text [String]
  # @param vector [Array<Float>]
  # @return [void]
  def cache_vector(text, vector)
    @cache_store.write(embedding_key(text), vector, ttl: @ttl)
  rescue StandardError => e
    warn("[Woods] CachedEmbeddingProvider cache write failed: #{e.message}")
  end

  # Build a cache key for an embedding text.
  #
  # The key covers both the provider's model name and the text digest, so
  # vectors produced by one model are never returned for another model that
  # shares the same cache backend.
  #
  # @param text [String]
  # @return [String]
  def embedding_key(text)
    Cache.cache_key(:embeddings, @provider.model_name.to_s, Digest::SHA256.hexdigest(text))
  end
end
|
|
112
|
+
|
|
113
|
+
# Decorator that wraps a {Retriever} with result caching.
|
|
114
|
+
#
|
|
115
|
+
# Caches the full formatted context output (the most token-expensive artifact)
|
|
116
|
+
# keyed by query + budget. Also caches the structural context overview
|
|
117
|
+
# separately with a longer TTL.
|
|
118
|
+
#
|
|
119
|
+
# @example
|
|
120
|
+
# retriever = Woods::Retriever.new(...)
|
|
121
|
+
# cached = CachedRetriever.new(retriever: retriever, cache_store: store)
|
|
122
|
+
# cached.retrieve("How does User work?") # full pipeline + cache
|
|
123
|
+
# cached.retrieve("How does User work?") # instant cache hit
|
|
124
|
+
#
|
|
125
|
+
class CachedRetriever
  # @param retriever [Retriever] The real retriever instance
  # @param cache_store [CacheStore] Cache backend instance
  # @param context_ttl [Integer] TTL for formatted context results
  def initialize(retriever:, cache_store:, context_ttl: DEFAULT_TTLS[:context])
    @retriever = retriever
    @cache_store = cache_store
    @context_ttl = context_ttl
  end

  # Execute the retrieval pipeline with context-level caching.
  #
  # On cache hit, returns a RetrievalResult reconstructed from cached data
  # without running any pipeline stages. On miss, delegates to the real
  # retriever and caches the serializable parts of the result.
  #
  # The cache is strictly best-effort: a failing read or write is reported
  # with `warn` and the real retriever's result is returned regardless.
  #
  # @param query [String] Natural language query
  # @param budget [Integer] Token budget
  # @return [Retriever::RetrievalResult]
  def retrieve(query, budget: 8000)
    key = context_key(query, budget)
    cached = read_cached(key)
    return rebuild_result(cached, budget) if cached

    result = @retriever.retrieve(query, budget: budget)
    write_cached(key, result)
    result
  end

  private

  # Read a cached payload, treating backend errors and malformed (non-Hash)
  # payloads as a cache miss.
  #
  # @param key [String]
  # @return [Hash, nil]
  def read_cached(key)
    payload = @cache_store.read(key)
    payload.is_a?(Hash) ? payload : nil
  rescue StandardError => e
    warn("[Woods] CachedRetriever cache read failed: #{e.message}")
    nil
  end

  # Persist the serializable parts of a result; failures are only logged.
  #
  # @param key [String]
  # @param result [Retriever::RetrievalResult]
  # @return [void]
  def write_cached(key, result)
    @cache_store.write(key, serialize_result(result), ttl: @context_ttl)
  rescue StandardError => e
    warn("[Woods] CachedRetriever cache write failed: #{e.message}")
  end

  # Reconstruct a RetrievalResult from a cached payload.
  #
  # Classification and trace are not cached, so they are nil on a hit; the
  # budget is the one passed to the current call.
  #
  # @param cached [Hash] Payload produced by {#serialize_result}
  # @param budget [Integer]
  # @return [Retriever::RetrievalResult]
  def rebuild_result(cached, budget)
    Retriever::RetrievalResult.new(
      context: cached['context'],
      sources: cached['sources'],
      classification: nil,
      strategy: cached['strategy']&.to_sym,
      tokens_used: cached['tokens_used'],
      budget: budget,
      trace: nil
    )
  end

  # Build a cache key for a context result.
  #
  # @param query [String]
  # @param budget [Integer]
  # @return [String]
  def context_key(query, budget)
    Cache.cache_key(:context, query, budget.to_s)
  end

  # Serialize a RetrievalResult to a JSON-safe hash.
  #
  # Only caches the fields needed to reconstruct a useful result:
  # context string, sources list, strategy, and token count.
  #
  # @param result [Retriever::RetrievalResult]
  # @return [Hash]
  def serialize_result(result)
    {
      'context' => result.context,
      'sources' => result.sources,
      'strategy' => result.strategy&.to_s,
      'tokens_used' => result.tokens_used
    }
  end
end
|
|
198
|
+
end
|
|
199
|
+
end
|
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'digest'
|
|
4
|
+
require 'json'
|
|
5
|
+
require 'logger'
|
|
6
|
+
|
|
7
|
+
module Woods
|
|
8
|
+
module Cache
|
|
9
|
+
# Default time-to-live, in seconds, for each cache domain.
#
# Embedding vectors are stable (same text → same vector), so they live for 24h.
# Metadata and structural context refresh on re-extraction, so they live for 1h.
# Search results and formatted context are session-scoped: 15 minutes.
DEFAULT_TTLS = {
  embeddings: 24 * 60 * 60,
  metadata: 60 * 60,
  structural: 60 * 60,
  search: 15 * 60,
  context: 15 * 60
}.freeze
|
|
21
|
+
|
|
22
|
+
# Compose the namespaced cache key for a domain from its raw parts.
#
# The parts are joined with ":". When the joined string is longer than 64
# characters it is replaced by its SHA256 hex digest, so keys stay bounded.
#
# @param domain [Symbol] Cache domain (:embeddings, :metadata, etc.)
# @param parts [Array<String>] Key components
# @return [String] Key of the form "woods:cache:<domain>:<suffix>"
def self.cache_key(domain, *parts)
  suffix = parts.join(':')
  suffix = Digest::SHA256.hexdigest(suffix) if suffix.length > 64
  "woods:cache:#{domain}:#{suffix}"
end
|
|
32
|
+
|
|
33
|
+
# Abstract cache store interface.
|
|
34
|
+
#
|
|
35
|
+
# All cache backends must implement these methods. The interface is modeled
|
|
36
|
+
# after ActiveSupport::Cache::Store for familiarity but kept minimal.
|
|
37
|
+
#
|
|
38
|
+
# @abstract Subclass and override all public methods.
|
|
39
|
+
class CacheStore
  # Read a value from the cache.
  #
  # @param key [String] Cache key
  # @return [Object, nil] Cached value or nil if missing/expired
  # @raise [NotImplementedError] unless overridden by the backend
  def read(key)
    raise NotImplementedError, "#{self.class} must implement #read"
  end

  # Write a value to the cache.
  #
  # @param key [String] Cache key
  # @param value [Object] Value to cache (must be JSON-serializable)
  # @param ttl [Integer, nil] Time-to-live in seconds (nil = use domain default)
  # @return [void]
  # @raise [NotImplementedError] unless overridden by the backend
  def write(key, value, ttl: nil)
    raise NotImplementedError, "#{self.class} must implement #write"
  end

  # Delete a key from the cache.
  #
  # @param key [String] Cache key
  # @return [void]
  # @raise [NotImplementedError] unless overridden by the backend
  def delete(key)
    raise NotImplementedError, "#{self.class} must implement #delete"
  end

  # Check if a key exists and is not expired.
  #
  # @param key [String] Cache key
  # @return [Boolean]
  # @raise [NotImplementedError] unless overridden by the backend
  def exist?(key)
    raise NotImplementedError, "#{self.class} must implement #exist?"
  end

  # Clear cached entries. If namespace is given, only clear that domain.
  #
  # @param namespace [Symbol, nil] Cache domain to clear, or nil for all
  # @return [void]
  # @raise [NotImplementedError] unless overridden by the backend
  def clear(namespace: nil)
    raise NotImplementedError, "#{self.class} must implement #clear"
  end

  # Read-through cache: return cached value or execute block and cache result.
  #
  # A failing write is logged and otherwise ignored, so the freshly computed
  # value is always returned to the caller.
  #
  # @note nil is treated as a cache miss. If the wrapped operation legitimately
  #   returns nil, every call will re-execute the block. Custom backend
  #   implementers should preserve this semantic — do not return nil for keys
  #   that were written with a non-nil value. This is acceptable for the
  #   built-in use cases (embeddings and formatted context are never nil).
  #
  # @param key [String] Cache key
  # @param ttl [Integer, nil] TTL in seconds
  # @yield Block that computes the value on cache miss
  # @return [Object] Cached or freshly computed value
  def fetch(key, ttl: nil)
    cached = read(key)
    return cached unless cached.nil?

    value = yield
    begin
      write(key, value, ttl: ttl)
    rescue StandardError => e
      logger.warn("[Woods] CacheStore#fetch write failed for #{key}: #{e.message}")
    end
    value
  end

  private

  # Return a logger instance (Rails.logger in Rails apps, stderr elsewhere).
  #
  # Falls back to a stderr Logger when the Rails constant exists but does not
  # expose a logger or `Rails.logger` is nil, so that logging from inside a
  # rescue handler can never raise NoMethodError on nil.
  #
  # @return [Logger]
  def logger
    @logger ||= (defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger) || Logger.new($stderr)
  end

  # Build a wildcard pattern for clearing cache entries.
  #
  # @param namespace [Symbol, nil] Cache domain, or nil for all entries
  # @return [String]
  def clear_pattern(namespace)
    namespace ? "woods:cache:#{namespace}:*" : 'woods:cache:*'
  end

  # Delete a key, silently swallowing any StandardError.
  #
  # Used for cleanup on corrupt/stale entries where failure is acceptable.
  #
  # @param key [String]
  # @return [nil]
  def delete_silently(key)
    delete(key)
  rescue StandardError
    nil
  end
end
|
|
136
|
+
|
|
137
|
+
# In-memory cache store with LRU eviction and TTL support.
|
|
138
|
+
#
|
|
139
|
+
# Zero external dependencies. Suitable for single-process use, development,
|
|
140
|
+
# and as a fallback when Redis/SolidCache are not available. Thread-safe.
|
|
141
|
+
#
|
|
142
|
+
# @example
|
|
143
|
+
# store = InMemory.new(max_entries: 200)
|
|
144
|
+
# store.write("ci:emb:abc", [0.1, 0.2], ttl: 3600)
|
|
145
|
+
# store.read("ci:emb:abc") # => [0.1, 0.2]
|
|
146
|
+
#
|
|
147
|
+
class InMemory < CacheStore
  # @param max_entries [Integer] Maximum cached entries before LRU eviction
  def initialize(max_entries: 500)
    super()
    @max_entries = max_entries
    # Ruby Hashes preserve insertion order, so this one Hash doubles as the
    # LRU list: the first entry is the least recently used, the last entry
    # the most recently used.
    @entries = {}
    @mutex = Mutex.new
  end

  # Read a value, returning nil if missing or expired.
  #
  # A hit marks the key as most recently used; an expired entry is removed.
  #
  # @param key [String] Cache key
  # @return [Object, nil]
  def read(key)
    @mutex.synchronize do
      entry = @entries[key]
      next nil unless entry

      if expired?(entry)
        evict_key(key)
        next nil
      end

      touch(key)
      entry[:value]
    end
  end

  # Write a value with optional TTL.
  #
  # When the store is full, the least recently used entry is evicted first.
  #
  # @param key [String] Cache key
  # @param value [Object] Value to cache
  # @param ttl [Integer, nil] TTL in seconds (nil = never expires)
  # @return [void]
  def write(key, value, ttl: nil)
    @mutex.synchronize do
      # Drop any previous entry first so that overwriting never triggers an
      # eviction and the key moves to the most-recently-used position.
      evict_key(key)
      @entries.shift if @entries.size >= @max_entries

      @entries[key] = { value: value, expires_at: ttl ? Time.now + ttl : nil }
    end
  end

  # Delete a key.
  #
  # @param key [String] Cache key
  # @return [void]
  def delete(key)
    @mutex.synchronize { evict_key(key) }
  end

  # Check if a key exists and is not expired.
  #
  # Does not count as an access: LRU order is left untouched.
  #
  # @param key [String] Cache key
  # @return [Boolean]
  def exist?(key)
    @mutex.synchronize do
      entry = @entries[key]
      !entry.nil? && !expired?(entry)
    end
  end

  # Clear entries. If namespace is given, only clear keys matching that domain.
  #
  # @param namespace [Symbol, nil] Domain to clear (:embeddings, :metadata, etc.)
  # @return [void]
  def clear(namespace: nil)
    @mutex.synchronize do
      if namespace
        prefix = "woods:cache:#{namespace}:"
        @entries.delete_if { |key, _entry| key.start_with?(prefix) }
      else
        @entries.clear
      end
    end
  end

  # Number of entries currently in the cache (for testing/diagnostics).
  #
  # Expired entries that have not been read since expiring are still counted.
  #
  # @return [Integer]
  def size
    @mutex.synchronize { @entries.size }
  end

  private

  # Whether an entry carries an expiry time that has already passed.
  #
  # @param entry [Hash] Internal entry with :value and :expires_at
  # @return [Boolean]
  def expired?(entry)
    expires_at = entry[:expires_at]
    !expires_at.nil? && Time.now > expires_at
  end

  # Remove a key from the store. Callers must hold the mutex.
  #
  # @param key [String]
  # @return [void]
  def evict_key(key)
    @entries.delete(key)
  end

  # Mark a key as most recently used by re-inserting it at the end of the
  # Hash. Callers must hold the mutex.
  #
  # @param key [String]
  # @return [void]
  def touch(key)
    entry = @entries.delete(key)
    @entries[key] = entry if entry
  end
end
|
|
263
|
+
end
|
|
264
|
+
end
|