RubyGems - woods - Versions diffs - 1.2.0 → 1.4.0 - Mend

woods 1.2.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (109)

checksums.yaml +4 -4
data/CHANGELOG.md +229 -0
data/README.md +24 -8
data/exe/woods-console +51 -6
data/exe/woods-console-mcp +24 -4
data/exe/woods-mcp +30 -7
data/exe/woods-mcp-http +47 -6
data/lib/generators/woods/install_generator.rb +13 -4
data/lib/generators/woods/templates/woods.rb.tt +155 -0
data/lib/tasks/woods.rake +37 -51
data/lib/woods/builder.rb +174 -9
data/lib/woods/cache/cache_middleware.rb +360 -31
data/lib/woods/chunking/semantic_chunker.rb +334 -7
data/lib/woods/console/adapters/job_adapter.rb +10 -4
data/lib/woods/console/audit_logger.rb +76 -4
data/lib/woods/console/bridge.rb +48 -15
data/lib/woods/console/bridge_protocol.rb +44 -0
data/lib/woods/console/confirmation.rb +3 -4
data/lib/woods/console/console_response_renderer.rb +56 -18
data/lib/woods/console/credential_index.rb +201 -0
data/lib/woods/console/credential_scanner.rb +302 -0
data/lib/woods/console/dispatch_pipeline.rb +138 -0
data/lib/woods/console/embedded_executor.rb +682 -35
data/lib/woods/console/eval_guard.rb +319 -0
data/lib/woods/console/model_validator.rb +1 -3
data/lib/woods/console/rack_middleware.rb +185 -29
data/lib/woods/console/redactor.rb +161 -0
data/lib/woods/console/response_context.rb +127 -0
data/lib/woods/console/safe_context.rb +220 -23
data/lib/woods/console/scope_predicate_parser.rb +131 -0
data/lib/woods/console/server.rb +417 -486
data/lib/woods/console/sql_noise_stripper.rb +87 -0
data/lib/woods/console/sql_table_scanner.rb +213 -0
data/lib/woods/console/sql_validator.rb +81 -31
data/lib/woods/console/table_gate.rb +93 -0
data/lib/woods/console/tool_specs.rb +552 -0
data/lib/woods/console/tools/tier1.rb +3 -3
data/lib/woods/console/tools/tier4.rb +7 -1
data/lib/woods/dependency_graph.rb +66 -7
data/lib/woods/embedding/indexer.rb +190 -6
data/lib/woods/embedding/openai.rb +40 -4
data/lib/woods/embedding/provider.rb +104 -8
data/lib/woods/embedding/text_preparer.rb +23 -3
data/lib/woods/embedding/token_counter.rb +133 -0
data/lib/woods/evaluation/baseline_runner.rb +20 -2
data/lib/woods/evaluation/metrics.rb +4 -1
data/lib/woods/extracted_unit.rb +1 -0
data/lib/woods/extractor.rb +7 -1
data/lib/woods/extractors/controller_extractor.rb +10 -4
data/lib/woods/extractors/mailer_extractor.rb +16 -2
data/lib/woods/extractors/model_extractor.rb +6 -1
data/lib/woods/extractors/phlex_extractor.rb +13 -4
data/lib/woods/extractors/rails_source_extractor.rb +2 -0
data/lib/woods/extractors/route_helper_resolver.rb +130 -0
data/lib/woods/extractors/shared_dependency_scanner.rb +130 -2
data/lib/woods/extractors/view_component_extractor.rb +12 -1
data/lib/woods/extractors/view_engines/base.rb +141 -0
data/lib/woods/extractors/view_engines/erb.rb +145 -0
data/lib/woods/extractors/view_template_extractor.rb +92 -133
data/lib/woods/flow_assembler.rb +23 -15
data/lib/woods/flow_precomputer.rb +21 -2
data/lib/woods/graph_analyzer.rb +3 -4
data/lib/woods/index_artifact.rb +173 -0
data/lib/woods/mcp/bearer_auth.rb +45 -0
data/lib/woods/mcp/bootstrap_state.rb +94 -0
data/lib/woods/mcp/bootstrapper.rb +337 -16
data/lib/woods/mcp/config_resolver.rb +288 -0
data/lib/woods/mcp/errors.rb +134 -0
data/lib/woods/mcp/index_reader.rb +265 -30
data/lib/woods/mcp/origin_guard.rb +132 -0
data/lib/woods/mcp/provider_probe.rb +166 -0
data/lib/woods/mcp/renderers/claude_renderer.rb +6 -0
data/lib/woods/mcp/renderers/markdown_renderer.rb +39 -3
data/lib/woods/mcp/renderers/plain_renderer.rb +16 -2
data/lib/woods/mcp/server.rb +737 -137
data/lib/woods/model_name_cache.rb +78 -2
data/lib/woods/notion/client.rb +25 -2
data/lib/woods/notion/mappers/model_mapper.rb +36 -2
data/lib/woods/railtie.rb +55 -15
data/lib/woods/resilience/circuit_breaker.rb +9 -2
data/lib/woods/resilience/retryable_provider.rb +40 -3
data/lib/woods/resolved_config.rb +299 -0
data/lib/woods/retrieval/context_assembler.rb +112 -5
data/lib/woods/retrieval/query_classifier.rb +1 -1
data/lib/woods/retrieval/ranker.rb +55 -6
data/lib/woods/retrieval/search_executor.rb +42 -13
data/lib/woods/retriever.rb +330 -24
data/lib/woods/session_tracer/middleware.rb +35 -1
data/lib/woods/storage/graph_store.rb +39 -0
data/lib/woods/storage/inapplicable_backend.rb +14 -0
data/lib/woods/storage/metadata_store.rb +129 -1
data/lib/woods/storage/pgvector.rb +70 -8
data/lib/woods/storage/qdrant.rb +196 -5
data/lib/woods/storage/snapshotter/metadata.rb +172 -0
data/lib/woods/storage/snapshotter/vector.rb +238 -0
data/lib/woods/storage/snapshotter.rb +24 -0
data/lib/woods/storage/vector_store.rb +184 -35
data/lib/woods/tasks.rb +85 -0
data/lib/woods/temporal/snapshot_store.rb +49 -1
data/lib/woods/token_utils.rb +44 -5
data/lib/woods/unblocked/client.rb +88 -7
data/lib/woods/unblocked/document_builder.rb +75 -36
data/lib/woods/unblocked/exporter.rb +234 -18
data/lib/woods/unblocked/rate_limiter.rb +10 -2
data/lib/woods/unblocked/sync_manifest.rb +135 -0
data/lib/woods/util/host_guard.rb +61 -0
data/lib/woods/version.rb +1 -1
data/lib/woods.rb +126 -6
metadata +70 -4

data/lib/woods/storage/vector_store.rb CHANGED Viewed

@@ -1,5 +1,7 @@
 # frozen_string_literal: true
+require 'set'
 module Woods
   module Storage
     # VectorStore provides an interface for storing and searching embedding vectors.
@@ -36,11 +38,39 @@ module Woods
           entries.each { |e| store(e[:id], e[:vector], e[:metadata] || {}) }
         end
+        # Iterate over every live entry, yielding `(id, vector, metadata)`.
+        #
+        # Persistence seam for Snapshotter and similar consumers. Default
+        # implementation falls through to `NotImplementedError`; adapters
+        # that need to support dumping must implement it. Persistent
+        # backends (pgvector, Qdrant) aren't expected to implement this —
+        # the Snapshotter only touches non-persistent stores.
+        #
+        # @yield [id, vector, metadata]
+        # @return [Enumerator] when no block given
+        def each_entry
+          raise NotImplementedError
+        end
+        # Bulk-load pre-computed entries. Dual of {#each_entry} — the
+        # Snapshotter hydrates a store by feeding this the dump contents.
+        #
+        # @param entries [Enumerable<Hash>] Each entry has :id, :vector, :metadata keys
+        def bulk_load(entries)
+          store_batch(entries.to_a)
+        end
         # Search for similar vectors using cosine similarity.
         #
+        # Filter values may be scalars (exact match) or Arrays (membership
+        # match — "value ∈ array"). Adapters implement the membership
+        # semantics natively: in-memory loops, pgvector IN (...), Qdrant
+        # `match: { any: [...] }`.
+        #
         # @param query_vector [Array<Float>] The query embedding vector
         # @param limit [Integer] Maximum number of results to return
-        # @param filters [Hash] Optional metadata filters to apply
+        # @param filters [Hash] Optional metadata filters — values may be
+        #   scalars or Arrays
         # @return [Array<SearchResult>] Results sorted by descending similarity
         # @raise [NotImplementedError] if not implemented by adapter
         def search(query_vector, limit: 10, filters: {})
@@ -87,79 +117,198 @@ module Woods
       #   store.search([1.0, 0.0], limit: 1)
       #   # => [#<SearchResult id="doc1", score=1.0, metadata={type: "model"}>]
       #
-      class InMemory
+      class InMemory # rubocop:disable Metrics/ClassLength
         include Interface
+        # Flat-buffer backing. One Array<Float> of length count*dim holds
+        # every vector contiguously; two parallel Arrays hold the ids and
+        # metadata at matching positions. Deleted entries are tombstoned
+        # (their index is added to @tombstones) rather than removed, so
+        # stored vector positions stay stable under concurrent iteration
+        # and dumps. Tombstones are compacted at next full-embed run.
+        #
+        # The flat buffer exists both for cache friendliness during the
+        # cosine kernel (all vectors live in one contiguous allocation)
+        # and to make dump/load via `pack("e*")` a single call rather
+        # than a per-vector concatenation.
         def initialize
-          @entries = {} # id => { vector:, metadata: }
+          @dim = nil
+          @ids = [] # Array<String> (frozen)
+          @vectors_flat = [] # flat Array<Float>, length @ids.size * @dim
+          @metadata = []     # Array<Hash>, index-aligned with @ids
+          @id_to_index = {}  # id => Integer for O(1) delete/overwrite
+          @tombstones = Set.new
         end
+        # @return [Integer, nil] dimension of stored vectors, nil if empty
+        attr_reader :dim
         # @see Interface#store
         def store(id, vector, metadata = {})
-          @entries[id] = { vector: vector, metadata: metadata }
+          @dim ||= vector.length
+          unless vector.length == @dim
+            raise ArgumentError,
+                  "Vector dimension mismatch (#{vector.length} vs #{@dim})"
+          end
+          frozen_id = id.frozen? ? id : id.dup.freeze
+          existing = @id_to_index[frozen_id]
+          if existing
+            overwrite(existing, vector, metadata)
+          else
+            append(frozen_id, vector, metadata)
+          end
+        end
+        # @see Interface#bulk_load
+        # Hydrates the store by calling #store once per entry, in order —
+        # e.g. when the Snapshotter feeds a large dump at boot time.
+        def bulk_load(entries)
+          entries.each { |entry| store(entry[:id], entry[:vector], entry[:metadata] || {}) }
+        end
+        # Drop every stored entry, restoring the store to its post-+new+ state.
+        #
+        # Used by the MCP +reload+ tool to pick up a fresh embed run without
+        # restarting the process. A subsequent +#bulk_load+ then repopulates
+        # from disk. Safe on an already-empty store.
+        def clear!
+          @dim = nil
+          @ids = []
+          @vectors_flat = []
+          @metadata = []
+          @id_to_index = {}
+          @tombstones = Set.new
+        end
+        # @see Interface#each_entry
+        def each_entry(&block)
+          return enum_for(:each_entry) unless block
+          @ids.each_with_index do |id, idx|
+            next if @tombstones.include?(idx)
+            base = idx * @dim
+            yield(id, @vectors_flat[base, @dim], @metadata[idx])
+          end
         end
         # @see Interface#search
         def search(query_vector, limit: 10, filters: {})
-          candidates = filter_entries(filters)
+          return [] if @dim.nil?
-          scored = candidates.map do |id, entry|
-            score = cosine_similarity(query_vector, entry[:vector])
-            SearchResult.new(id: id, score: score, metadata: entry[:metadata])
+          unless query_vector.length == @dim
+            raise ArgumentError,
+                  "Vector dimension mismatch (#{query_vector.length} vs #{@dim})"
           end
-          scored.sort_by { |r| -r.score }.first(limit)
+          scored = gather_candidates(query_vector, filters)
+          scored.sort_by! { |r| -r.score }
+          scored.first(limit)
         end
         # @see Interface#delete
         def delete(id)
-          @entries.delete(id)
+          idx = @id_to_index.delete(id)
+          @tombstones << idx if idx
         end
         # @see Interface#delete_by_filter
         def delete_by_filter(filters)
-          @entries.reject! do |_id, entry|
-            filters.all? { |key, value| entry[:metadata][key] == value }
+          @ids.each_with_index do |id, idx|
+            next if @tombstones.include?(idx)
+            next unless filters.all? { |key, value| @metadata[idx][key] == value }
+            @tombstones << idx
+            @id_to_index.delete(id)
           end
         end
         # @see Interface#count
         def count
-          @entries.size
+          @ids.size - @tombstones.size
         end
         private
-        # Filter entries by metadata key-value pairs.
-        #
-        # @param filters [Hash] Metadata filters
-        # @return [Hash] Filtered entries
-        def filter_entries(filters)
-          return @entries if filters.empty?
+        # Match a filter value against a metadata value. Arrays are
+        # membership filters ("any of"); scalars are equality.
+        def filter_match?(filter_value, meta_value)
+          filter_value.is_a?(Array) ? filter_value.include?(meta_value) : filter_value == meta_value
+        end
+        # Append a new entry to the flat buffer.
+        def append(id, vector, metadata)
+          idx = @ids.size
+          @ids << id
+          @vectors_flat.concat(vector)
+          @metadata << metadata
+          @id_to_index[id] = idx
+        end
-          @entries.select do |_id, entry|
-            filters.all? { |key, value| entry[:metadata][key] == value }
+        # Overwrite an existing entry in place. Clears the slot's tombstone
+        # (if any) so the new vector is live again.
+        def overwrite(idx, vector, metadata)
+          base = idx * @dim
+          i = 0
+          while i < @dim
+            @vectors_flat[base + i] = vector[i]
+            i += 1
           end
+          @metadata[idx] = metadata
+          @tombstones.delete(idx)
         end
-        # Compute cosine similarity between two vectors.
-        #
-        # @param vec_a [Array<Float>] First vector
-        # @param vec_b [Array<Float>] Second vector
-        # @return [Float] Cosine similarity between -1.0 and 1.0
-        # @raise [ArgumentError] if vectors have different dimensions
-        def cosine_similarity(vec_a, vec_b)
-          unless vec_a.length == vec_b.length
-            raise ArgumentError,
-                  "Vector dimension mismatch (#{vec_a.length} vs #{vec_b.length})"
+        # Walk every non-tombstoned index, apply filters, score survivors.
+        # Filter check runs BEFORE the cosine kernel — avoids computing
+        # 12k dot products only to discard most of them.
+        def gather_candidates(query_vector, filters)
+          scored = []
+          len = @ids.size
+          idx = 0
+          while idx < len
+            if @tombstones.include?(idx)
+              idx += 1
+              next
+            end
+            meta = @metadata[idx]
+            unless filters.empty? || filters.all? { |k, v| filter_match?(v, meta[k]) }
+              idx += 1
+              next
+            end
+            score = cosine_similarity_strided(query_vector, idx * @dim)
+            scored << SearchResult.new(id: @ids[idx], score: score, metadata: meta)
+            idx += 1
           end
+          scored
+        end
-          dot = vec_a.zip(vec_b).sum { |x, y| x * y }
-          mag_a = Math.sqrt(vec_a.sum { |x| x**2 })
-          mag_b = Math.sqrt(vec_b.sum { |x| x**2 })
+        # Cosine similarity between a query Array<Float> and a vector
+        # that lives at @vectors_flat[base, @dim]. Strided access avoids
+        # allocating a copy of the stored vector on every comparison.
+        #
+        # See bench/vector_query_and_serialization.rb for the allocation
+        # story — the old Enumerable path allocated ~770 objects per pair;
+        # this loop allocates none inside the hot path.
+        def cosine_similarity_strided(query, base)
+          len = @dim
+          i = 0
+          dot = 0.0
+          mag_a = 0.0
+          mag_b = 0.0
+          while i < len
+            a = query[i]
+            b = @vectors_flat[base + i]
+            dot += a * b
+            mag_a += a * a
+            mag_b += b * b
+            i += 1
+          end
           return 0.0 if mag_a.zero? || mag_b.zero?
-          dot / (mag_a * mag_b)
+          dot / (Math.sqrt(mag_a) * Math.sqrt(mag_b))
         end
       end
     end

data/lib/woods/tasks.rb ADDED Viewed

@@ -0,0 +1,85 @@
+# frozen_string_literal: true
+require_relative 'builder'
+require_relative 'embedding/indexer'
+require_relative 'embedding/text_preparer'
+require_relative 'resolved_config'
+module Woods
+  # Small helpers invoked from `lib/tasks/woods.rake`.
+  #
+  # Keeps rake task bodies to a couple of lines each so the real work lives in
+  # plain Ruby that can be unit-tested without Rake's global state.
+  module Tasks
+    module_function
+    # Build an {Embedding::Indexer} wired to the provider and stores described
+    # by {Woods.configuration}. Uses {Builder} so `config.embedding_provider`,
+    # `config.embedding_options`, and `config.vector_store(_options)` are all
+    # honoured — prior to this the rake tasks hardcoded Ollama + InMemory and
+    # silently ignored configuration, which was invisible until the provider
+    # tried to reach an unreachable default host.
+    #
+    # The TextPreparer and SemanticChunker are tuned to the selected
+    # provider so oversize units are split into chunks that fit the
+    # provider's input budget (e.g. Ollama's num_ctx, OpenAI's 8k cap).
+    #
+    # @return [Embedding::Indexer]
+    def build_embed_indexer
+      config = Woods.configuration
+      builder = Builder.new(config)
+      provider = builder.build_embedding_provider
+      # Wire the persistence-arc pieces (resolved_config, metadata_store,
+      # dump_retention_count) so Indexer#persist_snapshot can write
+      # woods.json, dump metadata, and honour the user's retention setting.
+      # Without these kwargs, embed writes vectors.bin + latest pointer but
+      # never writes woods.json — which breaks the standalone woods-mcp
+      # Shape-2 boot path entirely.
+      #
+      # metadata_store and resolved_config are nil-safe — hosts that don't
+      # configure metadata or that pre-date the persistence arc still work.
+      Embedding::Indexer.new(
+        provider: provider,
+        text_preparer: builder.build_text_preparer(provider),
+        vector_store: builder.build_vector_store,
+        metadata_store: config.metadata_store ? builder.build_metadata_store : nil,
+        resolved_config: build_resolved_config(config, provider: provider),
+        chunker: builder.build_chunker(provider),
+        dump_retention_count: config.dump_retention_count,
+        output_dir: ENV.fetch('WOODS_OUTPUT', config.output_dir)
+      )
+    end
+    # Build a ResolvedConfig snapshot from the live Woods::Configuration.
+    # Returns nil if the configuration doesn't have enough to produce one
+    # (pre-persistence-arc hosts) so the Indexer falls back to the legacy
+    # dump-without-woods.json behaviour.
+    #
+    # Passes the live +provider+ so {ResolvedConfig.from_configuration} can
+    # probe +provider.dimensions+ — without this, Ollama snapshots record
+    # +dimension: 0+ and every subsequent MCP boot fails a spurious
+    # dimension-mismatch check against the real stored vectors.
+    def build_resolved_config(config, provider: nil)
+      return nil unless config.embedding_provider
+      ResolvedConfig.from_configuration(config, provider: provider)
+    rescue StandardError
+      nil
+    end
+    # Print an indexer stats hash in the format the rake tasks have historically
+    # used. `mode:` only affects the header line.
+    #
+    # @param stats [Hash]
+    # @param mode [Symbol] :full or :incremental
+    def print_embed_stats(stats, mode:)
+      header = mode == :incremental ? 'Incremental embedding complete!' : 'Embedding complete!'
+      puts
+      puts header
+      puts "  Processed: #{stats[:processed]}"
+      puts "  Skipped:   #{stats[:skipped]}"
+      puts "  Errors:    #{stats[:errors]}"
+    end
+  end
+end

data/lib/woods/temporal/snapshot_store.rb CHANGED Viewed

@@ -23,10 +23,58 @@ module Woods
     #
     class SnapshotStore # rubocop:disable Metrics/ClassLength
       # @param connection [Object] Database connection supporting #execute and #get_first_row
-      def initialize(connection:)
+      # @param validate_schema [Boolean] If true (default), probe both required
+      #   tables at construction time and raise a descriptive error pointing at
+      #   migrations 004+005 when they are missing. Set false in tests that
+      #   construct the store with a bare mock.
+      def initialize(connection:, validate_schema: true)
         @db = connection
+        validate_schema! if validate_schema
       end
+      REQUIRED_TABLES = %w[woods_snapshots woods_snapshot_units].freeze
+      # Probe that `woods_snapshots` and `woods_snapshot_units` exist. If
+      # they don't, raise with guidance to run migrations 004 + 005 —
+      # without this, the first call to {#capture}/{#find} raises a generic
+      # adapter error that doesn't tell operators why.
+      #
+      # When the connection responds to `#columns` (ActiveRecord-shaped) or
+      # `#table_exists?`, use that — these are hard to spoof from a test
+      # mock, so a partial mock can no longer silently pass. Falls back to
+      # the `SELECT 1 FROM t LIMIT 1` probe for minimal connections.
+      #
+      # @raise [Woods::Error]
+      def validate_schema!
+        REQUIRED_TABLES.each { |t| probe_table!(t) }
+      rescue Woods::Error
+        raise
+      rescue StandardError => e
+        raise Woods::Error, schema_error_message(e)
+      end
+      private
+      def probe_table!(table)
+        if @db.respond_to?(:table_exists?)
+          raise Woods::Error, schema_error_message("table `#{table}` does not exist") unless @db.table_exists?(table)
+        elsif @db.respond_to?(:columns)
+          cols = @db.columns(table)
+          raise Woods::Error, schema_error_message("no columns for `#{table}`") if cols.nil? || cols.empty?
+        else
+          @db.execute("SELECT 1 FROM #{table} LIMIT 1")
+        end
+      end
+      def schema_error_message(detail)
+        'SnapshotStore requires the `woods_snapshots` and ' \
+          '`woods_snapshot_units` tables (migrations 004 + 005 under ' \
+          '`lib/woods/db/migrations/`). Run `rake woods:migrate` on the ' \
+          "metadata DB and retry. Underlying error: #{detail}"
+      end
+      public
       # Capture a snapshot after extraction completes.
       #
       # Stores the manifest metadata and per-unit content hashes.

data/lib/woods/token_utils.rb CHANGED Viewed

@@ -1,19 +1,58 @@
 # frozen_string_literal: true
 module Woods
-  # Shared token estimation utility.
+  # Shared token estimation utility — the single source of truth for the
+  # chars-per-token ratio used across cost estimation, context assembly,
+  # and embedding budgeting.
   #
-  # Uses project convention: (string.length / 4.0).ceil
-  # See docs/TOKEN_BENCHMARK.md — conservative floor (~10.6% overestimate).
+  # Ratios:
+  # - `:openai` / default — 4.0 chars/token. Benchmarked against tiktoken
+  #   (cl100k_base) on 19 Ruby source files (mean 4.41 chars/token). We use
+  #   4.0 as a conservative floor (~10.6 % overestimate) so truncation never
+  #   hands the model more tokens than it budgeted for. See
+  #   `docs/TOKEN_BENCHMARK.md`.
+  # - `:ollama` — 1.5 chars/token. Matches the BERT WordPiece tokenizers
+  #   used by nomic-embed-text and mxbai-embed-large. See
+  #   `docs/EMBEDDING_MODELS.md` and `Woods::Builder#chars_per_token_for`.
+  #
+  # Callers should prefer {.chars_per_token_for} over hardcoding a divisor
+  # so future tokenizer changes propagate in one place instead of drifting
+  # between {ContextAssembler}, {Builder}, and cost-model components.
   module TokenUtils
+    CHARS_PER_TOKEN_BY_PROVIDER = {
+      openai: 4.0,
+      ollama: 1.5
+    }.freeze
+    DEFAULT_CHARS_PER_TOKEN = CHARS_PER_TOKEN_BY_PROVIDER[:openai]
     module_function
-    # Estimate token count for a string.
+    # Chars-per-token ratio for the given embedding provider.
+    #
+    # @param provider [Symbol, String, nil] Provider identifier. Unknown or
+    #   nil providers fall back to {DEFAULT_CHARS_PER_TOKEN}.
+    # @return [Float]
+    def chars_per_token_for(provider)
+      CHARS_PER_TOKEN_BY_PROVIDER.fetch(provider&.to_sym, DEFAULT_CHARS_PER_TOKEN)
+    end
+    # Estimate token count for a string using the default (OpenAI) ratio.
+    # Use {.estimate_tokens_for} when a specific provider is in play.
     #
     # @param text [String] Text to estimate
     # @return [Integer] Estimated token count
     def estimate_tokens(text)
-      (text.length / 4.0).ceil
+      estimate_tokens_for(text, provider: nil)
+    end
+    # Estimate token count for a string using the provider's native ratio.
+    #
+    # @param text [String] Text to estimate
+    # @param provider [Symbol, String, nil] `:openai`, `:ollama`, or nil.
+    # @return [Integer] Estimated token count
+    def estimate_tokens_for(text, provider:)
+      (text.length / chars_per_token_for(provider)).ceil
     end
   end
 end

data/lib/woods/unblocked/client.rb CHANGED Viewed

@@ -3,10 +3,28 @@
 require 'json'
 require 'net/http'
 require 'uri'
+require 'woods'
 require_relative 'rate_limiter'
 module Woods
   module Unblocked
+    # API error carrying the HTTP status code, so callers can branch on
+    # status (e.g. treat a 404 on delete as "already gone") instead of
+    # matching message strings. Subclasses Woods::Error, so existing
+    # +rescue Woods::Error+ sites keep working unchanged.
+    class ApiError < Woods::Error
+      # @return [Integer] HTTP status code of the failed response
+      attr_reader :status
+      # @param message [String] Error message
+      # @param status [Integer] HTTP status code — required, because callers
+      #   branch on it (a nil status would silently miss every status check)
+      def initialize(message, status:)
+        super(message)
+        @status = Integer(status)
+      end
+    end
     # REST client for the Unblocked API v1.
     #
     # Handles document and collection CRUD with rate limiting, retries,
@@ -25,6 +43,12 @@ module Woods
       BASE_URL = 'https://getunblocked.com/api/v1'
       MAX_RETRIES = 3
       DEFAULT_TIMEOUT = 30
+      # Max page size the list endpoint accepts (per API docs).
+      PAGE_SIZE = 200
+      # Repo-hosted Woods mark, used as the collection icon when none is given.
+      # The live API rejects collection creation without an iconUrl (despite
+      # the API docs marking it optional), so a working default matters.
+      DEFAULT_ICON_URL = 'https://raw.githubusercontent.com/lost-in-the/woods/main/assets/woods-mark-black.svg'
       # @param api_token [String] Unblocked API token (Personal or Team)
       # @param rate_limiter [RateLimiter] Rate limiter instance
@@ -60,12 +84,17 @@ module Woods
       #
       # @param name [String] Collection name (1-32 chars)
       # @param description [String] Collection description (1-4096 chars)
-      # @param icon_url [String, nil] Optional icon URL
+      # @param icon_url [String, nil] Icon URL. The live API rejects creation
+      #   with a bare 400 when omitted (despite the API docs marking it
+      #   optional), so nil falls back to DEFAULT_ICON_URL — the repo-hosted
+      #   Woods mark.
       # @return [Hash] { "id" => "collection-uuid", "name" => "...", ... }
       def create_collection(name:, description:, icon_url: nil)
-        body = { name: name, description: description }
-        body[:iconUrl] = icon_url if icon_url
-        request(:post, 'collections', body)
+        request(:post, 'collections', {
+                  name: name,
+                  description: description,
+                  iconUrl: icon_url || DEFAULT_ICON_URL
+                })
       end
       # List all collections.
@@ -73,6 +102,10 @@ module Woods
       # @return [Array<Hash>] Collection objects
       def list_collections
         result = request(:get, 'collections')
+        # The live API returns a bare JSON array; the envelope fallbacks are
+        # defensive (calling ['items'] on an Array raises TypeError).
+        return result if result.is_a?(Array)
         result['items'] || result['data'] || [result].flatten.compact
       end
@@ -84,6 +117,52 @@ module Woods
         request(:delete, "documents/#{document_id}")
       end
+      # List a single page of documents.
+      #
+      # The endpoint returns a bare JSON array of document metadata (no body):
+      # `id, collectionId, title, uri, createdAt, updatedAt`. Pagination is
+      # cursor-based via `after`/`before` (opaque cursors); there is no
+      # server-side collection filter.
+      #
+      # @param limit [Integer] Page size (1-200)
+      # @param after [String, nil] Opaque forward cursor (typically the last id)
+      # @return [Array<Hash>] One page of document metadata
+      def list_documents(limit: PAGE_SIZE, after: nil)
+        query = "limit=#{limit}"
+        query += "&after=#{URI.encode_www_form_component(after)}" if after
+        result = request(:get, "documents?#{query}")
+        return result if result.is_a?(Array)
+        result['items'] || result['data'] || []
+      end
+      # List every document in a collection, paging until exhausted.
+      #
+      # Filters client-side on `collectionId` since the API has no collection
+      # filter. ~5 calls for ~1000 documents; each goes through the rate limiter.
+      #
+      # @param collection_id [String] Collection UUID to filter to
+      # @return [Array<Hash>] All matching document metadata
+      def all_documents(collection_id:)
+        docs = []
+        after = nil
+        loop do
+          page = list_documents(limit: PAGE_SIZE, after: after)
+          break if page.empty?
+          docs.concat(page)
+          break if page.size < PAGE_SIZE
+          after = page.last['id']
+          # A full page with no cursor id would refetch page 1 forever —
+          # stop with what we have rather than loop against the budget.
+          break if after.nil?
+        end
+        docs.select { |doc| doc['collectionId'] == collection_id }
+      end
       private
       def request(method, path, body = nil)
@@ -118,7 +197,7 @@ module Woods
           http.request(req)
         rescue Net::OpenTimeout, Net::ReadTimeout, Errno::ECONNRESET, Errno::ECONNREFUSED => e
           attempts += 1
-          raise Woods::Error, "Network error after #{attempts} retries: #{e.message}" if attempts >= MAX_RETRIES
+          raise Woods::Error, "Network error after #{attempts} retries: #{e.message}" if attempts > MAX_RETRIES
           sleep(2**attempts)
           retry
@@ -155,8 +234,10 @@ module Woods
         rescue JSON::ParserError, TypeError
           { 'message' => response.body&.slice(0, 200) || 'Unknown error' }
         end
-        message = parsed['message'] || parsed['error'] || 'Unknown error'
-        raise Woods::Error, "Unblocked API error #{response.code}: #{message}"
+        # The Unblocked API returns RFC7807-style bodies ({ status, title, detail });
+        # older/other paths use message/error. Check all so failures stay legible.
+        message = parsed['message'] || parsed['error'] || parsed['detail'] || parsed['title'] || 'Unknown error'
+        raise ApiError.new("Unblocked API error #{response.code}: #{message}", status: response.code.to_i)
       end
     end
   end