RubyGems - parse-stack-next - Versions diffs - 5.1.1 → 5.2.0 - Mend

parse-stack-next 5.1.1 → 5.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58) hide show

checksums.yaml +4 -4
data/.env.sample +12 -0
data/.env.test +4 -4
data/CHANGELOG.md +545 -0
data/Gemfile +3 -0
data/Gemfile.lock +6 -1
data/README.md +167 -38
data/Rakefile +56 -10
data/docs/atlas_vector_search_guide.md +110 -9
data/docs/mcp_guide.md +433 -0
data/docs/mongodb_direct_guide.md +66 -1
data/docs/mongodb_index_optimization_guide.md +22 -1
data/docs/usage_guide.md +15 -0
data/lib/parse/agent/approval_gate.rb +0 -0
data/lib/parse/agent/constraint_translator.rb +90 -19
data/lib/parse/agent/describe.rb +1 -0
data/lib/parse/agent/errors.rb +16 -0
data/lib/parse/agent/mcp_client.rb +9 -0
data/lib/parse/agent/mcp_dispatcher.rb +139 -7
data/lib/parse/agent/mcp_rack_app.rb +621 -17
data/lib/parse/agent/mcp_subscriptions.rb +607 -0
data/lib/parse/agent/metadata_dsl.rb +58 -0
data/lib/parse/agent/metadata_registry.rb +141 -1
data/lib/parse/agent/prompt_hardening.rb +213 -0
data/lib/parse/agent/result_formatter.rb +18 -3
data/lib/parse/agent/tools.rb +167 -24
data/lib/parse/agent.rb +692 -21
data/lib/parse/client/request.rb +55 -4
data/lib/parse/client/response.rb +4 -0
data/lib/parse/client.rb +205 -7
data/lib/parse/model/classes/installation.rb +27 -10
data/lib/parse/model/classes/user.rb +8 -0
data/lib/parse/model/core/actions.rb +58 -4
data/lib/parse/model/core/embed_managed.rb +19 -14
data/lib/parse/model/core/indexing.rb +108 -16
data/lib/parse/model/core/querying.rb +29 -0
data/lib/parse/model/model.rb +34 -3
data/lib/parse/model/object.rb +1 -0
data/lib/parse/query.rb +90 -24
data/lib/parse/retrieval/agent_tool.rb +369 -0
data/lib/parse/retrieval/chunk.rb +74 -0
data/lib/parse/retrieval/chunker.rb +208 -0
data/lib/parse/retrieval/retriever.rb +274 -0
data/lib/parse/retrieval.rb +10 -0
data/lib/parse/schema.rb +69 -20
data/lib/parse/stack/version.rb +2 -2
data/parse-stack-next.gemspec +1 -1
data/scripts/docker/docker-compose.atlas.yml +14 -10
data/scripts/docker/docker-compose.test.yml +24 -20
data/scripts/docker/mongo-init.js +3 -3
data/scripts/start-parse.sh +10 -0
data/scripts/start_mcp_server.rb +1 -1
data/scripts/test_server_connection.rb +1 -1
data/scripts/vector_prototype/create_vector_index.js +1 -1
data/scripts/vector_prototype/fetch_embeddings.py +2 -2
data/scripts/vector_prototype/query_prototype.rb +1 -1
data/scripts/vector_prototype/run.sh +4 -4
metadata +10 -2

data/lib/parse/agent/metadata_registry.rb CHANGED Viewed

@@ -44,6 +44,15 @@ module Parse
       @tenant_scope_bypasses = {}
       @tenant_scope_bypass_mutex = Mutex.new
+      # Thread-safe storage for `agent_searchable` opt-ins.
+      # Maps parse_class_name => { field: Symbol, filter_fields: Array<Symbol> }
+      @searchable_classes = {}
+      @searchable_mutex = Mutex.new
+      # Once-per-class memo for the agent-visible-but-unscoped lint warning
+      # (guarded by @tenant_scope_mutex). Maps parse_class_name => true.
+      @tenant_scope_lint_warned = {}
       # Register a class as visible to agents.
       # @param klass [Class] the model class
       def register_visible_class(klass)
@@ -581,6 +590,89 @@ module Parse
         end
       end
+      # ============================================================
+      # Searchable Registry (semantic_search opt-in)
+      # ============================================================
+      # Register a class as searchable via the `semantic_search` tool.
+      #
+      # @param class_name [String] the Parse class name
+      # @param field [Symbol] the :vector property to search
+      # @param filter_fields [Array<Symbol>] fields the agent may filter on
+      def register_searchable(class_name, field:, filter_fields: [])
+        @searchable_mutex.synchronize do
+          @searchable_classes[class_name.to_s] = {
+            field: field.to_sym,
+            filter_fields: Array(filter_fields).map(&:to_sym),
+          }
+        end
+      end
+      # @param class_name [String]
+      # @return [Hash, nil] { field:, filter_fields: } or nil if not opted in.
+      def searchable_rule(class_name)
+        @searchable_mutex.synchronize { @searchable_classes[class_name.to_s] }
+      end
+      # @param class_name [String]
+      # @return [Symbol, nil] the searchable vector field, or nil.
+      def searchable_field(class_name)
+        searchable_rule(class_name)&.fetch(:field, nil)
+      end
+      # @param class_name [String]
+      # @return [Array<Symbol>] the declared filter-field allowlist.
+      def searchable_filter_fields(class_name)
+        searchable_rule(class_name)&.fetch(:filter_fields, []) || []
+      end
+      # @return [Boolean] true if any class declares agent_tenant_scope.
+      def any_tenant_scope?
+        @tenant_scope_mutex.synchronize { !@tenant_scope_rules.empty? }
+      end
+      # Resolve a class name to its model class for `semantic_search`,
+      # enforcing the three opt-in / safety gates. Called at dispatch
+      # time (all classes loaded), which is why the tenant-scope cross-
+      # check is order-independent.
+      #
+      # @param class_name [String]
+      # @return [Class] the resolved Parse::Object subclass.
+      # @raise [Parse::Agent::ValidationError] when the class did not opt
+      #   in via `agent_searchable` (a caller/LLM mistake).
+      # @raise [Parse::Agent::AccessDenied] when the class is
+      #   `agent_hidden` (kind: :hidden_class).
+      # @raise [Parse::Agent::MissingTenantScope] when a tenant-aware
+      #   deployment has a searchable class without its own tenant scope.
+      def resolve_searchable!(class_name)
+        name = class_name.to_s
+        rule = searchable_rule(name)
+        if rule.nil?
+          raise Parse::Agent::ValidationError,
+                "Class '#{name}' is not registered for semantic search. " \
+                "Declare `agent_searchable field: :<vector_field>` on the model."
+        end
+        if hidden?(name)
+          raise Parse::Agent::AccessDenied.new(
+            name, "Class '#{name}' is not accessible to this agent.",
+            kind: :hidden_class,
+          )
+        end
+        if any_tenant_scope? && tenant_scope_rule(name).nil?
+          raise Parse::Agent::MissingTenantScope,
+                "Class '#{name}' is searchable but declares no agent_tenant_scope " \
+                "while other classes do. Refusing to expose an un-scoped searchable " \
+                "surface in a tenant-aware deployment; add agent_tenant_scope to '#{name}'."
+        end
+        klass = find_model_class(name)
+        unless klass.is_a?(Class) && klass.respond_to?(:find_similar)
+          raise Parse::Agent::ValidationError,
+                "Class '#{name}' is registered searchable but no Parse::Object model " \
+                "with a :vector property could be resolved."
+        end
+        klass
+      end
       # Return the tenant scope rule for a class name, or nil if none declared.
       #
       # @param class_name [String] the Parse class name
@@ -625,7 +717,23 @@ module Parse
       # @raise [Parse::Agent::AccessDenied]
       def resolve_tenant_scope(class_name, agent)
         rule = tenant_scope_rule(class_name)
-        return nil unless rule
+        unless rule
+          # Lint: in a tenant-aware deployment, an agent-visible class with no
+          # agent_tenant_scope is the silent cross-tenant case (resolve_searchable!
+          # raises for the search path, but the general query path passes through
+          # for back-compat). Warn once per class so it isn't discovered only by
+          # leaked rows; do not raise — a genuinely global class is legitimate.
+          #
+          # Gated to classes EXPLICITLY opted into the agent surface (via
+          # `agent_fields` → visible, or `agent_searchable`). resolve_tenant_scope
+          # runs for every class a tool touches, so without this gate the warning
+          # also fires for _User / _Role / _Session and incidental tables — noise
+          # that trains operators to ignore the signal.
+          if any_tenant_scope? && agent_visible_for_lint?(class_name)
+            warn_unscoped_agent_class!(class_name)
+          end
+          return nil
+        end
         return nil if tenant_scope_bypassed?(class_name, agent)
@@ -640,6 +748,38 @@ module Parse
         { field: rule[:field], value: value }
       end
+      # @!visibility private
+      # Whether a class is EXPLICITLY exposed to agents — declared `agent_fields`
+      # (registered visible) or `agent_searchable`. Used to scope the unscoped-
+      # class lint so it doesn't fire for system/incidental classes the agent
+      # merely happens to touch.
+      def agent_visible_for_lint?(class_name)
+        name = class_name.to_s
+        visible_class_names.include?(name) || !searchable_rule(name).nil?
+      end
+      # @!visibility private
+      # Emit a one-time (per class, per process) warning that an agent-visible
+      # class is unscoped in a tenant-aware deployment. See {resolve_tenant_scope}.
+      def warn_unscoped_agent_class!(class_name)
+        name = class_name.to_s
+        emit = @tenant_scope_mutex.synchronize do
+          next false if @tenant_scope_lint_warned[name]
+          @tenant_scope_lint_warned[name] = true
+        end
+        return unless emit
+        warn "[Parse::Agent:SECURITY] class '#{name}' is agent-visible but declares no " \
+             "agent_tenant_scope while other classes do — queries against it are NOT " \
+             "tenant-scoped and may return cross-tenant rows. Add agent_tenant_scope to " \
+             "'#{name}', or confirm it is intentionally global."
+      end
+      # @!visibility private
+      # Test-only: re-arm the per-class unscoped-class lint warnings.
+      def reset_tenant_scope_lint!
+        @tenant_scope_mutex.synchronize { @tenant_scope_lint_warned.clear }
+      end
       private
       # Find the Ruby model class for a Parse class name.

data/lib/parse/agent/prompt_hardening.rb ADDED Viewed

@@ -0,0 +1,213 @@
+# encoding: UTF-8
+# frozen_string_literal: true
+module Parse
+  class Agent
+    # Sanitization primitives for prompt-injection hardening (NEW-PROMPT-6).
+    # A single home for the transforms applied to data that flows toward an
+    # LLM: schema descriptions surfaced by the schema tools, untrusted tool
+    # result content, and canary scanning of tool results.
+    #
+    # All functions are pure (module_function via `extend self`) and have no
+    # dependency on a live client.
+    module PromptHardening
+      extend self
+      # Identifier shape for LLM-surfaced field names: ASCII letter/underscore
+      # start, then up to 127 more identifier chars. NOT the secret-field
+      # boundary — it permits a leading underscore; `_rperm`/`_hashed_password`
+      # are stopped by field_allowlist / validate_keys!, untouched here. This
+      # only drops non-identifier names (spaces, punctuation, >128 chars,
+      # leading digit) that could carry injection payloads in a field name.
+      # The length is an injection-safety cap, not a Parse limit — it is set
+      # well above any realistic field name so valid identifiers aren't
+      # silently dropped from the schema surfaced to the LLM.
+      FIELD_NAME_RE = /\A[a-zA-Z_][a-zA-Z0-9_]{0,127}\z/
+      # Max characters retained from any LLM-surfaced description.
+      DESCRIPTION_CAP = 200
+      SCHEMA_DESC_OPEN  = "<schema_description>"
+      SCHEMA_DESC_CLOSE = "</schema_description>"
+      # C0 (0x00-0x1F except \t\n) + DEL + C1 (0x7F-0x9F) + zero-width
+      # (200B-200D, 2060, FEFF). Stripped from descriptions so invisible
+      # control/format characters can't smuggle instructions past a human
+      # reviewer or confuse the model.
+      CONTROL_CHARS_RE = /[\u0000-\u0008\u000B-\u001F\u007F-\u009F\u200B-\u200D\u2060\uFEFF]/
+      # Sub-part 1 — sanitize an enriched schema hash before it is
+      # serialized toward the LLM. Returns a sanitized deep copy (input is
+      # not mutated). Drops fields whose names fail {FIELD_NAME_RE} (with a
+      # `[Parse::Agent:PROMPT]` warning), and scrubs + caps + marker-wraps
+      # every description / usage string (class-level, per-field, and enum
+      # value descriptions).
+      #
+      # @param schema [Hash]
+      # @return [Hash]
+      def sanitize_schema_for_llm(schema)
+        return schema unless schema.is_a?(Hash)
+        out = deep_dup(schema)
+        class_name = out["className"] || out[:className]
+        %w[description usage].each do |k|
+          out[k] = sanitize_description(out[k]) if out[k].is_a?(String)
+        end
+        fields = out["fields"] || out[:fields]
+        if fields.is_a?(Hash)
+          fields.keys.each do |fname|
+            unless valid_field_name?(fname)
+              fields.delete(fname)
+              warn "[Parse::Agent:PROMPT] dropped field #{fname.inspect} on " \
+                   "#{class_name.inspect}: invalid identifier"
+              next
+            end
+            cfg = fields[fname]
+            next unless cfg.is_a?(Hash)
+            %w[description usage].each do |k|
+              cfg[k] = sanitize_description(cfg[k]) if cfg[k].is_a?(String)
+            end
+            allowed = cfg["allowed_values"] || cfg[:allowed_values]
+            if allowed.is_a?(Array)
+              allowed.each do |v|
+                next unless v.is_a?(Hash)
+                v["description"] = sanitize_description(v["description"]) if v["description"].is_a?(String)
+                v[:description]  = sanitize_description(v[:description])  if v[:description].is_a?(String)
+              end
+            end
+          end
+        end
+        # agent_methods entries are surfaced to the LLM by format_schema exactly
+        # like field descriptions, and their :description / per-parameter
+        # description strings come from the same developer-authored DSL — so they
+        # get the same marker-neutralization / control-char strip / length cap.
+        # (format_methods emits symbol-keyed hashes; tolerate both forms.)
+        methods = out["agent_methods"] || out[:agent_methods]
+        if methods.is_a?(Array)
+          methods.each do |m|
+            next unless m.is_a?(Hash)
+            m["description"] = sanitize_description(m["description"]) if m["description"].is_a?(String)
+            m[:description]  = sanitize_description(m[:description])  if m[:description].is_a?(String)
+            sanitize_nested_descriptions!(m["parameters"] || m[:parameters])
+          end
+        end
+        out
+      end
+      # @return [Boolean] whether `name` is a safe LLM-surfaceable identifier.
+      def valid_field_name?(name)
+        FIELD_NAME_RE.match?(name.to_s)
+      end
+      # Scrub control chars, cap length, and wrap a description in
+      # <schema_description> markers. Markers in the RAW text are neutralized
+      # FIRST (so a stored `</schema_description>` can't close the wrapper).
+      #
+      # @param str [String]
+      # @return [String]
+      def sanitize_description(str)
+        return str unless str.is_a?(String)
+        cleaned = scrub_marker_injection(str)
+        cleaned = cleaned.gsub(CONTROL_CHARS_RE, "")
+        cleaned = cleaned[0, DESCRIPTION_CAP] if cleaned.length > DESCRIPTION_CAP
+        "#{SCHEMA_DESC_OPEN}#{cleaned}#{SCHEMA_DESC_CLOSE}"
+      end
+      # Sub-part 2 — neutralize wrapper/marker strings embedded in untrusted
+      # content so a stored value cannot impersonate or close the
+      # tool-result wrapper. Idempotent: the escaped form no longer contains
+      # the original literal, so re-application is a no-op (content is
+      # re-serialized into history every turn).
+      #
+      # When `Parse::Agent.prompt_marker_strict` is true, raises instead of
+      # escaping (fail-closed for high-assurance deployments).
+      #
+      # @param content [String, #to_s]
+      # @return [String]
+      def scrub_marker_injection(content)
+        s = content.to_s
+        strict = Parse::Agent.prompt_marker_strict
+        injection_markers.each do |marker|
+          next unless s.include?(marker)
+          if strict
+            raise Parse::Agent::SecurityError,
+                  "prompt_marker_strict: untrusted content contains a reserved marker"
+          end
+          s = s.gsub(marker, escape_marker(marker))
+        end
+        s
+      end
+      # Sub-part 3 — scan text for any operator-registered canary phrase.
+      # @param text [String]
+      # @return [String, nil] the matched phrase/pattern source, or nil.
+      def scan_for_canaries(text)
+        canaries = Parse::Agent.prompt_injection_canaries
+        return nil if canaries.nil? || canaries.empty?
+        s = text.to_s
+        return nil if s.empty?
+        down = s.downcase
+        canaries.each do |c|
+          case c
+          when Regexp
+            return c.source if c.match?(s)
+          else
+            phrase = c.to_s
+            return phrase if !phrase.empty? && down.include?(phrase.downcase)
+          end
+        end
+        nil
+      end
+      private
+      # Recursively run every `description` string nested in an agent method's
+      # JSON-Schema `parameters` through {#sanitize_description}, so per-parameter
+      # descriptions get the same hardening as field descriptions. Mutates in
+      # place; tolerates string- and symbol-keyed hashes and arbitrary nesting.
+      def sanitize_nested_descriptions!(node)
+        case node
+        when Hash
+          node.each do |k, v|
+            if (k == "description" || k == :description) && v.is_a?(String)
+              node[k] = sanitize_description(v)
+            else
+              sanitize_nested_descriptions!(v)
+            end
+          end
+        when Array
+          node.each { |e| sanitize_nested_descriptions!(e) }
+        end
+      end
+      # The literal strings scrub_marker_injection neutralizes. The MCP
+      # wrapper marker is resolved lazily to avoid a load-order dependency.
+      def injection_markers
+        markers = [SCHEMA_DESC_OPEN, SCHEMA_DESC_CLOSE]
+        if defined?(Parse::Agent::MCPClient::UNTRUSTED_TOOL_RESULT_MARKER)
+          markers << Parse::Agent::MCPClient::UNTRUSTED_TOOL_RESULT_MARKER
+        end
+        markers
+      end
+      # Insert a backslash after the first character so the original literal
+      # no longer occurs (keeps the text human-readable and idempotent).
+      def escape_marker(marker)
+        return marker if marker.length < 2
+        "#{marker[0]}\\#{marker[1..]}"
+      end
+      def deep_dup(obj)
+        case obj
+        when Hash  then obj.each_with_object({}) { |(k, v), h| h[k] = deep_dup(v) }
+        when Array then obj.map { |e| deep_dup(e) }
+        when String then obj.dup
+        else obj
+        end
+      end
+    end
+  end
+end

data/lib/parse/agent/result_formatter.rb CHANGED Viewed

@@ -15,6 +15,15 @@ module Parse
       # Maximum number of results to include in output
       MAX_RESULTS_DISPLAY = 50
+      # Keys stripped from every simplified data object before it reaches
+      # the LLM. The raw `ACL` map (per-role / per-user read/write bits) is
+      # operationally useless to a model reasoning over row data — the
+      # agent's effective read/write authority is enforced server-side
+      # regardless of what ACL a row carries — so surfacing it is pure
+      # token overhead plus a minor disclosure of role/user identifiers.
+      # Applied recursively (nested included records too).
+      DROPPED_OBJECT_KEYS = %w[ACL].freeze
       # Parse field type mappings for human-readable output
       TYPE_NAMES = {
         "String" => "string",
@@ -382,14 +391,20 @@ module Parse
         "Pointer to #{target}. Equality: #{equality}. $in/$nin: #{in_shape}."
       end
-      # Simplify an object for display (resolve __type fields)
+      # Simplify an object for display (resolve __type fields). Strips the
+      # raw ACL map (see {DROPPED_OBJECT_KEYS}). Public so the query/get/
+      # atlas tool envelopes can route their rows through the same
+      # normalization query_class already uses.
       def simplify_object(obj)
         return obj unless obj.is_a?(Hash)
-        obj.transform_values do |value|
-          simplify_value(value)
+        obj.each_with_object({}) do |(key, value), acc|
+          next if DROPPED_OBJECT_KEYS.include?(key.to_s)
+          acc[key] = simplify_value(value)
         end
       end
+      public :simplify_object
       # Simplify a single value
       def simplify_value(value)