RubyGems - pikuri - Versions diffs - 0.0.1 - Mend

pikuri 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +62 -0
data/GETTING_STARTED.md +223 -0
data/LICENSE +21 -0
data/README.md +193 -0
data/lib/pikuri/agent/chat_transport.rb +41 -0
data/lib/pikuri/agent/context_window_detector.rb +101 -0
data/lib/pikuri/agent/listener/in_memory_message_list.rb +33 -0
data/lib/pikuri/agent/listener/message_listener.rb +93 -0
data/lib/pikuri/agent/listener/step_limit.rb +97 -0
data/lib/pikuri/agent/listener/terminal.rb +137 -0
data/lib/pikuri/agent/listener/token_log.rb +166 -0
data/lib/pikuri/agent/listener_list.rb +113 -0
data/lib/pikuri/agent/message.rb +61 -0
data/lib/pikuri/agent/synthesizer.rb +120 -0
data/lib/pikuri/agent/tokens.rb +56 -0
data/lib/pikuri/agent.rb +286 -0
data/lib/pikuri/subprocess.rb +166 -0
data/lib/pikuri/tool/bash.rb +272 -0
data/lib/pikuri/tool/calculator.rb +82 -0
data/lib/pikuri/tool/confirmer.rb +96 -0
data/lib/pikuri/tool/edit.rb +196 -0
data/lib/pikuri/tool/fetch.rb +167 -0
data/lib/pikuri/tool/glob.rb +310 -0
data/lib/pikuri/tool/grep.rb +338 -0
data/lib/pikuri/tool/parameters.rb +314 -0
data/lib/pikuri/tool/read.rb +254 -0
data/lib/pikuri/tool/scraper/fetch_error.rb +16 -0
data/lib/pikuri/tool/scraper/html.rb +285 -0
data/lib/pikuri/tool/scraper/pdf.rb +54 -0
data/lib/pikuri/tool/scraper/simple.rb +177 -0
data/lib/pikuri/tool/search/brave.rb +184 -0
data/lib/pikuri/tool/search/duckduckgo.rb +196 -0
data/lib/pikuri/tool/search/engines.rb +154 -0
data/lib/pikuri/tool/search/exa.rb +217 -0
data/lib/pikuri/tool/search/rate_limiter.rb +92 -0
data/lib/pikuri/tool/search/result.rb +29 -0
data/lib/pikuri/tool/skill.rb +80 -0
data/lib/pikuri/tool/skill_catalog.rb +376 -0
data/lib/pikuri/tool/sub_agent.rb +102 -0
data/lib/pikuri/tool/web_scrape.rb +117 -0
data/lib/pikuri/tool/web_search.rb +38 -0
data/lib/pikuri/tool/workspace.rb +150 -0
data/lib/pikuri/tool/write.rb +170 -0
data/lib/pikuri/tool.rb +118 -0
data/lib/pikuri/url_cache.rb +106 -0
data/lib/pikuri/version.rb +10 -0
data/lib/pikuri.rb +165 -0
data/prompts/coding-system-prompt.txt +28 -0
data/prompts/pikuri-chat.txt +15 -0
metadata +259 -0

data/lib/pikuri/tool/fetch.rb ADDED Viewed

@@ -0,0 +1,167 @@
+# frozen_string_literal: true
+module Pikuri
+  class Tool
+    # Truncation policy and Tool spec for the +fetch+ tool. The HTTP work
+    # lives in {Tool::Scraper::Simple.fetch}; this module is a thin
+    # wrapper that accepts only textual content-types, applies a character
+    # cap so the LLM doesn't drown in long-form bodies, and exposes the
+    # result to the agent loop in OpenAI tool-call shape.
+    #
+    # Sister of {Tool::WebScrape}, but without HTML→Markdown or PDF→text
+    # extraction: bodies are returned verbatim. Useful for raw textual
+    # data — JSON APIs, CSV files, +robots.txt+, sitemaps, source files —
+    # where any rendering pass would corrupt the payload.
+    module Fetch
+      # @return [Integer] default character cap on the body returned by
+      #   {.fetch}. Smaller than {Tool::WebScrape::DEFAULT_MAX_CHARS}
+      #   because fetch's content profile is bimodal — most JSON/XML/CSV
+      #   responses are tiny, and the long-tail (large data dumps) is
+      #   better re-requested deliberately than padded into every default.
+      DEFAULT_MAX_CHARS = 5_000 # default of the +max_chars+ keyword of {.fetch}
+      # @return [Integer] hard ceiling on the +max_chars+ argument to
+      #   {.fetch}. Matches {Tool::WebScrape::MAX_MAX_CHARS}.
+      MAX_MAX_CHARS = 100_000 # upper bound of the clamp applied in {.fetch}
+      # Application content-types that are textual in practice and so
+      # safe to return verbatim to the LLM, despite their +application/+
+      # prefix making them fail the +text/*+ check. Anything outside
+      # +text/*+ and this allowlist is refused. Consulted by {.textual?}.
+      # @return [Array<String>]
+      TEXTUAL_APPLICATION_TYPES = %w[
+        application/json
+        application/xml
+        application/javascript
+        application/xhtml+xml
+        application/rss+xml
+        application/atom+xml
+      ].freeze
+      # On-disk cache used by {.fetch} to memoize downloads. Always read
+      # through the {.cache} method so specs can swap it for an isolated
+      # cache or {UrlCache::NULL} without touching the shared instance. Lives in
+      # its own subdir under {UrlCache::ROOT_DIR} so a +fetch+ on a URL
+      # and a +web_scrape+ on the same URL cannot collide on the same
+      # cache file (one returns the raw body, the other returns extracted
+      # Markdown).
+      #
+      # @return [UrlCache, #fetch]
+      CACHE = UrlCache.new(ttl: UrlCache::DEFAULT_TTL, dir: "#{UrlCache::ROOT_DIR}/fetch")
+      # Accessor for {CACHE}. {.fetch} goes through this method rather than
+      # naming the constant, which gives specs a single method to override.
+      #
+      # @return [UrlCache] the shared {CACHE} instance
+      def self.cache
+        CACHE
+      end
+      # Download +url+ via {Tool::Scraper::Simple.fetch} and return the
+      # response body verbatim, provided the content-type is one we deem
+      # textual (any +text/*+, plus the formats listed in
+      # {TEXTUAL_APPLICATION_TYPES}). Anything else — PDFs, images, other
+      # binaries — produces an +"Error: ..."+ string in the calculator-
+      # style convention so the agent loop feeds the failure back to the
+      # model as the next observation.
+      #
+      # The body is cached on disk via {.cache}, keyed by URL, so repeat
+      # fetches within the cache TTL skip the network. +max_chars+ is not
+      # part of the cache key — different values for the same URL share
+      # one entry, and truncation runs after the cache lookup. The cache
+      # is only populated on success: {Scraper::FetchError} (HTTP non-2xx,
+      # network failure, redirect-loop exhaustion, refused content-type)
+      # is caught outside the +cache.fetch+ block, so failure strings are
+      # never persisted and a retry on the next call hits the network
+      # again. Other exceptions (parser bugs in our own code) bubble up
+      # unchanged.
+      #
+      # @param url [String] absolute HTTP(S) URL to download
+      # @param max_chars [Integer] character cap on the returned body.
+      #   Clamped to +[1, {MAX_MAX_CHARS}]+; defaults to
+      #   {DEFAULT_MAX_CHARS}. When the body exceeds the cap, output is
+      #   cut and a marker noting the original length is appended.
+      # @return [String] response body, possibly truncated, or
+      #   +"Error: ..."+ on a recoverable failure
+      def self.fetch(url, max_chars: DEFAULT_MAX_CHARS)
+        max_chars = max_chars.clamp(1, MAX_MAX_CHARS)
+        body = cache.fetch(url) { download(url) }
+        truncate(body, max_chars)
+      rescue Scraper::FetchError => e
+        "Error: #{e.message}"
+      end
+      # GET +url+ and verify the response's content-type is textual.
+      # Caller is responsible for caching and truncation; this method
+      # always hits the network.
+      #
+      # @param url [String]
+      # @return [String] response body
+      # @raise [Scraper::FetchError] on HTTP non-2xx, network failure,
+      #   redirect-loop exhaustion, missing +Location+ on a 3xx, or a
+      #   non-textual content-type
+      def self.download(url)
+        fetched = Scraper::Simple.fetch(url)
+        return fetched.body if textual?(fetched.content_type)
+        raise Scraper::FetchError,
+              "refused to fetch #{url}: content-type #{fetched.content_type.inspect} " \
+              'is not textual (use web_scrape for PDFs or rendered pages)'
+      end
+      # @param content_type [String] normalized content-type (no +charset+
+      #   parameter, lowercased) as produced by {Scraper::Simple.fetch}
+      # @return [Boolean] true when the content-type is +text/*+ or one
+      #   of {TEXTUAL_APPLICATION_TYPES}
+      def self.textual?(content_type)
+        content_type.start_with?('text/') ||
+          TEXTUAL_APPLICATION_TYPES.include?(content_type)
+      end
+      # Cut +body+ to at most +max_chars+ characters, appending a marker
+      # describing the original length when truncation actually happens.
+      # Returns +body+ unchanged if it already fits. Same shape as
+      # {Tool::WebScrape.truncate} so the LLM sees a consistent
+      # truncation marker across both tools.
+      #
+      # @param body [String] full response body
+      # @param max_chars [Integer] character cap; assumed already clamped
+      # @return [String]
+      def self.truncate(body, max_chars)
+        return body if body.length <= max_chars
+        "#{body[0, max_chars]}\n\n" \
+          "... [truncated at #{max_chars} of #{body.length} chars; " \
+          'call again with a larger `max_chars` to see more]'
+      end
+    end
+    # Verbatim URL download tool. Thin wrapper over {Tool::Fetch.fetch}
+    # that exposes it to the agent loop in OpenAI tool-call shape. Use for
+    # raw textual payloads (JSON APIs, CSV files, +robots.txt+, source
+    # files); use {Tool::WEB_SCRAPE} for rendered web pages or PDFs where
+    # readability extraction makes the result usable.
+    #
+    # @return [Tool]
+    FETCH = new(
+      name: 'fetch',
+      description: <<~DESC,
+        Downloads the given URL and returns its body verbatim.
+        Usage:
+        - Use for raw textual payloads: JSON APIs, CSV files, robots.txt, sitemaps, source files — anywhere a rendering pass would corrupt the data.
+        - For rendered HTML pages or PDFs, use web_scrape — it extracts readable content; fetch returns the raw HTML/PDF bytes unchanged.
+        - Accepts text/* and common textual application/* types (JSON, XML, JS, XHTML, RSS, Atom). Refuses PDFs, images, and other binaries.
+      DESC
+      parameters: Parameters.build { |p|
+        p.required_string :url,
+                          'Absolute URL to download, including the scheme, ' \
+                          'e.g. "https://example.com/data.json".'
+        p.optional_integer :max_chars,
+                           'Maximum number of characters of the body to ' \
+                           'return. Defaults to 5000; hard-capped at ' \
+                           '100000. When the body is longer than this, ' \
+                           'output is cut and a marker reports the full ' \
+                           'length.'
+      },
+      execute: ->(url:, max_chars: Fetch::DEFAULT_MAX_CHARS) {
+        Fetch.fetch(url, max_chars: max_chars)
+      }
+    )
+  end
+end

data/lib/pikuri/tool/glob.rb ADDED Viewed

@@ -0,0 +1,310 @@
+# frozen_string_literal: true
+module Pikuri
+  class Tool
+    # The +glob+ tool — list files matching a glob pattern via
+    # +rg --files+, sorted by modification time (newest first).
+    # Instantiating +Tool::Glob.new(workspace: ws)+ produces a tool
+    # whose {Tool#to_ruby_llm_tool} wiring is identical to any bundled
+    # tool's. Same shape as {Tool::Grep} (workspace captured by the
+    # +execute+ closure, no confirmer — read-only).
+    #
+    # == Why a separate tool from Grep
+    #
+    # The unique capability is *mtime-descending sort* — "what's been
+    # touched recently" is a common navigation move and Grep can't
+    # express it. The rest (filter by name, default to listing all
+    # matching files) is theoretically reachable through Grep with
+    # +pattern="."+, but Glob avoids that hack and keeps Read / Grep /
+    # Glob as three clean roles: read one file, search content, list
+    # files by name.
+    #
+    # == ripgrep dependency
+    #
+    # Hard dependency: {.check_binaries!} runs in +initialize+ and
+    # raises if +rg+ isn't on +PATH+. Each tool owns its own probe so
+    # construction order doesn't matter — Glob doesn't lean on Grep's
+    # check.
+    #
+    # == Argv & filter pipeline
+    #
+    #   rg --files --color=never --hidden --glob '!.git/*' \
+    #      -- <relative-path-or-dot>
+    #   # …then filter the result list in Ruby with File.fnmatch?
+    #
+    # Why not pass the user pattern as +--glob+ to rg? Because rg's
+    # +--glob+ documentation says *"This always overrides any other
+    # ignore logic"* — so +--glob '**/*.rb'+ would re-include
+    # +.gitignore+'d Ruby files, breaking our gitignore-respect
+    # promise. We let rg produce the full gitignore-respecting file
+    # list and filter to the user's pattern in Ruby with
+    # +File.fnmatch?(pattern, p, FNM_PATHNAME | FNM_EXTGLOB |
+    # FNM_DOTMATCH)+. The three flags together cover the common rg
+    # glob cases: +**+ recursion (+FNM_PATHNAME+), +{a,b}+ alternation
+    # (+FNM_EXTGLOB+), and dotfile inclusion (+FNM_DOTMATCH+, matching
+    # rg's +--hidden+ behavior). The +.git/+ exclusion stays on the rg
+    # side so its contents never even reach the Ruby filter.
+    #
+    # * +--hidden+ → search dotfiles (still respects +.gitignore+).
+    # * No +--sort+ flag: we re-sort by mtime in Ruby on the way out.
+    # * Output paths come back as +./...+ when the search path is +.+;
+    #   the leading +./+ is stripped post-rg so the model sees clean
+    #   workspace-relative paths.
+    #
+    # == Sort
+    #
+    # mtime-descending in Ruby after rg returns, with path-ascending
+    # as a tiebreaker for files with equal mtimes (the common case in
+    # fresh checkouts). Cost: one +stat+ per result. Broad patterns
+    # can make this expensive, but in practice rg's +.gitignore+ filter
+    # keeps result sets bounded; if real friction shows up later we can
+    # cap pre-sort.
+    #
+    # == Truncation
+    #
+    # Total output head-truncated to {MAX_BYTES} *after* mtime sort, so
+    # the kept rows are the newest. Matches {Tool::Grep}'s budget and
+    # head-bias.
+    #
+    # == Exit codes
+    #
+    # * +0+ → at least one file; format with footer.
+    # * +1+ → no files; return +"No files match pattern '...'"+.
+    # * +2+ → rg error (bad path, bad glob); return
+    #   +"Error: ripgrep: ..."+.
+    #
+    # == Refusals
+    #
+    # All returned as +"Error: ..."+ observations:
+    #
+    # * Empty +pattern+ → fast reject.
+    # * +path+ is a regular file → fast reject pointing at the +read+
+    #   tool.
+    # * +path+ not found → +"Error: path not found: <path>"+.
+    # * +path+ outside the workspace → caught from
+    #   {Tool::Workspace::Error}.
+    class Glob < Tool
+      # @return [Integer] hard byte cap on combined rg output. Same
+      #   value as {Tool::Grep::MAX_BYTES} so the two file-touching
+      #   tools share a budget shape. Re-declared here rather than
+      #   referenced cross-file because Zeitwerk's eager-load order
+      #   isn't guaranteed between siblings.
+      MAX_BYTES = 50 * 1024
+      # @return [String] human-readable form of {MAX_BYTES}, interpolated
+      #   into {DESCRIPTION}. Frozen explicitly: on Ruby 3.0+ an
+      #   interpolated literal is not frozen by the
+      #   +frozen_string_literal+ magic comment.
+      MAX_BYTES_LABEL = "#{MAX_BYTES / 1024} KB".freeze
+      # Description shown to the LLM. opencode-shape (summary +
+      # +Usage:+ bullets). Per-parameter constraints live in parameter
+      # descriptions. Frozen explicitly because the heredoc interpolates
+      # {MAX_BYTES_LABEL} and would otherwise be a mutable constant.
+      #
+      # @return [String]
+      DESCRIPTION = <<~DESC.freeze
+        List files matching a glob pattern, sorted by modification time (newest first).
+        Usage:
+        - `.gitignore` is respected; for unfiltered listing use bash `rg --no-ignore --files -g <pattern>`.
+        - Glob syntax: `**` matches any number of directories, `*` matches any filename chars (not `/`), `{a,b}` is alternation.
+        - Default search root is the workspace root; pass `path` to narrow to a subdirectory.
+        - Use `glob` to find files by name; use `grep` to find files by content.
+        - Output is sorted by mtime descending — recently-touched files come first, so broad patterns still surface relevant files near the top.
+        - Output is truncated to #{MAX_BYTES_LABEL}; refine the pattern or narrow `path` if the response ends in a truncation marker.
+      DESC
+      # @param workspace [Tool::Workspace] captured for path resolution
+      #   and as +chdir+ for rg. All path arguments route through
+      #   +workspace.resolve_for_read+.
+      # @raise [RuntimeError] if +rg+ isn't on +PATH+; fail-loud at
+      #   construction rather than the first tool call.
+      # @return [Glob]
+      def initialize(workspace:)
+        Glob.send(:check_binaries!)
+        super(
+          name: 'glob',
+          description: DESCRIPTION,
+          parameters: Parameters.build { |p|
+            p.required_string :pattern,
+                              'Glob pattern (** matches any number of ' \
+                              'directories; {a,b} alternation), e.g. ' \
+                              '"**/*.rb" or "lib/**/*_spec.rb".'
+            p.optional_string :path,
+                              'Directory to search in. Relative paths resolve ' \
+                              'against the workspace root. Defaults to the ' \
+                              'workspace root, e.g. "lib/" or "spec/".'
+          },
+          execute: lambda { |pattern:, path: nil|
+            Glob.search(workspace: workspace, pattern: pattern, path: path)
+          }
+        )
+      end
+      # Validate inputs, resolve the path against the workspace, spawn
+      # rg, mtime-sort, head-truncate, render. Returns either the
+      # formatted listing, a "no files match" message, or
+      # +"Error: ..."+.
+      #
+      # @param workspace [Tool::Workspace]
+      # @param pattern [String]
+      # @param path [String, nil]
+      # @return [String]
+      def self.search(workspace:, pattern:, path:)
+        return 'Error: empty pattern.' if pattern.empty?
+        search_target = '.'
+        if path
+          resolved = workspace.resolve_for_read(path)
+          return "Error: path not found: #{path}" unless resolved.exist?
+          if resolved.file?
+            return "Error: #{path} is a file, not a directory; use the read tool to view it."
+          end
+          rel = resolved.relative_path_from(workspace.cwd).to_s
+          search_target = rel
+        end
+        argv = build_argv(path: search_target)
+        result = Pikuri::Subprocess.spawn(*argv, chdir: workspace.cwd.to_s).wait
+        exit_code = result.status.exitstatus
+        case exit_code
+        when 0
+          format_output(result.output, workspace: workspace,
+                        pattern: pattern, path: path)
+        when 1
+          no_match_message(pattern: pattern, path: path)
+        else
+          stderr = result.output.strip
+          stderr = "exited #{exit_code}" if stderr.empty?
+          "Error: ripgrep: #{stderr}"
+        end
+      rescue Tool::Workspace::Error => e
+        "Error: #{e.message}"
+      end
+      # @return [Integer] flags for {File.fnmatch?}: +FNM_PATHNAME+ for
+      #   +**+ recursion + path-aware +/+ matching, +FNM_EXTGLOB+ for
+      #   +{a,b}+ alternation, +FNM_DOTMATCH+ to match dotfiles (rg
+      #   does this when +--hidden+ is set).
+      FNMATCH_FLAGS = File::FNM_PATHNAME | File::FNM_EXTGLOB | File::FNM_DOTMATCH # passed to File.fnmatch? in format_output
+      # Build the +rg+ argv. User pattern is NOT passed to rg — see
+      # the class header for why (rg's +--glob+ overrides
+      # +.gitignore+).
+      #
+      # @return [Array<String>]
+      def self.build_argv(path:)
+        [
+          'rg',
+          '--files',
+          '--color=never',
+          '--hidden',
+          '--glob', '!.git/*',
+          '--', path
+        ]
+      end
+      private_class_method :build_argv
+      # Strip the +./+ prefix rg adds when invoked with +.+ as the
+      # search path, filter to the user pattern with +fnmatch+,
+      # mtime-sort descending (path ascending as tiebreaker),
+      # head-truncate at {MAX_BYTES}, append a footer summarizing the
+      # count.
+      #
+      # @return [String]
+      def self.format_output(raw, workspace:, pattern:, path:)
+        all_paths = raw.split("\n").reject(&:empty?).map { |p| p.sub(%r{\A\./}, '') }
+        paths = all_paths.select { |p| File.fnmatch?(pattern, p, FNMATCH_FLAGS) }
+        return no_match_message(pattern: pattern, path: path) if paths.empty?
+        sorted = mtime_sort(paths, workspace.cwd)
+        joined = sorted.join("\n") + "\n"
+        content, truncation_marker = head_truncate(joined)
+        stripped = content.chomp
+        count = stripped.split("\n").size
+        footer = "Found #{pluralize(count, 'file', 'files')}."
+        [stripped, '', footer + truncation_marker].join("\n")
+      end
+      private_class_method :format_output
+      # mtime descending; path ascending for stable order on ties.
+      #
+      # @return [Array<String>]
+      def self.mtime_sort(paths, cwd)
+        paths
+          .map { |p| [p, mtime_of(cwd + p)] }
+          .sort_by { |(p, m)| [-m, p] }
+          .map(&:first)
+      end
+      private_class_method :mtime_sort
+      # @return [Float] epoch-seconds mtime; 0 for paths we can't stat
+      #   (race between rg listing and our stat, deleted symlinks,
+      #   etc.). The fallback puts unstattable entries at the bottom.
+      def self.mtime_of(absolute)
+        File.mtime(absolute).to_f
+      rescue Errno::ENOENT
+        0.0
+      end
+      private_class_method :mtime_of
+      # Head-truncate +raw+ to {MAX_BYTES}, cutting at the last newline
+      # boundary so the final row is never partial. Returns the
+      # truncated content and a marker String (empty if no truncation).
+      #
+      # @return [Array(String, String)]
+      def self.head_truncate(raw)
+        total = raw.bytesize
+        return [raw, ''] if total <= MAX_BYTES
+        head = raw.byteslice(0, MAX_BYTES)
+        last_nl = head.rindex("\n")
+        head = head.byteslice(0, last_nl) if last_nl
+        omitted = total - head.bytesize
+        marker = "\n\n... [#{omitted} bytes omitted; total was #{total} bytes; " \
+                 'refine pattern or path] ...'
+        [head, marker]
+      end
+      private_class_method :head_truncate
+      # @return [String]
+      def self.no_match_message(pattern:, path:)
+        base = "No files match pattern '#{pattern}'"
+        base += " in #{path}" if path
+        "#{base}."
+      end
+      private_class_method :no_match_message
+      # @return [String] +"1 file"+ / +"2 files"+
+      def self.pluralize(n, sing, plural)
+        "#{n} #{n == 1 ? sing : plural}"
+      end
+      private_class_method :pluralize
+      # Verify +rg+ is reachable on +PATH+. Routed through
+      # {Pikuri::Subprocess.spawn} to honor the subprocess seam. rg
+      # missing surfaces as +Errno::ENOENT+; an installed rg returns
+      # exit 0 from +--version+.
+      #
+      # @return [void]
+      # @raise [RuntimeError] if rg is missing
+      def self.check_binaries!
+        result = Pikuri::Subprocess.spawn('rg', '--version', chdir: '/').wait
+        return if result.status.success?
+        raise install_hint
+      rescue Errno::ENOENT
+        raise install_hint
+      end
+      private_class_method :check_binaries!
+      # @return [String]
+      def self.install_hint
+        "Tool::Glob requires 'rg' (ripgrep) on PATH; install via your " \
+          "distro's package manager (e.g. 'apt install ripgrep')."
+      end
+      private_class_method :install_hint
+    end
+  end
+end