pikuri-core 0.0.5 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. checksums.yaml +4 -4
  2. data/README.md +5 -3
  3. data/lib/pikuri/agent/chat_transport.rb +135 -11
  4. data/lib/pikuri/agent/configurator.rb +4 -4
  5. data/lib/pikuri/agent/context_window_detector.rb +103 -52
  6. data/lib/pikuri/agent/control/step_limit.rb +39 -7
  7. data/lib/pikuri/agent/event.rb +43 -16
  8. data/lib/pikuri/agent/extension.rb +31 -17
  9. data/lib/pikuri/agent/extension_context.rb +147 -0
  10. data/lib/pikuri/agent/listener/terminal.rb +30 -37
  11. data/lib/pikuri/agent/listener/token_log.rb +60 -13
  12. data/lib/pikuri/agent/listener.rb +12 -5
  13. data/lib/pikuri/agent/listener_list.rb +7 -17
  14. data/lib/pikuri/agent/synthesizer.rb +93 -67
  15. data/lib/pikuri/agent.rb +358 -403
  16. data/lib/pikuri/extractor/html.rb +303 -0
  17. data/lib/pikuri/extractor/passthrough.rb +64 -0
  18. data/lib/pikuri/extractor.rb +314 -0
  19. data/lib/pikuri/file_type.rb +74 -266
  20. data/lib/pikuri/sanitizer.rb +179 -0
  21. data/lib/pikuri/subprocess.rb +73 -2
  22. data/lib/pikuri/tool/calculator.rb +213 -41
  23. data/lib/pikuri/tool/fetch.rb +10 -9
  24. data/lib/pikuri/tool/parameters.rb +65 -2
  25. data/lib/pikuri/tool/scraper.rb +186 -0
  26. data/lib/pikuri/tool/search/brave.rb +32 -18
  27. data/lib/pikuri/tool/search/duckduckgo.rb +18 -7
  28. data/lib/pikuri/tool/search/engines.rb +72 -49
  29. data/lib/pikuri/tool/search/exa.rb +34 -22
  30. data/lib/pikuri/tool/web_scrape.rb +5 -5
  31. data/lib/pikuri/tool/web_search.rb +45 -26
  32. data/lib/pikuri/version.rb +1 -1
  33. data/lib/pikuri-core.rb +11 -10
  34. metadata +9 -66
  35. data/lib/pikuri/tool/scraper/fetch_error.rb +0 -16
  36. data/lib/pikuri/tool/scraper/html.rb +0 -285
  37. data/lib/pikuri/tool/scraper/pdf.rb +0 -54
  38. data/lib/pikuri/tool/scraper/simple.rb +0 -183
@@ -0,0 +1,186 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'faraday'
4
+ require 'stringio'
5
+ require 'uri'
6
+
7
+ module Pikuri
8
+ class Tool
9
+ # HTTP side of the web tools ({Tool::WEB_SCRAPE} and {Tool::FETCH}):
10
+ # GET the URL with a real-browser User-Agent, follow redirects, and
11
+ # hand the response body to {Pikuri::Extractor.extract} with the
12
+ # response's +Content-Type+ as the hint. HTML/XHTML render via
13
+ # {Extractor::HTML}, any other +text/*+ type passes through
14
+ # verbatim, and plug-in extractors extend the set (with pikuri-pdf
15
+ # registered, +application/pdf+ extracts — by header or by +%PDF-+
16
+ # magic, so a PDF served under a lying header still works); the
17
+ # remaining types raise {FetchError} so the LLM observes the
18
+ # failure instead of receiving an empty rendering.
19
+ #
20
+ # Split into a thin HTTP fetch ({.fetch}) and the extraction
21
+ # wrapper ({.visit}) so tests can drive each piece in isolation and
22
+ # {Tool::Fetch} can reuse the HTTP half without the extraction
23
+ # pass. Nothing here knows about the LLM; the tools that wrap this
24
+ # module own caching and truncation and turn rendered Markdown (or
25
+ # {FetchError}) into the next observation.
26
+ module Scraper
27
+ # Raised when a URL cannot be rendered into Markdown text — HTTP
28
+ # non-2xx, network failure, redirect-loop, missing +Location+,
29
+ # unsupported content-type, or a parse failure that reads as "try
30
+ # a different URL" to the LLM. Catching this in
31
+ # {Tool::WEB_SCRAPE} / {Tool::FETCH} turns the failure into an
32
+ # +"Error: ..."+ observation; anything else bubbles up so genuine
33
+ # bugs stay visible.
34
+ class FetchError < StandardError; end
35
+
36
+ # @return [String] User-Agent sent with each request; many sites
37
+ # reject requests with no UA or an obvious bot UA
38
+ USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \
39
+ '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
40
+ # @return [String] +Accept+ header sent with each request, so
41
+ # servers that content-negotiate hand back something we can use:
42
+ # rendered HTML first, +application/pdf+ for hosts with a PDF
43
+ # extractor registered, then any +text/*+ for the verbatim
44
+ # pass-through arm.
45
+ ACCEPT = 'text/html,application/xhtml+xml,application/pdf,text/*;q=0.8'
46
+ # @return [Integer] maximum number of HTTP redirects to follow
47
+ # before giving up
48
+ MAX_REDIRECTS = 5
49
+ # @return [Integer] connect timeout in seconds for the underlying
50
+ # Faraday request
51
+ OPEN_TIMEOUT = 10
52
+ # @return [Integer] read timeout in seconds for the underlying
53
+ # Faraday request
54
+ READ_TIMEOUT = 20
55
+
56
+ # @return [Integer] maximum number of characters of an error
57
+ # response body to include in a {FetchError} message. The body is
58
+ # often a multi-kilobyte HTML challenge page (Cloudflare, WAF
59
+ # interstitial, etc.); a short excerpt tells the LLM what kind of
60
+ # page came back without flooding the next observation.
61
+ ERROR_BODY_EXCERPT = 200
62
+
63
+ # Result of a successful {Scraper.fetch}: the response body, the
64
+ # normalized content-type (lower-cased, with any +; charset=...+
65
+ # parameters stripped), and the final URL after redirects.
66
+ Fetched = Data.define(:body, :content_type, :url)
67
+
68
+ # Fetch +url+ and render its main content as Markdown.
69
+ #
70
+ # No caching here — every call hits the network. Callers that want
71
+ # to memoize results should wrap this method themselves (see
72
+ # {Tool::WebScrape.visit}, which does exactly that).
73
+ #
74
+ # The extracted output is +String#strip+'d so the LLM never sees
75
+ # a body that opens or closes with blank lines — common with
76
+ # extracted PDFs' page-feed whitespace and with text bodies that
77
+ # carry a trailing newline. Interior whitespace is preserved
78
+ # because Markdown paragraph breaks and source-code indentation
79
+ # are load-bearing.
80
+ #
81
+ # @param url [String] absolute HTTP(S) URL of the page to download
82
+ # @return [String] full Markdown representation of the page with
83
+ # leading/trailing whitespace trimmed, uncapped otherwise —
84
+ # caller is responsible for any size limiting before feeding
85
+ # the result back to the LLM
86
+ # @raise [FetchError] on HTTP non-2xx, network failure, redirect
87
+ # loop, a 3xx without a +Location+ header, a response no
88
+ # extractor recognizes, or an extraction failure (malformed
89
+ # PDF, ...)
90
+ def self.visit(url)
91
+ extract(fetch(url)).strip
92
+ end
93
+
94
+ # Render a {Fetched} response as Markdown via
95
+ # {Pikuri::Extractor.extract}, re-raising both extraction failure
96
+ # modes as {FetchError} — the single exception type the web tools
97
+ # rescue. The content-type is passed verbatim (including the +""+
98
+ # of a missing header, which matches no text arm — a body without
99
+ # transport metadata is refused, not sniffed; only a strong magic
100
+ # sniff like pikuri-pdf's +%PDF-+ overrides a wrong or missing
101
+ # header, because such a sniff never misfires on text).
102
+ #
103
+ # @param fetched [Fetched]
104
+ # @return [String] Markdown representation produced by the
105
+ # matched extractor
106
+ # @raise [FetchError] when no extractor matches the response's
107
+ # content-type, or when extraction fails
108
+ def self.extract(fetched)
109
+ Pikuri::Extractor.extract(StringIO.new(fetched.body), content_type: fetched.content_type)
110
+ rescue Pikuri::Extractor::Unsupported
111
+ raise FetchError, "unsupported content-type #{fetched.content_type.inspect} for #{fetched.url}"
112
+ rescue Pikuri::Extractor::Error => e
113
+ raise FetchError, e.message
114
+ end
115
+
116
+ # Download the body of +url+, manually following redirects for at most
117
+ # {MAX_REDIRECTS} requests in total. Faraday is configured with no
118
+ # middleware so behavior here mirrors the rest of the codebase
119
+ # (see +Tool::Search::DuckDuckGo.search+).
120
+ #
121
+ # All recoverable failures — HTTP 4xx/5xx, +Faraday::Error+ network
122
+ # blips, exhausted redirect budget, 3xx without a +Location+ —
123
+ # surface as {FetchError} so the caller has a single exception type
124
+ # to rescue. Error bodies are trimmed to {ERROR_BODY_EXCERPT}
125
+ # characters with whitespace collapsed, so a Cloudflare-challenge
126
+ # response doesn't dump kilobytes of inline HTML into the next LLM
127
+ # observation.
128
+ #
129
+ # @param url [String] absolute HTTP(S) URL to fetch
130
+ # @param limit [Integer] requests remaining (this one included);
131
+ # recurses with +limit - 1+ on each 3xx
132
+ # @return [Fetched] body, normalized content-type, and final URL
133
+ # after redirects
134
+ # @raise [FetchError] on non-2xx/3xx responses, network errors,
135
+ # redirect-loop exhaustion, or 3xx without a +Location+ header
136
+ def self.fetch(url, limit: MAX_REDIRECTS)
137
+ raise FetchError, "too many redirects fetching #{url}" if limit.zero?
138
+
139
+ response = begin
140
+ Faraday.new(request: { open_timeout: OPEN_TIMEOUT, timeout: READ_TIMEOUT }).get(url) do |req|
141
+ req.headers['User-Agent'] = USER_AGENT
142
+ req.headers['Accept'] = ACCEPT
143
+ end
144
+ rescue Faraday::Error => e
145
+ raise FetchError, "#{e.class.name.split('::').last} fetching #{url}: #{e.message}"
146
+ end
147
+
148
+ case response.status
149
+ when 200..299
150
+ Fetched.new(body: response.body, content_type: normalize_content_type(response.headers['content-type']), url: url)
151
+ when 300..399
152
+ location = response.headers['location']
153
+ raise FetchError, "HTTP #{response.status} from #{url} with no Location header" if location.nil? || location.empty?
154
+
155
+ fetch(URI.join(url, location).to_s, limit: limit - 1)
156
+ else
157
+ raise FetchError, "HTTP #{response.status} fetching #{url}: #{excerpt(response.body)}"
158
+ end
159
+ end
160
+
161
+ # Lower-case +raw+ and strip any +; charset=...+ parameters so the
162
+ # extractors can match on a canonical token.
163
+ #
164
+ # @param raw [String, nil] raw +Content-Type+ header value
165
+ # @return [String] normalized content-type, or +""+ when the
166
+ # header was missing
167
+ def self.normalize_content_type(raw)
168
+ raw.to_s.split(';').first.to_s.strip.downcase
169
+ end
170
+ private_class_method :normalize_content_type
171
+
172
+ # Whitespace-collapse +body+ and clip to {ERROR_BODY_EXCERPT}
173
+ # characters, so the {FetchError} message stays a single readable
174
+ # line even when the server returned a multi-KB HTML challenge
175
+ # page.
176
+ #
177
+ # @param body [String, nil]
178
+ # @return [String]
179
+ def self.excerpt(body)
180
+ text = body.to_s.gsub(/\s+/, ' ').strip
181
+ text.length > ERROR_BODY_EXCERPT ? "#{text[0, ERROR_BODY_EXCERPT]}..." : text
182
+ end
183
+ private_class_method :excerpt
184
+ end
185
+ end
186
+ end
@@ -9,13 +9,17 @@ module Pikuri
9
9
  module Search
10
10
  # Performs a Brave Search via the official Web Search API and returns
11
11
  # the hits as a list of {Result} rows. Split into a thin HTTP fetch
12
- # (#search) and a pure parser (#parse) so tests can exercise the
12
+ # (#search) and a pure parser (.parse) so tests can exercise the
13
13
  # parser against fixture JSON without hitting the network. The
14
- # cascade in {Engines.search} owns the final Markdown rendering.
14
+ # cascade in {Engines#search} owns the final Markdown rendering.
15
15
  #
16
- # Requires a Brave Search API key. Get one at
17
- # https://api-dashboard.search.brave.com the free "Data for Search"
18
- # tier allows 1 query/sec and ~2k queries/month.
16
+ # A class constructed with the API key it should use
17
+ # (+Brave.new(api_key:)+); {Engines} builds one only when a Brave key
18
+ # was configured and then drives it through the same +#search+ /
19
+ # +#label+ interface as every other provider. pikuri reads no key
20
+ # from the environment (see CLAUDE.md "Environment is not a secret
21
+ # store"). Get a key at https://api-dashboard.search.brave.com — the
22
+ # free "Data for Search" tier allows 1 query/sec and ~2k queries/month.
19
23
  #
20
24
  # == Privacy posture
21
25
  #
@@ -32,49 +36,59 @@ module Pikuri
32
36
  # 90-day retention by default, real ZDR if you pay for it. Still a
33
37
  # logged 90-day window on the cheap tier, so not a substitute for
34
38
  # ZDR for genuinely sensitive queries.
35
- module Brave
39
+ class Brave
36
40
  # @return [String] Web Search endpoint
37
41
  ENDPOINT = 'https://api.search.brave.com/res/v1/web/search'
38
42
  # @return [Integer] default number of results returned, matching
39
43
  # {DuckDuckGo::DEFAULT_MAX_RESULTS}
40
44
  DEFAULT_MAX_RESULTS = 10
41
- # @return [String] env var holding the API key; +X-Subscription-Token+
42
- ENV_KEY = 'BRAVE_SEARCH_API_KEY'
43
45
  # @return [RateLimiter] free-tier Brave caps at 1 req/sec; the
44
46
  # 5-minute cooldown protects the limited monthly quota from
45
47
  # being burned on doomed retries when a 429 hits.
46
48
  LIMITER = RateLimiter.new(min_interval: 1.0, cooldown: 300.0)
47
49
 
50
+ # @param api_key [String] Brave Search subscription token. Required
51
+ # and non-blank: pikuri reads no key from the environment — the
52
+ # host supplies it ({Engines} only constructs a Brave when a key
53
+ # was configured).
54
+ # @raise [ArgumentError] if +api_key+ is blank
55
+ def initialize(api_key:)
56
+ raise ArgumentError, 'Brave Search API key is blank' if api_key.to_s.strip.empty?
57
+
58
+ @api_key = api_key
59
+ end
60
+
61
+ # @return [String] short provider label for {Engines} logging /
62
+ # fallback messages.
63
+ def label
64
+ 'Brave'
65
+ end
66
+
48
67
  # Fetch results for +query+ and return them as an +Array<Result>+.
49
68
  # Calls are throttled to one per second and circuit-broken for 5
50
69
  # minutes on rate-limit / quota-exhausted responses; see {LIMITER}.
51
- # The caller (typically {Engines.search}) is expected to have
70
+ # The caller (typically {Engines#search}) is expected to have
52
71
  # already normalized the query and to wrap this in a result cache.
53
72
  #
54
73
  # @param query [String] search query (already normalized)
55
74
  # @param max_results [Integer] maximum number of result entries;
56
75
  # passed through as Brave's +count+ (1..20)
57
- # @param api_key [String] Brave Search subscription token; defaults to
58
- # the {ENV_KEY} environment variable
59
76
  # @return [Array<Result>] hits, possibly empty when Brave ran the
60
77
  # query and matched nothing
61
- # @raise [ArgumentError] if no API key is available
62
78
  # @raise [Engines::Unavailable] when Brave returns HTTP 429
63
79
  # (rate limit / quota exhausted) or 5xx — "try again later"
64
- # responses the cascade in {Engines.search} can fall back
80
+ # responses the cascade in {Engines#search} can fall back
65
81
  # from. Also raised immediately if {LIMITER} is in cooldown.
66
82
  # Other non-2xx (e.g. 401/403 from a bad API key) bubble up as
67
83
  # +RuntimeError+ so config problems stay visible.
68
84
  # @raise [RuntimeError] for non-rate-limit HTTP failures or when the
69
85
  # response shape contains no results.
70
- def self.search(query, max_results: DEFAULT_MAX_RESULTS, api_key: ENV.fetch(ENV_KEY, nil))
71
- raise ArgumentError, "Brave Search API key not set (#{ENV_KEY})" if api_key.to_s.strip.empty?
72
-
86
+ def search(query, max_results: DEFAULT_MAX_RESULTS)
73
87
  LIMITER.call do
74
88
  response = Faraday.get(
75
89
  ENDPOINT,
76
90
  { q: query, count: max_results },
77
- { 'X-Subscription-Token' => api_key, 'Accept' => 'application/json' }
91
+ { 'X-Subscription-Token' => @api_key, 'Accept' => 'application/json' }
78
92
  )
79
93
  unless response.success?
80
94
  if response.status == 429 || response.status >= 500
@@ -84,7 +98,7 @@ module Pikuri
84
98
  raise "Brave Search request failed: #{response.status} #{response.body}"
85
99
  end
86
100
 
87
- parse(response.body, max_results: max_results)
101
+ self.class.parse(response.body, max_results: max_results)
88
102
  end
89
103
  end
90
104
 
@@ -9,9 +9,14 @@ module Pikuri
9
9
  module Search
10
10
  # Performs a DuckDuckGo search by scraping +html.duckduckgo.com+ and
11
11
  # returns the hits as a list of {Result} rows. Split into a thin HTTP
12
- # fetch (#search) and a pure parser (#parse) so tests can exercise
12
+ # fetch (#search) and a pure parser (.parse) so tests can exercise
13
13
  # the parser against fixture HTML without hitting the network. The
14
- # cascade in {Engines.search} owns the final Markdown rendering.
14
+ # cascade in {Engines#search} owns the final Markdown rendering.
15
+ #
16
+ # A class (constructed with no arguments) so it shares the uniform
17
+ # provider shape with the keyed {Brave} / {Exa}: {Engines} holds a
18
+ # list of provider *instances* and calls +#search+ / +#label+ on each
19
+ # without caring which is which.
15
20
  #
16
21
  # == Privacy posture
17
22
  #
@@ -30,7 +35,7 @@ module Pikuri
30
35
  # Microsoft, who has no comparable no-training pledge. Better than
31
36
  # Exa for sensitive queries, worse than Brave; for anything
32
37
  # genuinely embarrassing, don't search the web at all.
33
- module DuckDuckGo
38
+ class DuckDuckGo
34
39
  # @return [String] HTML search endpoint
35
40
  ENDPOINT = 'https://html.duckduckgo.com/html/'
36
41
  # @return [String] User-Agent sent with each request; DDG often rejects
@@ -44,10 +49,16 @@ module Pikuri
44
49
  # soft-block response doesn't get retried for the next 5 minutes
45
50
  LIMITER = RateLimiter.new(min_interval: 2.0, cooldown: 300.0)
46
51
 
52
+ # @return [String] short provider label for {Engines} logging /
53
+ # fallback messages. Uniform across providers (see {Brave#label}).
54
+ def label
55
+ 'DuckDuckGo'
56
+ end
57
+
47
58
  # Fetch results for +query+ and return them as an +Array<Result>+.
48
59
  # Calls are throttled to one every 2s and circuit-broken for 5 minutes
49
60
  # after a soft-block; see {LIMITER}. The caller (typically
50
- # {Engines.search}) is expected to have already normalized the
61
+ # {Engines#search}) is expected to have already normalized the
51
62
  # query and to wrap this in a result cache.
52
63
  #
53
64
  # @param query [String] search query (already normalized)
@@ -56,12 +67,12 @@ module Pikuri
56
67
  # query and matched nothing
57
68
  # @raise [Engines::Unavailable] when DDG soft-blocks the IP
58
69
  # (anomaly/CAPTCHA page) or returns HTTP 429/5xx — i.e. "try again
59
- # later" responses the cascade in {Engines.search} can fall
70
+ # later" responses the cascade in {Engines#search} can fall
60
71
  # back from. Also raised immediately if {LIMITER} is in cooldown.
61
72
  # @raise [RuntimeError] if the HTTP call fails for other reasons or
62
73
  # the empty-results page is in an unrecognized layout. A genuine
63
74
  # empty-results page is *not* an error; see {.parse}.
64
- def self.search(query, max_results: DEFAULT_MAX_RESULTS)
75
+ def search(query, max_results: DEFAULT_MAX_RESULTS)
65
76
  LIMITER.call do
66
77
  response = Faraday.get(ENDPOINT, { q: query }, { 'User-Agent' => USER_AGENT })
67
78
  unless response.success?
@@ -72,7 +83,7 @@ module Pikuri
72
83
  raise "DuckDuckGo request failed: #{response.status} #{response.body}"
73
84
  end
74
85
 
75
- parse(response.body, max_results: max_results)
86
+ self.class.parse(response.body, max_results: max_results)
76
87
  end
77
88
  end
78
89
 
@@ -2,20 +2,31 @@
2
2
 
3
3
  module Pikuri
4
4
  class Tool
5
- # Namespace for the web-search stack used by {Tool::WEB_SEARCH}: per-
5
+ # Namespace for the web-search stack used by {Tool::WebSearch}: per-
6
6
  # provider modules ({DuckDuckGo}, {Brave}, {Exa}), the {Result} value
7
7
  # object they all return, the cross-provider {Engines} cascade with
8
8
  # its on-disk cache, and the shared {RateLimiter} a provider can wire
9
9
  # in to back off when a quota header says so.
10
10
  module Search
11
- # Search-orchestration entry point: the cascade across configured
11
+ # Search-orchestration object: the cascade across configured
12
12
  # providers, the result cache, and the {Unavailable} protocol marker
13
- # the cascade uses to fall back. The LLM-facing tool itself
14
- # ({Tool::WEB_SEARCH}) lives in +lib/tool/web_search.rb+ and calls
15
- # into {.search} below. Each {Tool::Search} provider module
13
+ # the cascade uses to fall back. The LLM-facing tool itself is built
14
+ # by {Tool::WebSearch.build}, which constructs one of these and wires
15
+ # its {#search} into a {Tool}. Each {Tool::Search} provider class
16
16
  # ({DuckDuckGo}, {Brave}, {Exa}) raises {Unavailable} when it wants
17
17
  # the cascade to try the next one.
18
- module Engines
18
+ #
19
+ # == Provider keys are constructor config, not environment
20
+ #
21
+ # Brave and Exa are paid and need an API key; DuckDuckGo needs none.
22
+ # An {Engines} is constructed with the keys it should use
23
+ # (+brave_key:+ / +exa_key:+, both optional) — pikuri reads no key
24
+ # from the environment, so the only providers in the cascade are
25
+ # DuckDuckGo plus whichever keyed providers the host actually
26
+ # configured. The host sources those keys however it likes (the
27
+ # bundled +bin/+ examples load a JSON config file by convention); see
28
+ # CLAUDE.md "Environment is not a secret store".
29
+ class Engines
19
30
  # Subsystem logger; set its level with +PIKURI_LOG_ENGINES+
20
31
  # (e.g. +PIKURI_LOG_ENGINES=debug+) or the global +PIKURI_LOG+.
21
32
  #
@@ -24,40 +35,54 @@ module Pikuri
24
35
 
25
36
  # Raised by a provider when it is temporarily unavailable (rate-limited,
26
37
  # bot-blocked, quota-exhausted, or otherwise saying "try again later"
27
- # rather than "your request is wrong"). The cascade in {Engines.search}
38
+ # rather than "your request is wrong"). The cascade in {#search}
28
39
  # catches this and tries the next provider; any other exception bubbles
29
40
  # up unchanged so genuine bugs and config errors stay visible.
30
41
  class Unavailable < StandardError; end
31
42
 
32
- # All providers that are currently configured. {DuckDuckGo} is always
33
- # available (no API key needed); {Brave} and {Exa} each join the
34
- # list when their API token is present in the environment. Recomputed
35
- # on every call so a process picks up a newly-set token without a
36
- # restart.
37
- #
38
- # @return [Array<Module>] +Tool::Search::*+ provider modules, each
39
- # exposing +.search(query, max_results:)+ → +Array<Result>+
40
- def self.providers
41
- list = [DuckDuckGo]
42
- list << Brave unless ENV[Brave::ENV_KEY].to_s.strip.empty?
43
- list << Exa unless ENV[Exa::ENV_KEY].to_s.strip.empty?
44
- list
45
- end
46
-
47
- # On-disk cache used by {.search} to memoize answered queries.
48
- # Defined as a method so specs can swap it for an isolated cache
49
- # or {UrlCache::NULL} without touching the shared instance.
43
+ # Process-shared on-disk cache backing {#search}'s default. Kept at
44
+ # class level (not per-instance) so every engine dedupes answered
45
+ # queries into one directory; the constructor's +cache:+ parameter
46
+ # injects a different store for tests. Exposed via {.cache} so specs
47
+ # can swap it for {UrlCache::NULL} without touching the shared instance.
50
48
  #
51
49
  # @return [UrlCache, #fetch]
52
50
  CACHE = UrlCache.new(ttl: UrlCache::DEFAULT_TTL, dir: "#{UrlCache::ROOT_DIR}/web_search")
53
- # Accessor for {CACHE}; specs override this to swap in
54
- # {UrlCache::NULL} or an isolated cache.
51
+ # Accessor for {CACHE}, used as the constructor's +cache:+ default;
52
+ # specs override this to swap in {UrlCache::NULL}.
55
53
  #
56
54
  # @return [UrlCache, #fetch]
57
55
  def self.cache
58
56
  CACHE
59
57
  end
60
58
 
59
+ # Builds the provider cascade once: {DuckDuckGo} always (no key
60
+ # needed), plus {Brave} / {Exa} when their key was supplied
61
+ # (non-blank). Each keyed provider is constructed with its key, so
62
+ # from here on every provider is just an object answering +#search+
63
+ # / +#label+ — the cascade in {#search} treats them uniformly.
64
+ #
65
+ # @param brave_key [String, nil] Brave Search subscription token;
66
+ # non-blank ⇒ Brave joins the cascade. +nil+/blank ⇒ not configured.
67
+ # @param exa_key [String, nil] Exa API key; non-blank ⇒ Exa joins the
68
+ # cascade. +nil+/blank ⇒ not configured.
69
+ # @param cache [UrlCache, #fetch] result store memoizing answered
70
+ # queries; defaults to the process-shared {.cache}.
71
+ # @return [Engines]
72
+ def initialize(brave_key: nil, exa_key: nil, cache: self.class.cache)
73
+ @providers = [DuckDuckGo.new]
74
+ @providers << Brave.new(api_key: brave_key) unless brave_key.to_s.strip.empty?
75
+ @providers << Exa.new(api_key: exa_key) unless exa_key.to_s.strip.empty?
76
+ @cache = cache
77
+ @last_logged_providers = nil
78
+ end
79
+
80
+ # The provider instances this engine cascades across, in
81
+ # declaration order (the cascade itself shuffles them per call).
82
+ #
83
+ # @return [Array<#search, #label>] configured provider instances
84
+ attr_reader :providers
85
+
61
86
  # Run +query+ through the configured providers in random order, falling
62
87
  # back to the next one each time a provider raises {Unavailable}. The
63
88
  # shuffle spreads load so a single provider isn't always hit first
@@ -68,13 +93,13 @@ module Pikuri
68
93
  # +Array<Result>+ is rendered into smolagents-style Markdown here
69
94
  # (+"## Search Results"+ header, then +[title](url)\nbody+ entries
70
95
  # joined by blank lines; an empty array becomes +"No results found."+),
71
- # and the rendered Markdown is cached on disk via {.cache}, keyed by
72
- # the cleaned query. A cache hit short-circuits the cascade entirely
73
- # (and benefits whichever provider would have answered next time too
74
- # — once a query is cached, the cooldown state of the original
75
- # answering provider no longer matters). +max_results+ is not part
76
- # of the cache key, so callers passing a non-default value may get
77
- # a result rendered with the previously-cached size.
96
+ # and the rendered Markdown is cached on disk via {#initialize}'s
97
+ # +cache:+, keyed by the cleaned query. A cache hit short-circuits the
98
+ # cascade entirely (and benefits whichever provider would have
99
+ # answered next time too — once a query is cached, the cooldown state
100
+ # of the original answering provider no longer matters). +max_results+
101
+ # is not part of the cache key, so callers passing a non-default value
102
+ # may get a result rendered with the previously-cached size.
78
103
  #
79
104
  # If every provider reports temporary unavailability, returns an
80
105
  # +"Error: ..."+ string instead of raising — same convention as
@@ -88,7 +113,7 @@ module Pikuri
88
113
  # @return [String] Markdown-formatted result list, or +"Error: ..."+
89
114
  # when all providers are exhausted
90
115
  # @raise [ArgumentError] if the query is empty after normalization
91
- def self.search(query, max_results:)
116
+ def search(query, max_results:)
92
117
  cleaned = query.to_s.strip.gsub(/\s+/, ' ')
93
118
  raise ArgumentError, 'query is empty' if cleaned.empty?
94
119
 
@@ -96,7 +121,7 @@ module Pikuri
96
121
  log_providers(current_providers)
97
122
 
98
123
  hit = true
99
- result = cache.fetch(cleaned) do
124
+ result = @cache.fetch(cleaned) do
100
125
  hit = false
101
126
  failures = []
102
127
  results = nil
@@ -106,7 +131,7 @@ module Pikuri
106
131
  chosen = provider
107
132
  break
108
133
  rescue Unavailable => e
109
- failures << "#{provider.name.split('::').last} (#{e.message})"
134
+ failures << "#{provider.label} (#{e.message})"
110
135
  end
111
136
  # Raise so {UrlCache#fetch} does NOT persist the all-unavailable
112
137
  # message — otherwise that string would block every future search
@@ -115,7 +140,7 @@ module Pikuri
115
140
  chosen or raise Unavailable, "all search providers temporarily unavailable: #{failures.join('; ')}"
116
141
 
117
142
  LOGGER.info do
118
- "engine=#{chosen.name.split('::').last} query=#{cleaned.inspect} results=#{results.size}"
143
+ "engine=#{chosen.label} query=#{cleaned.inspect} results=#{results.size}"
119
144
  end
120
145
  render(results)
121
146
  end
@@ -125,6 +150,8 @@ module Pikuri
125
150
  "Error: #{e.message}"
126
151
  end
127
152
 
153
+ private
154
+
128
155
  # Render an +Array<Result>+ into the smolagents-style Markdown the
129
156
  # LLM consumes: +"## Search Results"+ header, then +[title](url)\nbody+
130
157
  # entries joined by blank lines. An empty array becomes the
@@ -133,30 +160,26 @@ module Pikuri
133
160
  #
134
161
  # @param results [Array<Result>] hits from the winning provider
135
162
  # @return [String] Markdown-formatted result list
136
- def self.render(results)
163
+ def render(results)
137
164
  return "## Search Results\n\nNo results found." if results.empty?
138
165
 
139
166
  "## Search Results\n\n" + results.map { |r| "[#{r.title}](#{r.url})\n#{r.body}" }.join("\n\n")
140
167
  end
141
- private_class_method :render
142
168
 
143
169
  # Emit an INFO log line listing the currently-available providers,
144
- # but only when the set differs from the last one we logged.
145
- # {.providers} is recomputed on every {.search} call so a process
146
- # picks up newly-set API keys without a restart; the memo here
147
- # keeps the log to one line per distinct configuration rather
148
- # than one per search.
170
+ # but only when the set differs from the last one this engine
171
+ # logged. The memo keeps the log to one line per distinct
172
+ # configuration rather than one per search.
149
173
  #
150
- # @param current [Array<Module>] providers returned by {.providers}
174
+ # @param current [Array<#label>] providers returned by {#providers}
151
175
  # @return [void]
152
- def self.log_providers(current)
176
+ def log_providers(current)
153
177
  return if @last_logged_providers == current
154
178
 
155
179
  @last_logged_providers = current
156
- names = current.map { |p| p.name.split('::').last }.join(', ')
180
+ names = current.map(&:label).join(', ')
157
181
  LOGGER.info("engines available: #{names}")
158
182
  end
159
- private_class_method :log_providers
160
183
  end
161
184
  end
162
185
  end