pikuri-core 0.0.5 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +5 -3
- data/lib/pikuri/agent/chat_transport.rb +135 -11
- data/lib/pikuri/agent/configurator.rb +4 -4
- data/lib/pikuri/agent/context_window_detector.rb +103 -52
- data/lib/pikuri/agent/control/step_limit.rb +39 -7
- data/lib/pikuri/agent/event.rb +43 -16
- data/lib/pikuri/agent/extension.rb +31 -17
- data/lib/pikuri/agent/extension_context.rb +147 -0
- data/lib/pikuri/agent/listener/terminal.rb +30 -37
- data/lib/pikuri/agent/listener/token_log.rb +60 -13
- data/lib/pikuri/agent/listener.rb +12 -5
- data/lib/pikuri/agent/listener_list.rb +7 -17
- data/lib/pikuri/agent/synthesizer.rb +93 -67
- data/lib/pikuri/agent.rb +358 -403
- data/lib/pikuri/extractor/html.rb +303 -0
- data/lib/pikuri/extractor/passthrough.rb +64 -0
- data/lib/pikuri/extractor.rb +314 -0
- data/lib/pikuri/file_type.rb +74 -266
- data/lib/pikuri/sanitizer.rb +179 -0
- data/lib/pikuri/subprocess.rb +73 -2
- data/lib/pikuri/tool/calculator.rb +213 -41
- data/lib/pikuri/tool/fetch.rb +10 -9
- data/lib/pikuri/tool/parameters.rb +65 -2
- data/lib/pikuri/tool/scraper.rb +186 -0
- data/lib/pikuri/tool/search/brave.rb +32 -18
- data/lib/pikuri/tool/search/duckduckgo.rb +18 -7
- data/lib/pikuri/tool/search/engines.rb +72 -49
- data/lib/pikuri/tool/search/exa.rb +34 -22
- data/lib/pikuri/tool/web_scrape.rb +5 -5
- data/lib/pikuri/tool/web_search.rb +45 -26
- data/lib/pikuri/version.rb +1 -1
- data/lib/pikuri-core.rb +11 -10
- metadata +9 -66
- data/lib/pikuri/tool/scraper/fetch_error.rb +0 -16
- data/lib/pikuri/tool/scraper/html.rb +0 -285
- data/lib/pikuri/tool/scraper/pdf.rb +0 -54
- data/lib/pikuri/tool/scraper/simple.rb +0 -183
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'faraday'
|
|
4
|
+
require 'stringio'
|
|
5
|
+
require 'uri'
|
|
6
|
+
|
|
7
|
+
module Pikuri
  class Tool
    # HTTP side of the web tools ({Tool::WEB_SCRAPE} and {Tool::FETCH}):
    # GET the URL with a real-browser User-Agent, follow redirects, and
    # hand the response body to {Pikuri::Extractor.extract} with the
    # response's +Content-Type+ as the hint. HTML/XHTML render via
    # {Extractor::HTML}, any other +text/*+ type passes through
    # verbatim, and plug-in extractors extend the set (with pikuri-pdf
    # registered, +application/pdf+ extracts — by header or by +%PDF-+
    # magic, so a PDF served under a lying header still works); the
    # remaining types raise {FetchError} so the LLM observes the
    # failure instead of receiving an empty rendering.
    #
    # Split into a thin HTTP fetch ({.fetch}) and the extraction
    # wrapper ({.visit}) so tests can drive each piece in isolation and
    # {Tool::Fetch} can reuse the HTTP half without the extraction
    # pass. Nothing here knows about the LLM; the tools that wrap this
    # module own caching and truncation and turn rendered Markdown (or
    # {FetchError}) into the next observation.
    module Scraper
      # Raised when a URL cannot be rendered into Markdown text — HTTP
      # non-2xx, network failure, malformed URL, redirect-loop, missing
      # or unusable +Location+, unsupported content-type, or a parse
      # failure that reads as "try a different URL" to the LLM. Catching
      # this in {Tool::WEB_SCRAPE} / {Tool::FETCH} turns the failure into
      # an +"Error: ..."+ observation; anything else bubbles up so
      # genuine bugs stay visible.
      class FetchError < StandardError; end

      # @return [String] User-Agent sent with each request; many sites
      #   reject requests with no UA or an obvious bot UA
      USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \
                   '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
      # @return [String] +Accept+ header sent with each request, so
      #   servers that content-negotiate hand back something we can use:
      #   rendered HTML first, +application/pdf+ for hosts with a PDF
      #   extractor registered, then any +text/*+ for the verbatim
      #   pass-through arm.
      ACCEPT = 'text/html,application/xhtml+xml,application/pdf,text/*;q=0.8'
      # @return [Integer] maximum number of HTTP redirects to follow
      #   before giving up
      MAX_REDIRECTS = 5
      # @return [Integer] connect timeout in seconds for the underlying
      #   Faraday request
      OPEN_TIMEOUT = 10
      # @return [Integer] read timeout in seconds for the underlying
      #   Faraday request
      READ_TIMEOUT = 20

      # @return [Integer] maximum number of characters of an error
      #   response body to include in a {FetchError} message. The body is
      #   often a multi-kilobyte HTML challenge page (Cloudflare, WAF
      #   interstitial, etc.); a short excerpt tells the LLM what kind of
      #   page came back without flooding the next observation.
      ERROR_BODY_EXCERPT = 200

      # Result of a successful {Scraper.fetch}: the response body, the
      # normalized content-type (lower-cased, with any +; charset=...+
      # parameters stripped), and the final URL after redirects.
      Fetched = Data.define(:body, :content_type, :url)

      # Fetch +url+ and render its main content as Markdown.
      #
      # No caching here — every call hits the network. Callers that want
      # to memoize results should wrap this method themselves (see
      # {Tool::WebScrape.visit}, which does exactly that).
      #
      # The extracted output is +String#strip+'d so the LLM never sees
      # a body that opens or closes with blank lines — common with
      # extracted PDFs' page-feed whitespace and with text bodies that
      # carry a trailing newline. Interior whitespace is preserved
      # because Markdown paragraph breaks and source-code indentation
      # are load-bearing.
      #
      # @param url [String] absolute HTTP(S) URL of the page to download
      # @return [String] full Markdown representation of the page with
      #   leading/trailing whitespace trimmed, uncapped otherwise —
      #   caller is responsible for any size limiting before feeding
      #   the result back to the LLM
      # @raise [FetchError] on HTTP non-2xx, network failure, malformed
      #   URL, redirect loop, a 3xx without a usable +Location+ header,
      #   a response no extractor recognizes, or an extraction failure
      #   (malformed PDF, ...)
      def self.visit(url)
        extract(fetch(url)).strip
      end

      # Render a {Fetched} response as Markdown via
      # {Pikuri::Extractor.extract}, re-raising both extraction failure
      # modes as {FetchError} — the single exception type the web tools
      # rescue. The content-type is passed verbatim (including the +""+
      # of a missing header, which matches no text arm — a body without
      # transport metadata is refused, not sniffed; only a strong magic
      # sniff like pikuri-pdf's +%PDF-+ overrides a wrong or missing
      # header, because such a sniff never misfires on text).
      #
      # @param fetched [Fetched]
      # @return [String] Markdown representation produced by the
      #   matched extractor
      # @raise [FetchError] when no extractor matches the response's
      #   content-type, or when extraction fails
      def self.extract(fetched)
        Pikuri::Extractor.extract(StringIO.new(fetched.body), content_type: fetched.content_type)
      rescue Pikuri::Extractor::Unsupported
        raise FetchError, "unsupported content-type #{fetched.content_type.inspect} for #{fetched.url}"
      rescue Pikuri::Extractor::Error => e
        raise FetchError, e.message
      end

      # Download the body of +url+, manually following up to
      # {MAX_REDIRECTS} redirects. Faraday is configured with no
      # middleware so behavior here mirrors the rest of the codebase
      # (see +Tool::Search::DuckDuckGo.search+).
      #
      # All recoverable failures — HTTP 4xx/5xx, +Faraday::Error+ network
      # blips, a URL that cannot be parsed, exhausted redirect budget,
      # 3xx without a usable +Location+ — surface as {FetchError} so the
      # caller has a single exception type to rescue. Error bodies are
      # trimmed to {ERROR_BODY_EXCERPT} characters with whitespace
      # collapsed, so a Cloudflare-challenge response doesn't dump
      # kilobytes of inline HTML into the next LLM observation.
      #
      # @param url [String] absolute HTTP(S) URL to fetch
      # @param limit [Integer] redirects still allowed; recurses with
      #   +limit - 1+ on each 3xx. +0+ means "make this request but
      #   follow nothing further"
      # @return [Fetched] body, normalized content-type, and final URL
      #   after redirects
      # @raise [FetchError] on non-2xx/3xx responses, network errors,
      #   malformed URLs, redirect-budget exhaustion, or a 3xx whose
      #   +Location+ header is missing or unparseable
      def self.fetch(url, limit: MAX_REDIRECTS)
        # +limit+ counts redirects, not requests: the guard trips only
        # once a hop beyond the budget is attempted, so a chain of
        # exactly MAX_REDIRECTS redirects still resolves.
        raise FetchError, "too many redirects fetching #{url}" if limit.negative?

        response = begin
          Faraday.new(request: { open_timeout: OPEN_TIMEOUT, timeout: READ_TIMEOUT }).get(url) do |req|
            req.headers['User-Agent'] = USER_AGENT
            req.headers['Accept'] = ACCEPT
          end
        rescue Faraday::Error, URI::Error => e
          # URI::Error covers a URL the HTTP client cannot even parse;
          # that is a "try a different URL" failure, not a bug here.
          raise FetchError, "#{e.class.name.split('::').last} fetching #{url}: #{e.message}"
        end

        case response.status
        when 200..299
          Fetched.new(body: response.body, content_type: normalize_content_type(response.headers['content-type']), url: url)
        when 300..399
          fetch(redirect_target(url, response), limit: limit - 1)
        else
          raise FetchError, "HTTP #{response.status} fetching #{url}: #{excerpt(response.body)}"
        end
      end

      # Resolve the +Location+ header of a 3xx +response+ against the
      # URL that produced it, so relative redirects work.
      #
      # @param url [String] URL that returned the redirect
      # @param response [#status, #headers] the 3xx response
      # @return [String] absolute URL to request next
      # @raise [FetchError] when +Location+ is missing, empty, or cannot
      #   be parsed/joined (servers do send unescaped spaces and similar)
      def self.redirect_target(url, response)
        location = response.headers['location']
        raise FetchError, "HTTP #{response.status} from #{url} with no Location header" if location.nil? || location.empty?

        URI.join(url, location).to_s
      rescue URI::Error => e
        raise FetchError,
              "HTTP #{response.status} from #{url} with unusable Location header #{location.inspect}: #{e.message}"
      end
      private_class_method :redirect_target

      # Lower-case +raw+ and strip any +; charset=...+ parameters so the
      # extractors can match on a canonical token.
      #
      # @param raw [String, nil] raw +Content-Type+ header value
      # @return [String] normalized content-type, or +""+ when the
      #   header was missing
      def self.normalize_content_type(raw)
        raw.to_s.split(';').first.to_s.strip.downcase
      end
      private_class_method :normalize_content_type

      # Whitespace-collapse +body+ and clip to {ERROR_BODY_EXCERPT}
      # characters, so the {FetchError} message stays a single readable
      # line even when the server returned a multi-KB HTML challenge
      # page.
      #
      # @param body [String, nil]
      # @return [String]
      def self.excerpt(body)
        # Scrub first: an error body tagged with a charset it does not
        # actually conform to would otherwise make +gsub+ raise
        # ArgumentError and mask the real HTTP failure.
        text = body.to_s.scrub.gsub(/\s+/, ' ').strip
        text.length > ERROR_BODY_EXCERPT ? "#{text[0, ERROR_BODY_EXCERPT]}..." : text
      end
      private_class_method :excerpt
    end
  end
end
|
|
@@ -9,13 +9,17 @@ module Pikuri
|
|
|
9
9
|
module Search
|
|
10
10
|
# Performs a Brave Search via the official Web Search API and returns
|
|
11
11
|
# the hits as a list of {Result} rows. Split into a thin HTTP fetch
|
|
12
|
-
# (#search) and a pure parser (
|
|
12
|
+
# (#search) and a pure parser (.parse) so tests can exercise the
|
|
13
13
|
# parser against fixture JSON without hitting the network. The
|
|
14
|
-
# cascade in {Engines
|
|
14
|
+
# cascade in {Engines#search} owns the final Markdown rendering.
|
|
15
15
|
#
|
|
16
|
-
#
|
|
17
|
-
#
|
|
18
|
-
#
|
|
16
|
+
# A class constructed with the API key it should use
|
|
17
|
+
# (+Brave.new(api_key:)+); {Engines} builds one only when a Brave key
|
|
18
|
+
# was configured and then drives it through the same +#search+ /
|
|
19
|
+
# +#label+ interface as every other provider. pikuri reads no key
|
|
20
|
+
# from the environment (see CLAUDE.md "Environment is not a secret
|
|
21
|
+
# store"). Get a key at https://api-dashboard.search.brave.com — the
|
|
22
|
+
# free "Data for Search" tier allows 1 query/sec and ~2k queries/month.
|
|
19
23
|
#
|
|
20
24
|
# == Privacy posture
|
|
21
25
|
#
|
|
@@ -32,49 +36,59 @@ module Pikuri
|
|
|
32
36
|
# 90-day retention by default, real ZDR if you pay for it. Still a
|
|
33
37
|
# logged 90-day window on the cheap tier, so not a substitute for
|
|
34
38
|
# ZDR for genuinely sensitive queries.
|
|
35
|
-
|
|
39
|
+
class Brave
|
|
36
40
|
# @return [String] Web Search endpoint
|
|
37
41
|
ENDPOINT = 'https://api.search.brave.com/res/v1/web/search'
|
|
38
42
|
# @return [Integer] default number of results returned, matching
|
|
39
43
|
# {DuckDuckGo::DEFAULT_MAX_RESULTS}
|
|
40
44
|
DEFAULT_MAX_RESULTS = 10
|
|
41
|
-
# @return [String] env var holding the API key; +X-Subscription-Token+
|
|
42
|
-
ENV_KEY = 'BRAVE_SEARCH_API_KEY'
|
|
43
45
|
# @return [RateLimiter] free-tier Brave caps at 1 req/sec; the
|
|
44
46
|
# 5-minute cooldown protects the limited monthly quota from
|
|
45
47
|
# being burned on doomed retries when a 429 hits.
|
|
46
48
|
LIMITER = RateLimiter.new(min_interval: 1.0, cooldown: 300.0)
|
|
47
49
|
|
|
50
|
+
# @param api_key [String] Brave Search subscription token. Required
|
|
51
|
+
# and non-blank: pikuri reads no key from the environment — the
|
|
52
|
+
# host supplies it ({Engines} only constructs a Brave when a key
|
|
53
|
+
# was configured).
|
|
54
|
+
# @raise [ArgumentError] if +api_key+ is blank
|
|
55
|
+
def initialize(api_key:)
|
|
56
|
+
raise ArgumentError, 'Brave Search API key is blank' if api_key.to_s.strip.empty?
|
|
57
|
+
|
|
58
|
+
@api_key = api_key
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
# @return [String] short provider label for {Engines} logging /
|
|
62
|
+
# fallback messages.
|
|
63
|
+
def label
|
|
64
|
+
'Brave'
|
|
65
|
+
end
|
|
66
|
+
|
|
48
67
|
# Fetch results for +query+ and return them as an +Array<Result>+.
|
|
49
68
|
# Calls are throttled to one per second and circuit-broken for 5
|
|
50
69
|
# minutes on rate-limit / quota-exhausted responses; see {LIMITER}.
|
|
51
|
-
# The caller (typically {Engines
|
|
70
|
+
# The caller (typically {Engines#search}) is expected to have
|
|
52
71
|
# already normalized the query and to wrap this in a result cache.
|
|
53
72
|
#
|
|
54
73
|
# @param query [String] search query (already normalized)
|
|
55
74
|
# @param max_results [Integer] maximum number of result entries;
|
|
56
75
|
# passed through as Brave's +count+ (1..20)
|
|
57
|
-
# @param api_key [String] Brave Search subscription token; defaults to
|
|
58
|
-
# the {ENV_KEY} environment variable
|
|
59
76
|
# @return [Array<Result>] hits, possibly empty when Brave ran the
|
|
60
77
|
# query and matched nothing
|
|
61
|
-
# @raise [ArgumentError] if no API key is available
|
|
62
78
|
# @raise [Engines::Unavailable] when Brave returns HTTP 429
|
|
63
79
|
# (rate limit / quota exhausted) or 5xx — "try again later"
|
|
64
|
-
# responses the cascade in {Engines
|
|
80
|
+
# responses the cascade in {Engines#search} can fall back
|
|
65
81
|
# from. Also raised immediately if {LIMITER} is in cooldown.
|
|
66
82
|
# Other non-2xx (e.g. 401/403 from a bad API key) bubble up as
|
|
67
83
|
# +RuntimeError+ so config problems stay visible.
|
|
68
84
|
# @raise [RuntimeError] for non-rate-limit HTTP failures or when the
|
|
69
85
|
# response shape contains no results.
|
|
70
|
-
def
|
|
71
|
-
raise ArgumentError, "Brave Search API key not set (#{ENV_KEY})" if api_key.to_s.strip.empty?
|
|
72
|
-
|
|
86
|
+
def search(query, max_results: DEFAULT_MAX_RESULTS)
|
|
73
87
|
LIMITER.call do
|
|
74
88
|
response = Faraday.get(
|
|
75
89
|
ENDPOINT,
|
|
76
90
|
{ q: query, count: max_results },
|
|
77
|
-
{ 'X-Subscription-Token' => api_key, 'Accept' => 'application/json' }
|
|
91
|
+
{ 'X-Subscription-Token' => @api_key, 'Accept' => 'application/json' }
|
|
78
92
|
)
|
|
79
93
|
unless response.success?
|
|
80
94
|
if response.status == 429 || response.status >= 500
|
|
@@ -84,7 +98,7 @@ module Pikuri
|
|
|
84
98
|
raise "Brave Search request failed: #{response.status} #{response.body}"
|
|
85
99
|
end
|
|
86
100
|
|
|
87
|
-
parse(response.body, max_results: max_results)
|
|
101
|
+
self.class.parse(response.body, max_results: max_results)
|
|
88
102
|
end
|
|
89
103
|
end
|
|
90
104
|
|
|
@@ -9,9 +9,14 @@ module Pikuri
|
|
|
9
9
|
module Search
|
|
10
10
|
# Performs a DuckDuckGo search by scraping +html.duckduckgo.com+ and
|
|
11
11
|
# returns the hits as a list of {Result} rows. Split into a thin HTTP
|
|
12
|
-
# fetch (#search) and a pure parser (
|
|
12
|
+
# fetch (#search) and a pure parser (.parse) so tests can exercise
|
|
13
13
|
# the parser against fixture HTML without hitting the network. The
|
|
14
|
-
# cascade in {Engines
|
|
14
|
+
# cascade in {Engines#search} owns the final Markdown rendering.
|
|
15
|
+
#
|
|
16
|
+
# A class (constructed with no arguments) so it shares the uniform
|
|
17
|
+
# provider shape with the keyed {Brave} / {Exa}: {Engines} holds a
|
|
18
|
+
# list of provider *instances* and calls +#search+ / +#label+ on each
|
|
19
|
+
# without caring which is which.
|
|
15
20
|
#
|
|
16
21
|
# == Privacy posture
|
|
17
22
|
#
|
|
@@ -30,7 +35,7 @@ module Pikuri
|
|
|
30
35
|
# Microsoft, who has no comparable no-training pledge. Better than
|
|
31
36
|
# Exa for sensitive queries, worse than Brave; for anything
|
|
32
37
|
# genuinely embarrassing, don't search the web at all.
|
|
33
|
-
|
|
38
|
+
class DuckDuckGo
|
|
34
39
|
# @return [String] HTML search endpoint
|
|
35
40
|
ENDPOINT = 'https://html.duckduckgo.com/html/'
|
|
36
41
|
# @return [String] User-Agent sent with each request; DDG often rejects
|
|
@@ -44,10 +49,16 @@ module Pikuri
|
|
|
44
49
|
# soft-block response doesn't get retried for the next 5 minutes
|
|
45
50
|
LIMITER = RateLimiter.new(min_interval: 2.0, cooldown: 300.0)
|
|
46
51
|
|
|
52
|
+
# @return [String] short provider label for {Engines} logging /
|
|
53
|
+
# fallback messages. Uniform across providers (see {Brave#label}).
|
|
54
|
+
def label
|
|
55
|
+
'DuckDuckGo'
|
|
56
|
+
end
|
|
57
|
+
|
|
47
58
|
# Fetch results for +query+ and return them as an +Array<Result>+.
|
|
48
59
|
# Calls are throttled to one every 2s and circuit-broken for 5 minutes
|
|
49
60
|
# after a soft-block; see {LIMITER}. The caller (typically
|
|
50
|
-
# {Engines
|
|
61
|
+
# {Engines#search}) is expected to have already normalized the
|
|
51
62
|
# query and to wrap this in a result cache.
|
|
52
63
|
#
|
|
53
64
|
# @param query [String] search query (already normalized)
|
|
@@ -56,12 +67,12 @@ module Pikuri
|
|
|
56
67
|
# query and matched nothing
|
|
57
68
|
# @raise [Engines::Unavailable] when DDG soft-blocks the IP
|
|
58
69
|
# (anomaly/CAPTCHA page) or returns HTTP 429/5xx — i.e. "try again
|
|
59
|
-
# later" responses the cascade in {Engines
|
|
70
|
+
# later" responses the cascade in {Engines#search} can fall
|
|
60
71
|
# back from. Also raised immediately if {LIMITER} is in cooldown.
|
|
61
72
|
# @raise [RuntimeError] if the HTTP call fails for other reasons or
|
|
62
73
|
# the empty-results page is in an unrecognized layout. A genuine
|
|
63
74
|
# empty-results page is *not* an error; see {.parse}.
|
|
64
|
-
def
|
|
75
|
+
def search(query, max_results: DEFAULT_MAX_RESULTS)
|
|
65
76
|
LIMITER.call do
|
|
66
77
|
response = Faraday.get(ENDPOINT, { q: query }, { 'User-Agent' => USER_AGENT })
|
|
67
78
|
unless response.success?
|
|
@@ -72,7 +83,7 @@ module Pikuri
|
|
|
72
83
|
raise "DuckDuckGo request failed: #{response.status} #{response.body}"
|
|
73
84
|
end
|
|
74
85
|
|
|
75
|
-
parse(response.body, max_results: max_results)
|
|
86
|
+
self.class.parse(response.body, max_results: max_results)
|
|
76
87
|
end
|
|
77
88
|
end
|
|
78
89
|
|
|
@@ -2,20 +2,31 @@
|
|
|
2
2
|
|
|
3
3
|
module Pikuri
|
|
4
4
|
class Tool
|
|
5
|
-
# Namespace for the web-search stack used by {Tool::
|
|
5
|
+
# Namespace for the web-search stack used by {Tool::WebSearch}: per-
|
|
6
6
|
# provider modules ({DuckDuckGo}, {Brave}, {Exa}), the {Result} value
|
|
7
7
|
# object they all return, the cross-provider {Engines} cascade with
|
|
8
8
|
# its on-disk cache, and the shared {RateLimiter} a provider can wire
|
|
9
9
|
# in to back off when a quota header says so.
|
|
10
10
|
module Search
|
|
11
|
-
# Search-orchestration
|
|
11
|
+
# Search-orchestration object: the cascade across configured
|
|
12
12
|
# providers, the result cache, and the {Unavailable} protocol marker
|
|
13
|
-
# the cascade uses to fall back. The LLM-facing tool itself
|
|
14
|
-
#
|
|
15
|
-
#
|
|
13
|
+
# the cascade uses to fall back. The LLM-facing tool itself is built
|
|
14
|
+
# by {Tool::WebSearch.build}, which constructs one of these and wires
|
|
15
|
+
# its {#search} into a {Tool}. Each {Tool::Search} provider module
|
|
16
16
|
# ({DuckDuckGo}, {Brave}, {Exa}) raises {Unavailable} when it wants
|
|
17
17
|
# the cascade to try the next one.
|
|
18
|
-
|
|
18
|
+
#
|
|
19
|
+
# == Provider keys are constructor config, not environment
|
|
20
|
+
#
|
|
21
|
+
# Brave and Exa are paid and need an API key; DuckDuckGo needs none.
|
|
22
|
+
# An {Engines} is constructed with the keys it should use
|
|
23
|
+
# (+brave_key:+ / +exa_key:+, both optional) — pikuri reads no key
|
|
24
|
+
# from the environment, so the only providers in the cascade are
|
|
25
|
+
# DuckDuckGo plus whichever keyed providers the host actually
|
|
26
|
+
# configured. The host sources those keys however it likes (the
|
|
27
|
+
# bundled +bin/+ examples load a JSON config file by convention); see
|
|
28
|
+
# CLAUDE.md "Environment is not a secret store".
|
|
29
|
+
class Engines
|
|
19
30
|
# Subsystem logger; set its level with +PIKURI_LOG_ENGINES+
|
|
20
31
|
# (e.g. +PIKURI_LOG_ENGINES=debug+) or the global +PIKURI_LOG+.
|
|
21
32
|
#
|
|
@@ -24,40 +35,54 @@ module Pikuri
|
|
|
24
35
|
|
|
25
36
|
# Raised by a provider when it is temporarily unavailable (rate-limited,
|
|
26
37
|
# bot-blocked, quota-exhausted, or otherwise saying "try again later"
|
|
27
|
-
# rather than "your request is wrong"). The cascade in {
|
|
38
|
+
# rather than "your request is wrong"). The cascade in {#search}
|
|
28
39
|
# catches this and tries the next provider; any other exception bubbles
|
|
29
40
|
# up unchanged so genuine bugs and config errors stay visible.
|
|
30
41
|
class Unavailable < StandardError; end
|
|
31
42
|
|
|
32
|
-
#
|
|
33
|
-
#
|
|
34
|
-
#
|
|
35
|
-
#
|
|
36
|
-
#
|
|
37
|
-
#
|
|
38
|
-
# @return [Array<Module>] +Tool::Search::*+ provider modules, each
|
|
39
|
-
# exposing +.search(query, max_results:)+ → +Array<Result>+
|
|
40
|
-
def self.providers
|
|
41
|
-
list = [DuckDuckGo]
|
|
42
|
-
list << Brave unless ENV[Brave::ENV_KEY].to_s.strip.empty?
|
|
43
|
-
list << Exa unless ENV[Exa::ENV_KEY].to_s.strip.empty?
|
|
44
|
-
list
|
|
45
|
-
end
|
|
46
|
-
|
|
47
|
-
# On-disk cache used by {.search} to memoize answered queries.
|
|
48
|
-
# Defined as a method so specs can swap it for an isolated cache
|
|
49
|
-
# or {UrlCache::NULL} without touching the shared instance.
|
|
43
|
+
# Process-shared on-disk cache backing {#search}'s default. Kept at
|
|
44
|
+
# class level (not per-instance) so every engine dedupes answered
|
|
45
|
+
# queries into one directory; the constructor's +cache:+ parameter
|
|
46
|
+
# injects a different store for tests. Exposed as a method so specs
|
|
47
|
+
# can swap it for {UrlCache::NULL} without touching the instance.
|
|
50
48
|
#
|
|
51
49
|
# @return [UrlCache, #fetch]
|
|
52
50
|
CACHE = UrlCache.new(ttl: UrlCache::DEFAULT_TTL, dir: "#{UrlCache::ROOT_DIR}/web_search")
|
|
53
|
-
# Accessor for {CACHE}
|
|
54
|
-
# {UrlCache::NULL}
|
|
51
|
+
# Accessor for {CACHE}, used as the constructor's +cache:+ default;
|
|
52
|
+
# specs override this to swap in {UrlCache::NULL}.
|
|
55
53
|
#
|
|
56
54
|
# @return [UrlCache, #fetch]
|
|
57
55
|
def self.cache
|
|
58
56
|
CACHE
|
|
59
57
|
end
|
|
60
58
|
|
|
59
|
+
# Builds the provider cascade once: {DuckDuckGo} always (no key
|
|
60
|
+
# needed), plus {Brave} / {Exa} when their key was supplied
|
|
61
|
+
# (non-blank). Each keyed provider is constructed with its key, so
|
|
62
|
+
# from here on every provider is just an object answering +#search+
|
|
63
|
+
# / +#label+ — the cascade in {#search} treats them uniformly.
|
|
64
|
+
#
|
|
65
|
+
# @param brave_key [String, nil] Brave Search subscription token;
|
|
66
|
+
# non-blank ⇒ Brave joins the cascade. +nil+/blank ⇒ not configured.
|
|
67
|
+
# @param exa_key [String, nil] Exa API key; non-blank ⇒ Exa joins the
|
|
68
|
+
# cascade. +nil+/blank ⇒ not configured.
|
|
69
|
+
# @param cache [UrlCache, #fetch] result store memoizing answered
|
|
70
|
+
# queries; defaults to the process-shared {.cache}.
|
|
71
|
+
# @return [Engines]
|
|
72
|
+
def initialize(brave_key: nil, exa_key: nil, cache: self.class.cache)
|
|
73
|
+
@providers = [DuckDuckGo.new]
|
|
74
|
+
@providers << Brave.new(api_key: brave_key) unless brave_key.to_s.strip.empty?
|
|
75
|
+
@providers << Exa.new(api_key: exa_key) unless exa_key.to_s.strip.empty?
|
|
76
|
+
@cache = cache
|
|
77
|
+
@last_logged_providers = nil
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
# The provider instances this engine cascades across, in
|
|
81
|
+
# declaration order (the cascade itself shuffles them per call).
|
|
82
|
+
#
|
|
83
|
+
# @return [Array<#search, #label>] configured provider instances
|
|
84
|
+
attr_reader :providers
|
|
85
|
+
|
|
61
86
|
# Run +query+ through the configured providers in random order, falling
|
|
62
87
|
# back to the next one each time a provider raises {Unavailable}. The
|
|
63
88
|
# shuffle spreads load so a single provider isn't always hit first
|
|
@@ -68,13 +93,13 @@ module Pikuri
|
|
|
68
93
|
# +Array<Result>+ is rendered into smolagents-style Markdown here
|
|
69
94
|
# (+"## Search Results"+ header, then +[title](url)\nbody+ entries
|
|
70
95
|
# joined by blank lines; an empty array becomes +"No results found."+),
|
|
71
|
-
# and the rendered Markdown is cached on disk via {
|
|
72
|
-
# the cleaned query. A cache hit short-circuits the
|
|
73
|
-
# (and benefits whichever provider would have
|
|
74
|
-
# — once a query is cached, the cooldown state
|
|
75
|
-
# answering provider no longer matters). +max_results+
|
|
76
|
-
# of the cache key, so callers passing a non-default value
|
|
77
|
-
# a result rendered with the previously-cached size.
|
|
96
|
+
# and the rendered Markdown is cached on disk via {#initialize}'s
|
|
97
|
+
# +cache:+, keyed by the cleaned query. A cache hit short-circuits the
|
|
98
|
+
# cascade entirely (and benefits whichever provider would have
|
|
99
|
+
# answered next time too — once a query is cached, the cooldown state
|
|
100
|
+
# of the original answering provider no longer matters). +max_results+
|
|
101
|
+
# is not part of the cache key, so callers passing a non-default value
|
|
102
|
+
# may get a result rendered with the previously-cached size.
|
|
78
103
|
#
|
|
79
104
|
# If every provider reports temporary unavailability, returns an
|
|
80
105
|
# +"Error: ..."+ string instead of raising — same convention as
|
|
@@ -88,7 +113,7 @@ module Pikuri
|
|
|
88
113
|
# @return [String] Markdown-formatted result list, or +"Error: ..."+
|
|
89
114
|
# when all providers are exhausted
|
|
90
115
|
# @raise [ArgumentError] if the query is empty after normalization
|
|
91
|
-
def
|
|
116
|
+
def search(query, max_results:)
|
|
92
117
|
cleaned = query.to_s.strip.gsub(/\s+/, ' ')
|
|
93
118
|
raise ArgumentError, 'query is empty' if cleaned.empty?
|
|
94
119
|
|
|
@@ -96,7 +121,7 @@ module Pikuri
|
|
|
96
121
|
log_providers(current_providers)
|
|
97
122
|
|
|
98
123
|
hit = true
|
|
99
|
-
result = cache.fetch(cleaned) do
|
|
124
|
+
result = @cache.fetch(cleaned) do
|
|
100
125
|
hit = false
|
|
101
126
|
failures = []
|
|
102
127
|
results = nil
|
|
@@ -106,7 +131,7 @@ module Pikuri
|
|
|
106
131
|
chosen = provider
|
|
107
132
|
break
|
|
108
133
|
rescue Unavailable => e
|
|
109
|
-
failures << "#{provider.
|
|
134
|
+
failures << "#{provider.label} (#{e.message})"
|
|
110
135
|
end
|
|
111
136
|
# Raise so {UrlCache#fetch} does NOT persist the all-unavailable
|
|
112
137
|
# message — otherwise that string would block every future search
|
|
@@ -115,7 +140,7 @@ module Pikuri
|
|
|
115
140
|
chosen or raise Unavailable, "all search providers temporarily unavailable: #{failures.join('; ')}"
|
|
116
141
|
|
|
117
142
|
LOGGER.info do
|
|
118
|
-
"engine=#{chosen.
|
|
143
|
+
"engine=#{chosen.label} query=#{cleaned.inspect} results=#{results.size}"
|
|
119
144
|
end
|
|
120
145
|
render(results)
|
|
121
146
|
end
|
|
@@ -125,6 +150,8 @@ module Pikuri
|
|
|
125
150
|
"Error: #{e.message}"
|
|
126
151
|
end
|
|
127
152
|
|
|
153
|
+
private
|
|
154
|
+
|
|
128
155
|
# Render an +Array<Result>+ into the smolagents-style Markdown the
|
|
129
156
|
# LLM consumes: +"## Search Results"+ header, then +[title](url)\nbody+
|
|
130
157
|
# entries joined by blank lines. An empty array becomes the
|
|
@@ -133,30 +160,26 @@ module Pikuri
|
|
|
133
160
|
#
|
|
134
161
|
# @param results [Array<Result>] hits from the winning provider
|
|
135
162
|
# @return [String] Markdown-formatted result list
|
|
136
|
-
def
|
|
163
|
+
def render(results)
|
|
137
164
|
return "## Search Results\n\nNo results found." if results.empty?
|
|
138
165
|
|
|
139
166
|
"## Search Results\n\n" + results.map { |r| "[#{r.title}](#{r.url})\n#{r.body}" }.join("\n\n")
|
|
140
167
|
end
|
|
141
|
-
private_class_method :render
|
|
142
168
|
|
|
143
169
|
# Emit an INFO log line listing the currently-available providers,
|
|
144
|
-
# but only when the set differs from the last one
|
|
145
|
-
#
|
|
146
|
-
#
|
|
147
|
-
# keeps the log to one line per distinct configuration rather
|
|
148
|
-
# than one per search.
|
|
170
|
+
# but only when the set differs from the last one this engine
|
|
171
|
+
# logged. The memo keeps the log to one line per distinct
|
|
172
|
+
# configuration rather than one per search.
|
|
149
173
|
#
|
|
150
|
-
# @param current [Array
|
|
174
|
+
# @param current [Array<#label>] providers returned by {#providers}
|
|
151
175
|
# @return [void]
|
|
152
|
-
def
|
|
176
|
+
def log_providers(current)
|
|
153
177
|
return if @last_logged_providers == current
|
|
154
178
|
|
|
155
179
|
@last_logged_providers = current
|
|
156
|
-
names = current.map
|
|
180
|
+
names = current.map(&:label).join(', ')
|
|
157
181
|
LOGGER.info("engines available: #{names}")
|
|
158
182
|
end
|
|
159
|
-
private_class_method :log_providers
|
|
160
183
|
end
|
|
161
184
|
end
|
|
162
185
|
end
|