pikuri 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +62 -0
- data/GETTING_STARTED.md +223 -0
- data/LICENSE +21 -0
- data/README.md +193 -0
- data/lib/pikuri/agent/chat_transport.rb +41 -0
- data/lib/pikuri/agent/context_window_detector.rb +101 -0
- data/lib/pikuri/agent/listener/in_memory_message_list.rb +33 -0
- data/lib/pikuri/agent/listener/message_listener.rb +93 -0
- data/lib/pikuri/agent/listener/step_limit.rb +97 -0
- data/lib/pikuri/agent/listener/terminal.rb +137 -0
- data/lib/pikuri/agent/listener/token_log.rb +166 -0
- data/lib/pikuri/agent/listener_list.rb +113 -0
- data/lib/pikuri/agent/message.rb +61 -0
- data/lib/pikuri/agent/synthesizer.rb +120 -0
- data/lib/pikuri/agent/tokens.rb +56 -0
- data/lib/pikuri/agent.rb +286 -0
- data/lib/pikuri/subprocess.rb +166 -0
- data/lib/pikuri/tool/bash.rb +272 -0
- data/lib/pikuri/tool/calculator.rb +82 -0
- data/lib/pikuri/tool/confirmer.rb +96 -0
- data/lib/pikuri/tool/edit.rb +196 -0
- data/lib/pikuri/tool/fetch.rb +167 -0
- data/lib/pikuri/tool/glob.rb +310 -0
- data/lib/pikuri/tool/grep.rb +338 -0
- data/lib/pikuri/tool/parameters.rb +314 -0
- data/lib/pikuri/tool/read.rb +254 -0
- data/lib/pikuri/tool/scraper/fetch_error.rb +16 -0
- data/lib/pikuri/tool/scraper/html.rb +285 -0
- data/lib/pikuri/tool/scraper/pdf.rb +54 -0
- data/lib/pikuri/tool/scraper/simple.rb +177 -0
- data/lib/pikuri/tool/search/brave.rb +184 -0
- data/lib/pikuri/tool/search/duckduckgo.rb +196 -0
- data/lib/pikuri/tool/search/engines.rb +154 -0
- data/lib/pikuri/tool/search/exa.rb +217 -0
- data/lib/pikuri/tool/search/rate_limiter.rb +92 -0
- data/lib/pikuri/tool/search/result.rb +29 -0
- data/lib/pikuri/tool/skill.rb +80 -0
- data/lib/pikuri/tool/skill_catalog.rb +376 -0
- data/lib/pikuri/tool/sub_agent.rb +102 -0
- data/lib/pikuri/tool/web_scrape.rb +117 -0
- data/lib/pikuri/tool/web_search.rb +38 -0
- data/lib/pikuri/tool/workspace.rb +150 -0
- data/lib/pikuri/tool/write.rb +170 -0
- data/lib/pikuri/tool.rb +118 -0
- data/lib/pikuri/url_cache.rb +106 -0
- data/lib/pikuri/version.rb +10 -0
- data/lib/pikuri.rb +165 -0
- data/prompts/coding-system-prompt.txt +28 -0
- data/prompts/pikuri-chat.txt +15 -0
- metadata +259 -0
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'faraday'
|
|
4
|
+
require 'uri'
|
|
5
|
+
|
|
6
|
+
module Pikuri
|
|
7
|
+
class Tool
|
|
8
|
+
module Scraper
|
|
9
|
+
# Plain HTTP scraper: GET the URL with a real-browser User-Agent,
|
|
10
|
+
# follow redirects, and dispatch the response body to the parser
|
|
11
|
+
# matching its +Content-Type+. HTML and XHTML route to
|
|
12
|
+
# {HTML.extract}; +application/pdf+ routes to {PDF.extract}; any
|
|
13
|
+
# other +text/*+ type (plain text, Markdown, source files, …) is
|
|
14
|
+
# passed through verbatim since the LLM can already read it; the
|
|
15
|
+
# remaining types raise {FetchError} so the LLM observes the
|
|
16
|
+
# failure instead of receiving an empty rendering.
|
|
17
|
+
#
|
|
18
|
+
# Split into a thin HTTP fetch ({.fetch}) and a content-type
|
|
19
|
+
# dispatcher ({.visit}) so tests can drive each piece in isolation.
|
|
20
|
+
# "Simple" because everything happens in one Faraday GET — no
|
|
21
|
+
# headless browser, no JS execution.
|
|
22
|
+
module Simple
|
|
23
|
+
# @return [String] User-Agent sent with each request; many sites
#   reject requests with no UA or an obvious bot UA
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \
             '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
# @return [String] +Accept+ header sent with each request. Lists
#   every content-type the dispatcher in {.visit} knows how to
#   render, so servers that content-negotiate hand back something
#   we can use. The trailing +text/*;q=0.8+ covers the verbatim
#   pass-through arm (plain text, Markdown, source files, …) at a
#   lower preference than rendered HTML/PDF.
ACCEPT = 'text/html,application/xhtml+xml,application/pdf,text/*;q=0.8'
# @return [Integer] maximum number of HTTP redirects to follow
#   before giving up (see the review note on {.fetch} about the
#   exact counting at the boundary)
MAX_REDIRECTS = 5
# @return [Integer] connect timeout in seconds for the underlying
#   Faraday request
OPEN_TIMEOUT = 10
# @return [Integer] read timeout in seconds for the underlying
#   Faraday request
READ_TIMEOUT = 20

# @return [Integer] maximum number of characters of an error
#   response body to include in a {FetchError} message. The body is
#   often a multi-kilobyte HTML challenge page (Cloudflare, WAF
#   interstitial, etc.); a short excerpt tells the LLM what kind of
#   page came back without flooding the next observation.
ERROR_BODY_EXCERPT = 200

# Result of a successful {Simple.fetch}: the response body, the
# normalized content-type (lower-cased, with any +; charset=...+
# parameters stripped), and the final URL after redirects. The
# final URL is kept so future scrapers can resolve relative links
# against the actual landing page rather than the originally
# requested one.
Fetched = Data.define(:body, :content_type, :url)
|
|
58
|
+
|
|
59
|
+
# Fetch +url+ and render its main content as Markdown.
|
|
60
|
+
#
|
|
61
|
+
# No caching here — every call hits the network. Callers that want
|
|
62
|
+
# to memoize results should wrap this method themselves (see
|
|
63
|
+
# {Tool::WebScrape.visit}, which does exactly that).
|
|
64
|
+
#
|
|
65
|
+
# The dispatcher's output is +String#strip+'d so the LLM never
|
|
66
|
+
# sees a body that opens or closes with blank lines — common with
|
|
67
|
+
# +pdf-reader+'s page-feed whitespace and with text bodies that
|
|
68
|
+
# carry a trailing newline. Interior whitespace is preserved
|
|
69
|
+
# because Markdown paragraph breaks and source-code indentation
|
|
70
|
+
# are load-bearing.
|
|
71
|
+
#
|
|
72
|
+
# @param url [String] absolute HTTP(S) URL of the page to download
|
|
73
|
+
# @return [String] full Markdown representation of the page with
|
|
74
|
+
# leading/trailing whitespace trimmed, uncapped otherwise —
|
|
75
|
+
# caller is responsible for any size limiting before feeding
|
|
76
|
+
# the result back to the LLM
|
|
77
|
+
# @raise [FetchError] on HTTP non-2xx, network failure, redirect
|
|
78
|
+
# loop, a 3xx without a +Location+ header, or a response whose
|
|
79
|
+
# content-type the dispatcher does not recognize
|
|
80
|
+
# Fetch +url+ and render its main content as Markdown.
#
# Every call hits the network — no caching happens here; callers
# wanting memoization wrap this themselves (see
# {Tool::WebScrape.visit}). The rendered output is +strip+'d so the
# LLM never sees leading/trailing blank lines; interior whitespace
# is left alone because Markdown paragraph breaks and code
# indentation carry meaning.
#
# @param url [String] absolute HTTP(S) URL of the page to download
# @return [String] full Markdown representation of the page,
#   trimmed at both ends but otherwise uncapped — size limiting is
#   the caller's job
# @raise [FetchError] on HTTP non-2xx, network failure, redirect
#   loop, a 3xx without a +Location+ header, or an unrecognized
#   content-type
def self.visit(url)
  fetched = fetch(url)
  dispatch(fetched).strip
end
|
|
83
|
+
|
|
84
|
+
# Download the body of +url+, manually following up to
|
|
85
|
+
# {MAX_REDIRECTS} redirects. Faraday is configured with no
|
|
86
|
+
# middleware so behavior here mirrors the rest of the codebase
|
|
87
|
+
# (see +Tool::Search::DuckDuckGo.search+).
|
|
88
|
+
#
|
|
89
|
+
# All recoverable failures — HTTP 4xx/5xx, +Faraday::Error+ network
|
|
90
|
+
# blips, exhausted redirect budget, 3xx without a +Location+ —
|
|
91
|
+
# surface as {FetchError} so the caller has a single exception type
|
|
92
|
+
# to rescue. Error bodies are trimmed to {ERROR_BODY_EXCERPT}
|
|
93
|
+
# characters with whitespace collapsed, so a Cloudflare-challenge
|
|
94
|
+
# response doesn't dump kilobytes of inline HTML into the next LLM
|
|
95
|
+
# observation.
|
|
96
|
+
#
|
|
97
|
+
# @param url [String] absolute HTTP(S) URL to fetch
|
|
98
|
+
# @param limit [Integer] redirects remaining; recurses with
|
|
99
|
+
# +limit - 1+ on each 3xx
|
|
100
|
+
# @return [Fetched] body, normalized content-type, and final URL
|
|
101
|
+
# after redirects
|
|
102
|
+
# @raise [FetchError] on non-2xx/3xx responses, network errors,
|
|
103
|
+
# redirect-loop exhaustion, or 3xx without a +Location+ header
|
|
104
|
+
# Download the body of +url+, manually following up to
# {MAX_REDIRECTS} redirects. Faraday is configured with no
# middleware so behavior here mirrors the rest of the codebase
# (see +Tool::Search::DuckDuckGo.search+).
#
# All recoverable failures — HTTP 4xx/5xx, +Faraday::Error+ network
# blips, exhausted redirect budget, 3xx without a +Location+ —
# surface as {FetchError} so the caller has a single exception type
# to rescue. Error bodies are trimmed to {ERROR_BODY_EXCERPT}
# characters with whitespace collapsed, so a Cloudflare-challenge
# response doesn't dump kilobytes of inline HTML into the next LLM
# observation.
#
# NOTE(review): the +limit.zero?+ guard fires on entry to the
# recursive call, so the final redirect in the budget raises before
# its target is ever fetched — effectively MAX_REDIRECTS - 1
# redirects are followed end-to-end. Confirm whether this
# off-by-one is intended before changing it.
#
# @param url [String] absolute HTTP(S) URL to fetch
# @param limit [Integer] redirects remaining; recurses with
#   +limit - 1+ on each 3xx
# @return [Fetched] body, normalized content-type, and final URL
#   after redirects
# @raise [FetchError] on non-2xx/3xx responses, network errors,
#   redirect-loop exhaustion, or 3xx without a +Location+ header
def self.fetch(url, limit: MAX_REDIRECTS)
  raise FetchError, "too many redirects fetching #{url}" if limit.zero?

  response = begin
    Faraday.new(request: { open_timeout: OPEN_TIMEOUT, timeout: READ_TIMEOUT }).get(url) do |req|
      req.headers['User-Agent'] = USER_AGENT
      req.headers['Accept'] = ACCEPT
    end
  rescue Faraday::Error => e
    # Keep only the terminal class-name segment so the message stays short.
    raise FetchError, "#{e.class.name.split('::').last} fetching #{url}: #{e.message}"
  end

  case response.status
  when 200..299
    Fetched.new(body: response.body, content_type: normalize_content_type(response.headers['content-type']), url: url)
  when 300..399
    location = response.headers['location']
    raise FetchError, "HTTP #{response.status} from #{url} with no Location header" if location.nil? || location.empty?

    # URI.join resolves relative Location values against the current URL.
    fetch(URI.join(url, location).to_s, limit: limit - 1)
  else
    raise FetchError, "HTTP #{response.status} fetching #{url}: #{excerpt(response.body)}"
  end
end
|
|
128
|
+
|
|
129
|
+
# Route a {Fetched} response to the parser that matches its
|
|
130
|
+
# content-type. Unknown types raise {FetchError} so the LLM gets a
|
|
131
|
+
# legible observation instead of an empty string.
|
|
132
|
+
#
|
|
133
|
+
# @param fetched [Fetched]
|
|
134
|
+
# @return [String] Markdown representation produced by the matched
|
|
135
|
+
# parser
|
|
136
|
+
# @raise [FetchError] when no parser matches the response's
|
|
137
|
+
# content-type
|
|
138
|
+
# Route a {Fetched} response to the parser that matches its
# content-type: HTML/XHTML to {HTML.extract}, PDF to
# {PDF.extract}, any other +text/*+ type straight through
# verbatim. Unknown types raise {FetchError} so the LLM receives a
# legible failure instead of an empty string.
#
# @param fetched [Fetched]
# @return [String] Markdown representation produced by the matched
#   parser
# @raise [FetchError] when no parser matches the response's
#   content-type
def self.dispatch(fetched)
  type = fetched.content_type
  if type == 'text/html' || type == 'application/xhtml+xml'
    HTML.extract(fetched.body)
  elsif type == 'application/pdf'
    PDF.extract(fetched.body)
  elsif type.start_with?('text/')
    # Plain text, Markdown, source files, … — the LLM reads these as-is.
    fetched.body
  else
    raise FetchError, "unsupported content-type #{type.inspect} for #{fetched.url}"
  end
end
|
|
150
|
+
|
|
151
|
+
# Lower-case +raw+ and strip any +; charset=...+ parameters so the
|
|
152
|
+
# dispatcher can match on a canonical token.
|
|
153
|
+
#
|
|
154
|
+
# @param raw [String, nil] raw +Content-Type+ header value
|
|
155
|
+
# @return [String] normalized content-type, or +""+ when the
|
|
156
|
+
# header was missing
|
|
157
|
+
# Reduce a raw +Content-Type+ header to a canonical token the
# dispatcher can match on: lower-cased, with any parameters
# (+; charset=...+ etc.) removed.
#
# @param raw [String, nil] raw +Content-Type+ header value
# @return [String] normalized content-type; +""+ when the header
#   was missing
def self.normalize_content_type(raw)
  media_type = raw.to_s.partition(';').first
  media_type.strip.downcase
end
|
|
160
|
+
private_class_method :normalize_content_type
|
|
161
|
+
|
|
162
|
+
# Whitespace-collapse +body+ and clip to {ERROR_BODY_EXCERPT}
|
|
163
|
+
# characters, so the {FetchError} message stays a single readable
|
|
164
|
+
# line even when the server returned a multi-KB HTML challenge
|
|
165
|
+
# page.
|
|
166
|
+
#
|
|
167
|
+
# @param body [String, nil]
|
|
168
|
+
# @return [String]
|
|
169
|
+
# Collapse all whitespace in +body+ to single spaces and clip the
# result to {ERROR_BODY_EXCERPT} characters, so a {FetchError}
# message stays a single readable line even when the server sent a
# multi-KB HTML challenge page.
#
# @param body [String, nil]
# @return [String]
def self.excerpt(body)
  flattened = body.to_s.gsub(/\s+/, ' ').strip
  return flattened if flattened.length <= ERROR_BODY_EXCERPT

  "#{flattened[0, ERROR_BODY_EXCERPT]}..."
end
|
|
173
|
+
private_class_method :excerpt
|
|
174
|
+
end
|
|
175
|
+
end
|
|
176
|
+
end
|
|
177
|
+
end
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'faraday'
|
|
4
|
+
require 'json'
|
|
5
|
+
require 'nokogiri'
|
|
6
|
+
|
|
7
|
+
module Pikuri
|
|
8
|
+
class Tool
|
|
9
|
+
module Search
|
|
10
|
+
# Performs a Brave Search via the official Web Search API and returns
|
|
11
|
+
# the hits as a list of {Result} rows. Split into a thin HTTP fetch
|
|
12
|
+
# (#search) and a pure parser (#parse) so tests can exercise the
|
|
13
|
+
# parser against fixture JSON without hitting the network. The
|
|
14
|
+
# cascade in {Engines.search} owns the final Markdown rendering.
|
|
15
|
+
#
|
|
16
|
+
# Requires a Brave Search API key. Get one at
|
|
17
|
+
# https://api-dashboard.search.brave.com — the free "Data for Search"
|
|
18
|
+
# tier allows 1 query/sec and ~2k queries/month.
|
|
19
|
+
#
|
|
20
|
+
# == Privacy posture
|
|
21
|
+
#
|
|
22
|
+
# Brave's API Privacy Notice retains Search Query Logs for 90 days
|
|
23
|
+
# (billing / troubleshooting) and states +Brave does not collect any
|
|
24
|
+
# identifiers that can link a search query to an individual or their
|
|
25
|
+
# devices+. Brave publicly commits that the Search API does not use
|
|
26
|
+
# query data to train its own models, and offers Zero Data Retention
|
|
27
|
+
# — but only on the Enterprise plan, not on the free "Data for
|
|
28
|
+
# Search" tier pikuri defaults to.
|
|
29
|
+
#
|
|
30
|
+
# Bottom line: of pikuri's three providers Brave has the cleanest
|
|
31
|
+
# API-level posture — no training-on-queries, no IP linkage, capped
|
|
32
|
+
# 90-day retention by default, real ZDR if you pay for it. Still a
|
|
33
|
+
# logged 90-day window on the cheap tier, so not a substitute for
|
|
34
|
+
# ZDR for genuinely sensitive queries.
|
|
35
|
+
module Brave
|
|
36
|
+
# @return [String] Web Search endpoint
ENDPOINT = 'https://api.search.brave.com/res/v1/web/search'
# @return [Integer] default number of results returned, matching
#   {DuckDuckGo::DEFAULT_MAX_RESULTS}
DEFAULT_MAX_RESULTS = 10
# @return [String] env var holding the API key; sent as the
#   +X-Subscription-Token+ request header
ENV_KEY = 'BRAVE_SEARCH_API_KEY'
# @return [RateLimiter] free-tier Brave caps at 1 req/sec; the
#   5-minute cooldown protects the limited monthly quota from
#   being burned on doomed retries when a 429 hits.
LIMITER = RateLimiter.new(min_interval: 1.0, cooldown: 300.0)
|
|
47
|
+
|
|
48
|
+
# Fetch results for +query+ and return them as an +Array<Result>+.
|
|
49
|
+
# Calls are throttled to one per second and circuit-broken for 5
|
|
50
|
+
# minutes on rate-limit / quota-exhausted responses; see {LIMITER}.
|
|
51
|
+
# The caller (typically {Engines.search}) is expected to have
|
|
52
|
+
# already normalized the query and to wrap this in a result cache.
|
|
53
|
+
#
|
|
54
|
+
# @param query [String] search query (already normalized)
|
|
55
|
+
# @param max_results [Integer] maximum number of result entries;
|
|
56
|
+
# passed through as Brave's +count+ (1..20)
|
|
57
|
+
# @param api_key [String] Brave Search subscription token; defaults to
|
|
58
|
+
# the {ENV_KEY} environment variable
|
|
59
|
+
# @return [Array<Result>] hits, possibly empty when Brave ran the
|
|
60
|
+
# query and matched nothing
|
|
61
|
+
# @raise [ArgumentError] if no API key is available
|
|
62
|
+
# @raise [Engines::Unavailable] when Brave returns HTTP 429
|
|
63
|
+
# (rate limit / quota exhausted) or 5xx — "try again later"
|
|
64
|
+
# responses the cascade in {Engines.search} can fall back
|
|
65
|
+
# from. Also raised immediately if {LIMITER} is in cooldown.
|
|
66
|
+
# Other non-2xx (e.g. 401/403 from a bad API key) bubble up as
|
|
67
|
+
# +RuntimeError+ so config problems stay visible.
|
|
68
|
+
# @raise [RuntimeError] for non-rate-limit HTTP failures or when the
|
|
69
|
+
# response shape contains no results.
|
|
70
|
+
# Fetch results for +query+ and return them as an +Array<Result>+.
# Calls run through {LIMITER}: throttled to one per second and
# circuit-broken for 5 minutes after a rate-limit response. The
# caller (typically {Engines.search}) is expected to have
# normalized the query already and to wrap this in a result cache.
#
# @param query [String] search query (already normalized)
# @param max_results [Integer] maximum number of result entries;
#   forwarded as Brave's +count+ (1..20)
# @param api_key [String] Brave Search subscription token; defaults
#   to the {ENV_KEY} environment variable
# @return [Array<Result>] hits, possibly empty when Brave ran the
#   query and matched nothing
# @raise [ArgumentError] if no API key is available
# @raise [Engines::Unavailable] on HTTP 429/5xx ("try again later"
#   responses the cascade can fall back from), or immediately when
#   {LIMITER} is in cooldown
# @raise [RuntimeError] for other non-2xx statuses (e.g. 401/403
#   from a bad key) so configuration problems stay visible, or when
#   the response shape contains no results
def self.search(query, max_results: DEFAULT_MAX_RESULTS, api_key: ENV.fetch(ENV_KEY, nil))
  raise ArgumentError, "Brave Search API key not set (#{ENV_KEY})" if api_key.to_s.strip.empty?

  LIMITER.call do
    headers = { 'X-Subscription-Token' => api_key, 'Accept' => 'application/json' }
    response = Faraday.get(ENDPOINT, { q: query, count: max_results }, headers)

    if response.success?
      parse(response.body, max_results: max_results)
    elsif response.status == 429 || response.status >= 500
      raise Engines::Unavailable, "HTTP #{response.status}"
    else
      raise "Brave Search request failed: #{response.status} #{response.body}"
    end
  end
end
|
|
90
|
+
|
|
91
|
+
# Parse a Brave Web Search JSON response into a list of {Result} rows.
|
|
92
|
+
# HTML highlight tags (+<strong>+) inside +title+ and +description+
|
|
93
|
+
# are stripped via Nokogiri so the output is plain text.
|
|
94
|
+
#
|
|
95
|
+
# When the response yields zero result nodes, two cases are
|
|
96
|
+
# distinguished: a genuine "no results" payload (recognized search
|
|
97
|
+
# shape with empty +mixed.main+/+top+/+side+ — typically a too-narrow
|
|
98
|
+
# query Brave couldn't match) returns an empty array instead of
|
|
99
|
+
# raising, so {Engines.search} can render its standard no-results
|
|
100
|
+
# stub. Anything else (unknown layout, structured error) raises
|
|
101
|
+
# with a diagnostic so the failure surfaces.
|
|
102
|
+
#
|
|
103
|
+
# @param json [String] response body from {ENDPOINT}
|
|
104
|
+
# @param max_results [Integer] maximum number of result entries
|
|
105
|
+
# @return [Array<Result>] hits, possibly empty on a recognized
|
|
106
|
+
# empty-results payload
|
|
107
|
+
# @raise [RuntimeError] when the response yields no result entries and
|
|
108
|
+
# is not recognized as a genuine empty-results payload
|
|
109
|
+
# Parse a Brave Web Search JSON response into {Result} rows,
# stripping the HTML highlight tags (+<strong>+) Brave wraps around
# query terms via {.strip_html}.
#
# Zero result nodes splits into two cases: a recognized
# empty-results payload (+type: "search"+ envelope with empty
# +mixed.main/top/side+) returns +[]+ so {Engines.search} can render
# its no-results stub; anything else (unknown layout, structured
# error) raises with a diagnostic so the failure surfaces.
#
# @param json [String] response body from {ENDPOINT}
# @param max_results [Integer] maximum number of result entries
# @return [Array<Result>] hits, possibly empty on a recognized
#   empty-results payload
# @raise [RuntimeError] when the response yields no result entries
#   and is not recognized as a genuine empty-results payload
def self.parse(json, max_results: DEFAULT_MAX_RESULTS)
  data = JSON.parse(json)
  hits = Array(data.dig('web', 'results')).take(max_results).filter_map do |entry|
    link = entry['url'].to_s
    next nil if link.empty?

    Result.new(
      url: link,
      title: strip_html(entry['title']),
      body: strip_html(entry['description'])
    )
  end

  return hits unless hits.empty?
  return [] if genuine_no_results?(data)

  raise diagnose_empty(data, json)
end
|
|
130
|
+
|
|
131
|
+
# Strip HTML markup (notably +<strong>+ highlights Brave wraps around
|
|
132
|
+
# query terms in titles and descriptions) and collapse whitespace.
|
|
133
|
+
#
|
|
134
|
+
# @param html [String, nil] raw text from a Brave result field
|
|
135
|
+
# @return [String] plain text with tags removed; empty string for nil
|
|
136
|
+
# Strip HTML markup (notably the +<strong>+ highlights Brave wraps
# around query terms in titles and descriptions) and collapse runs
# of whitespace to single spaces.
#
# @param html [String, nil] raw text from a Brave result field
# @return [String] plain text with tags removed; empty string for nil
def self.strip_html(html)
  return '' if html.nil?

  plain = Nokogiri::HTML.fragment(html).text
  plain.gsub(/\s+/, ' ').strip
end
|
|
141
|
+
private_class_method :strip_html
|
|
142
|
+
|
|
143
|
+
# True when a parsed response with zero +web.results+ entries looks
|
|
144
|
+
# like Brave's own "search ran, nothing matched" payload (typically a
|
|
145
|
+
# too-narrow query) rather than a malformed or error response. The
|
|
146
|
+
# markers are the recognized +type: "search"+ envelope and an empty
|
|
147
|
+
# +mixed+ block — Brave populates +mixed.main/top/side+ with whichever
|
|
148
|
+
# verticals matched, so all three being empty arrays is the canonical
|
|
149
|
+
# signal that the search itself succeeded but found nothing.
|
|
150
|
+
#
|
|
151
|
+
# @param data [Hash, Object] parsed response
|
|
152
|
+
# @return [Boolean]
|
|
153
|
+
# True when a parsed response with zero +web.results+ entries looks
# like Brave's own "search ran, nothing matched" payload rather than
# a malformed or error response. The markers: the recognized
# +type: "search"+ envelope plus an empty +mixed+ block — Brave
# fills +mixed.main/top/side+ with whichever verticals matched, so
# all three being empty is the canonical nothing-found signal.
#
# @param data [Hash, Object] parsed response
# @return [Boolean]
def self.genuine_no_results?(data)
  return false unless data.is_a?(Hash)
  return false unless data['type'] == 'search'

  verticals = data['mixed']
  return false unless verticals.is_a?(Hash)

  ['main', 'top', 'side'].all? { |section| Array(verticals[section]).empty? }
end
|
|
161
|
+
private_class_method :genuine_no_results?
|
|
162
|
+
|
|
163
|
+
# Build an error message for a parsed response that yielded zero
|
|
164
|
+
# results. Quotes Brave's +error.detail+ if present, otherwise
|
|
165
|
+
# truncates the raw body so the caller can see the actual payload.
|
|
166
|
+
#
|
|
167
|
+
# @param data [Hash, Object] parsed response
|
|
168
|
+
# @param raw [String] raw response body
|
|
169
|
+
# @return [String] human-readable diagnostic to feed to +raise+
|
|
170
|
+
# Build an error message for a parsed response that yielded zero
# results: quotes Brave's structured +error+ object when present,
# otherwise includes a truncated copy of the raw body so the caller
# can see the actual payload.
#
# @param data [Hash, Object] parsed response
# @param raw [String] raw response body
# @return [String] human-readable diagnostic to feed to +raise+
def self.diagnose_empty(data, raw)
  error = data.is_a?(Hash) ? data['error'] : nil
  if error.is_a?(Hash)
    parts = [error['code'], error['detail'] || error['meta']].compact
    return "Brave Search returned an error: #{parts.join(' — ')}"
  end

  body = raw.to_s
  snippet = body[0, 800]
  snippet += '…' if body.length > 800
  "Brave Search returned no results. Body: #{snippet}"
end
|
|
180
|
+
private_class_method :diagnose_empty
|
|
181
|
+
end
|
|
182
|
+
end
|
|
183
|
+
end
|
|
184
|
+
end
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'faraday'
|
|
4
|
+
require 'nokogiri'
|
|
5
|
+
require 'uri'
|
|
6
|
+
|
|
7
|
+
module Pikuri
|
|
8
|
+
class Tool
|
|
9
|
+
module Search
|
|
10
|
+
# Performs a DuckDuckGo search by scraping +html.duckduckgo.com+ and
|
|
11
|
+
# returns the hits as a list of {Result} rows. Split into a thin HTTP
|
|
12
|
+
# fetch (#search) and a pure parser (#parse) so tests can exercise
|
|
13
|
+
# the parser against fixture HTML without hitting the network. The
|
|
14
|
+
# cascade in {Engines.search} owns the final Markdown rendering.
|
|
15
|
+
#
|
|
16
|
+
# == Privacy posture
|
|
17
|
+
#
|
|
18
|
+
# DuckDuckGo's privacy policy states +We don't save your IP address
|
|
19
|
+
# or any unique identifiers alongside your searches+ and +We have
|
|
20
|
+
# never sold any personal information+, and they proxy requests on
|
|
21
|
+
# the user's behalf so downstream content providers can't build a
|
|
22
|
+
# per-user search history. That part is real — but DDG is mainly a
|
|
23
|
+
# relay over Bing for web results, so the *query content* still
|
|
24
|
+
# reaches Microsoft for fulfillment even though DDG strips
|
|
25
|
+
# identifying info on the way out.
|
|
26
|
+
#
|
|
27
|
+
# Bottom line: DDG is a genuine privacy improvement over hitting
|
|
28
|
+
# Bing directly (your IP isn't tied to the query, no per-user
|
|
29
|
+
# profile is built on DDG's side), but query content still lands at
|
|
30
|
+
# Microsoft, who has no comparable no-training pledge. Better than
|
|
31
|
+
# Exa for sensitive queries, worse than Brave; for anything
|
|
32
|
+
# genuinely embarrassing, don't search the web at all.
|
|
33
|
+
module DuckDuckGo
|
|
34
|
+
# @return [String] HTML search endpoint
ENDPOINT = 'https://html.duckduckgo.com/html/'
# @return [String] User-Agent sent with each request; DDG often rejects
#   requests with no UA or an obvious bot UA
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \
             '(KHTML, like Gecko) Chrome/120.0 Safari/537.36'
# @return [Integer] default number of results returned, matching smolagents
DEFAULT_MAX_RESULTS = 10
# @return [RateLimiter] paces calls (DDG bans IPs that hammer the HTML
#   endpoint) and circuit-breaks on {Engines::Unavailable} so a
#   soft-block response doesn't get retried for the next 5 minutes
LIMITER = RateLimiter.new(min_interval: 2.0, cooldown: 300.0)
|
|
46
|
+
|
|
47
|
+
# Fetch results for +query+ and return them as an +Array<Result>+.
|
|
48
|
+
# Calls are throttled to one every 2s and circuit-broken for 5 minutes
|
|
49
|
+
# after a soft-block; see {LIMITER}. The caller (typically
|
|
50
|
+
# {Engines.search}) is expected to have already normalized the
|
|
51
|
+
# query and to wrap this in a result cache.
|
|
52
|
+
#
|
|
53
|
+
# @param query [String] search query (already normalized)
|
|
54
|
+
# @param max_results [Integer] maximum number of result entries
|
|
55
|
+
# @return [Array<Result>] hits, possibly empty when DDG ran the
|
|
56
|
+
# query and matched nothing
|
|
57
|
+
# @raise [Engines::Unavailable] when DDG soft-blocks the IP
|
|
58
|
+
# (anomaly/CAPTCHA page) or returns HTTP 429/5xx — i.e. "try again
|
|
59
|
+
# later" responses the cascade in {Engines.search} can fall
|
|
60
|
+
# back from. Also raised immediately if {LIMITER} is in cooldown.
|
|
61
|
+
# @raise [RuntimeError] if the HTTP call fails for other reasons or
|
|
62
|
+
# the empty-results page is in an unrecognized layout. A genuine
|
|
63
|
+
# empty-results page is *not* an error; see {.parse}.
|
|
64
|
+
# Fetch results for +query+ and return them as an +Array<Result>+.
# Calls run through {LIMITER}: one every 2 seconds, with a 5-minute
# circuit break after a soft-block. The caller (typically
# {Engines.search}) normalizes the query beforehand and wraps this
# in a result cache.
#
# @param query [String] search query (already normalized)
# @param max_results [Integer] maximum number of result entries
# @return [Array<Result>] hits, possibly empty when DDG ran the
#   query and matched nothing
# @raise [Engines::Unavailable] on an IP soft-block
#   (anomaly/CAPTCHA page), HTTP 429/5xx, or immediately when
#   {LIMITER} is in cooldown — "try again later" responses the
#   cascade can fall back from
# @raise [RuntimeError] for other HTTP failures or an unrecognized
#   empty-results layout; a genuine empty-results page is *not* an
#   error (see {.parse})
def self.search(query, max_results: DEFAULT_MAX_RESULTS)
  LIMITER.call do
    response = Faraday.get(ENDPOINT, { q: query }, { 'User-Agent' => USER_AGENT })

    if response.success?
      parse(response.body, max_results: max_results)
    elsif response.status == 429 || response.status >= 500
      raise Engines::Unavailable, "HTTP #{response.status}"
    else
      raise "DuckDuckGo request failed: #{response.status} #{response.body}"
    end
  end
end
|
|
78
|
+
|
|
79
|
+
# Parse a +html.duckduckgo.com+ result page into a list of {Result}
|
|
80
|
+
# rows. +<b>+ highlights inside snippets are stripped.
|
|
81
|
+
#
|
|
82
|
+
# When the page has zero result nodes, two cases are distinguished:
|
|
83
|
+
# a genuine "no results" page (narrow query, DDG's own "No results
|
|
84
|
+
# found" indicator) returns an empty array instead of raising, so
|
|
85
|
+
# {Engines.search} can render its standard no-results stub.
|
|
86
|
+
# Anything else (anomaly modal, CAPTCHA, service-unavailable page,
|
|
87
|
+
# unknown layout) raises with the diagnostic text extracted from
|
|
88
|
+
# the body, so an IP soft-block is surfaced rather than silently
|
|
89
|
+
# masquerading as an empty search.
|
|
90
|
+
#
|
|
91
|
+
# @param html [String] HTML document body from html.duckduckgo.com
|
|
92
|
+
# @param max_results [Integer] maximum number of result entries
|
|
93
|
+
# @return [Array<Result>] hits, possibly empty on a genuine
|
|
94
|
+
# no-results page
|
|
95
|
+
# @raise [Engines::Unavailable] when the page is the DDG
|
|
96
|
+
# anomaly/CAPTCHA modal (IP soft-block) — a "try again later" the
|
|
97
|
+
# cascade can fall back from.
|
|
98
|
+
# @raise [RuntimeError] when the page contains no result nodes and is
|
|
99
|
+
# not recognized as either a genuine no-results page or the
|
|
100
|
+
# anomaly modal (likely a layout change worth surfacing loudly).
|
|
101
|
+
# Parse a +html.duckduckgo.com+ result page into {Result} rows
# (+<b>+ highlights inside snippets are stripped by +.text+).
#
# Zero result nodes splits into three outcomes: a genuine
# "no results" page returns +[]+ so {Engines.search} renders its
# standard stub; the anomaly/CAPTCHA modal (IP soft-block) raises
# {Engines::Unavailable} so the cascade can fall back; any other
# layout raises +RuntimeError+ with the extracted page text so a
# markup change surfaces loudly.
#
# @param html [String] HTML document body from html.duckduckgo.com
# @param max_results [Integer] maximum number of result entries
# @return [Array<Result>] hits, possibly empty on a genuine
#   no-results page
# @raise [Engines::Unavailable] when the page is the DDG
#   anomaly/CAPTCHA modal
# @raise [RuntimeError] when the page has no result nodes and is
#   recognized as neither a no-results page nor the anomaly modal
def self.parse(html, max_results: DEFAULT_MAX_RESULTS)
  doc = Nokogiri::HTML(html)
  hits = doc.css('div.result.web-result').take(max_results).filter_map do |row|
    anchor = row.at_css('a.result__a')
    next nil if anchor.nil?

    blurb = row.at_css('a.result__snippet')
    Result.new(
      url: extract_url(anchor['href']),
      title: anchor.text.strip,
      body: blurb&.text&.strip.to_s
    )
  end

  return hits unless hits.empty?
  return [] if genuine_no_results?(doc)

  error_class = anomaly_modal?(doc) ? Engines::Unavailable : RuntimeError
  raise error_class, diagnose_empty(doc)
end
|
|
124
|
+
|
|
125
|
+
# True when the page contains DDG's anomaly/CAPTCHA modal — i.e. the
|
|
126
|
+
# IP has been soft-blocked. Used by {.parse} to pick between
|
|
127
|
+
# {Engines::Unavailable} (recoverable, fall back to another
|
|
128
|
+
# provider) and {RuntimeError} (unknown layout, surface loudly).
|
|
129
|
+
#
|
|
130
|
+
# @param doc [Nokogiri::HTML::Document] parsed result page
|
|
131
|
+
# @return [Boolean]
|
|
132
|
+
# True when the page carries DDG's anomaly/CAPTCHA modal — the IP
# has been soft-blocked. {.parse} uses this to choose between
# {Engines::Unavailable} (recoverable; fall back to another
# provider) and {RuntimeError} (unknown layout; surface loudly).
#
# @param doc [Nokogiri::HTML::Document] parsed result page
# @return [Boolean]
def self.anomaly_modal?(doc)
  modal = doc.at_css('.anomaly-modal__title') || doc.at_css('.anomaly-modal__description')
  !!modal
end
|
|
135
|
+
private_class_method :anomaly_modal?
|
|
136
|
+
|
|
137
|
+
# True when a results page with zero result nodes looks like DDG's own
|
|
138
|
+
# "no results found" page (narrow query) rather than an anomaly/CAPTCHA
|
|
139
|
+
# or other non-results layout. Anomaly modal wins: if the modal divs
|
|
140
|
+
# are present we never treat the page as a genuine empty result, even
|
|
141
|
+
# if the surrounding text happens to mention "No results".
|
|
142
|
+
#
|
|
143
|
+
# @param doc [Nokogiri::HTML::Document] parsed result page
|
|
144
|
+
# @return [Boolean]
|
|
145
|
+
# True when a page with zero result nodes looks like DDG's own
# "no results found" page (narrow query) rather than an
# anomaly/CAPTCHA or other non-results layout. The anomaly modal
# wins: when the modal divs are present the page is never treated
# as a genuine empty result, even if the surrounding text happens
# to mention "No results".
#
# @param doc [Nokogiri::HTML::Document] parsed result page
# @return [Boolean]
def self.genuine_no_results?(doc)
  blocked = doc.at_css('.anomaly-modal__title') || doc.at_css('.anomaly-modal__description')
  return false if blocked

  if doc.at_css('div.no-results')
    true
  else
    doc.text.include?('No results found')
  end
end
|
|
151
|
+
private_class_method :genuine_no_results?
|
|
152
|
+
|
|
153
|
+
# Build an error message for a results page that yielded zero matches.
|
|
154
|
+
# Recognizes the DDG anomaly modal explicitly and quotes its title and
|
|
155
|
+
# description; otherwise extracts visible text from the body (with
|
|
156
|
+
# +<script>+/+<style>+/+<noscript>+ stripped and whitespace collapsed)
|
|
157
|
+
# and includes a truncated copy so the caller can see what came back.
|
|
158
|
+
#
|
|
159
|
+
# @param doc [Nokogiri::HTML::Document] parsed result page
|
|
160
|
+
# @return [String] human-readable diagnostic to feed to +raise+
|
|
161
|
+
# Build an error message for a result page that yielded zero
# matches. Recognizes the DDG anomaly modal explicitly and quotes
# its title/description; otherwise extracts visible text (with
# +<script>+/+<style>+/+<noscript>+ removed and whitespace
# collapsed) and includes a truncated copy so the caller can see
# what actually came back.
#
# @param doc [Nokogiri::HTML::Document] parsed result page
# @return [String] human-readable diagnostic to feed to +raise+
def self.diagnose_empty(doc)
  modal_title = doc.at_css('.anomaly-modal__title')&.text&.strip
  modal_desc = doc.at_css('.anomaly-modal__description')&.text&.strip
  if modal_title || modal_desc
    detail = [modal_title, modal_desc].compact.reject(&:empty?).join(' — ')
    return "DuckDuckGo anomaly check (likely IP soft-block): #{detail}"
  end

  doc.css('script, style, noscript').remove
  visible = doc.text.gsub(/\s+/, ' ').strip
  if visible.empty?
    'DuckDuckGo returned no results. Page text: <empty body>'
  else
    clipped = visible[0, 1500]
    clipped += '…' if visible.length > 1500
    "DuckDuckGo returned no results. Page text: #{clipped}"
  end
end
|
|
174
|
+
private_class_method :diagnose_empty
|
|
175
|
+
|
|
176
|
+
# Decode DuckDuckGo's +//duckduckgo.com/l/?uddg=<encoded>+ redirect wrapper
|
|
177
|
+
# back to the real target URL.
|
|
178
|
+
#
|
|
179
|
+
# @param href [String, nil] href as found on the search-result page
|
|
180
|
+
# @return [String] the decoded target URL, or +href+ unchanged when it is
|
|
181
|
+
# not a recognized DDG redirect or cannot be parsed
|
|
182
|
+
# Decode DuckDuckGo's +//duckduckgo.com/l/?uddg=<encoded>+ redirect
# wrapper back to the real target URL.
#
# @param href [String, nil] href as found on the search-result page
# @return [String] the decoded target URL, or +href+ unchanged when
#   it is not a recognized DDG redirect or cannot be parsed
def self.extract_url(href)
  return href if href.nil? || href.empty?

  # Protocol-relative hrefs need a scheme before URI.parse sees a host.
  absolute = href.start_with?('//') ? "https:#{href}" : href
  uri = URI.parse(absolute)
  ddg_redirect = uri.host&.end_with?('duckduckgo.com') && uri.path == '/l/'
  return href unless ddg_redirect

  target = URI.decode_www_form(uri.query.to_s).to_h['uddg']
  target || href
rescue URI::InvalidURIError
  href
end
|
|
193
|
+
end
|
|
194
|
+
end
|
|
195
|
+
end
|
|
196
|
+
end
|