pikuri 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +62 -0
  3. data/GETTING_STARTED.md +223 -0
  4. data/LICENSE +21 -0
  5. data/README.md +193 -0
  6. data/lib/pikuri/agent/chat_transport.rb +41 -0
  7. data/lib/pikuri/agent/context_window_detector.rb +101 -0
  8. data/lib/pikuri/agent/listener/in_memory_message_list.rb +33 -0
  9. data/lib/pikuri/agent/listener/message_listener.rb +93 -0
  10. data/lib/pikuri/agent/listener/step_limit.rb +97 -0
  11. data/lib/pikuri/agent/listener/terminal.rb +137 -0
  12. data/lib/pikuri/agent/listener/token_log.rb +166 -0
  13. data/lib/pikuri/agent/listener_list.rb +113 -0
  14. data/lib/pikuri/agent/message.rb +61 -0
  15. data/lib/pikuri/agent/synthesizer.rb +120 -0
  16. data/lib/pikuri/agent/tokens.rb +56 -0
  17. data/lib/pikuri/agent.rb +286 -0
  18. data/lib/pikuri/subprocess.rb +166 -0
  19. data/lib/pikuri/tool/bash.rb +272 -0
  20. data/lib/pikuri/tool/calculator.rb +82 -0
  21. data/lib/pikuri/tool/confirmer.rb +96 -0
  22. data/lib/pikuri/tool/edit.rb +196 -0
  23. data/lib/pikuri/tool/fetch.rb +167 -0
  24. data/lib/pikuri/tool/glob.rb +310 -0
  25. data/lib/pikuri/tool/grep.rb +338 -0
  26. data/lib/pikuri/tool/parameters.rb +314 -0
  27. data/lib/pikuri/tool/read.rb +254 -0
  28. data/lib/pikuri/tool/scraper/fetch_error.rb +16 -0
  29. data/lib/pikuri/tool/scraper/html.rb +285 -0
  30. data/lib/pikuri/tool/scraper/pdf.rb +54 -0
  31. data/lib/pikuri/tool/scraper/simple.rb +177 -0
  32. data/lib/pikuri/tool/search/brave.rb +184 -0
  33. data/lib/pikuri/tool/search/duckduckgo.rb +196 -0
  34. data/lib/pikuri/tool/search/engines.rb +154 -0
  35. data/lib/pikuri/tool/search/exa.rb +217 -0
  36. data/lib/pikuri/tool/search/rate_limiter.rb +92 -0
  37. data/lib/pikuri/tool/search/result.rb +29 -0
  38. data/lib/pikuri/tool/skill.rb +80 -0
  39. data/lib/pikuri/tool/skill_catalog.rb +376 -0
  40. data/lib/pikuri/tool/sub_agent.rb +102 -0
  41. data/lib/pikuri/tool/web_scrape.rb +117 -0
  42. data/lib/pikuri/tool/web_search.rb +38 -0
  43. data/lib/pikuri/tool/workspace.rb +150 -0
  44. data/lib/pikuri/tool/write.rb +170 -0
  45. data/lib/pikuri/tool.rb +118 -0
  46. data/lib/pikuri/url_cache.rb +106 -0
  47. data/lib/pikuri/version.rb +10 -0
  48. data/lib/pikuri.rb +165 -0
  49. data/prompts/coding-system-prompt.txt +28 -0
  50. data/prompts/pikuri-chat.txt +15 -0
  51. metadata +259 -0
data/lib/pikuri/tool/scraper/simple.rb
@@ -0,0 +1,177 @@
+# frozen_string_literal: true
+
+require 'faraday'
+require 'uri'
+
+module Pikuri
+  class Tool
+    module Scraper
+      # Plain HTTP scraper: GET the URL with a real-browser User-Agent,
+      # follow redirects, and dispatch the response body to the parser
+      # matching its +Content-Type+. HTML and XHTML route to
+      # {HTML.extract}; +application/pdf+ routes to {PDF.extract}; any
+      # other +text/*+ type (plain text, Markdown, source files, …) is
+      # passed through verbatim since the LLM can already read it; the
+      # remaining types raise {FetchError} so the LLM observes the
+      # failure instead of receiving an empty rendering.
+      #
+      # Split into a thin HTTP fetch ({.fetch}) and a content-type
+      # dispatcher ({.dispatch}) so tests can drive each piece in isolation.
+      # "Simple" because everything happens in one Faraday GET — no
+      # headless browser, no JS execution.
+      module Simple
+        # @return [String] User-Agent sent with each request; many sites
+        #   reject requests with no UA or an obvious bot UA
+        USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \
+                     '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+        # @return [String] +Accept+ header sent with each request. Lists
+        #   every content-type the dispatcher in {.visit} knows how to
+        #   render, so servers that content-negotiate hand back something
+        #   we can use. The trailing +text/*;q=0.8+ covers the verbatim
+        #   pass-through arm (plain text, Markdown, source files, …) at a
+        #   lower preference than rendered HTML/PDF.
+        ACCEPT = 'text/html,application/xhtml+xml,application/pdf,text/*;q=0.8'
+        # @return [Integer] maximum number of HTTP redirects to follow
+        #   before giving up
+        MAX_REDIRECTS = 5
+        # @return [Integer] connect timeout in seconds for the underlying
+        #   Faraday request
+        OPEN_TIMEOUT = 10
+        # @return [Integer] read timeout in seconds for the underlying
+        #   Faraday request
+        READ_TIMEOUT = 20
+
+        # @return [Integer] maximum number of characters of an error
+        #   response body to include in a {FetchError} message. The body is
+        #   often a multi-kilobyte HTML challenge page (Cloudflare, WAF
+        #   interstitial, etc.); a short excerpt tells the LLM what kind of
+        #   page came back without flooding the next observation.
+        ERROR_BODY_EXCERPT = 200
+
+        # Result of a successful {Simple.fetch}: the response body, the
+        # normalized content-type (lower-cased, with any +; charset=...+
+        # parameters stripped), and the final URL after redirects. The
+        # final URL is kept so future scrapers can resolve relative links
+        # against the actual landing page rather than the originally
+        # requested one.
+        Fetched = Data.define(:body, :content_type, :url)
+
+        # Fetch +url+ and render its main content as Markdown.
+        #
+        # No caching here — every call hits the network. Callers that want
+        # to memoize results should wrap this method themselves (see
+        # {Tool::WebScrape.visit}, which does exactly that).
+        #
+        # The dispatcher's output is +String#strip+'d so the LLM never
+        # sees a body that opens or closes with blank lines — common with
+        # +pdf-reader+'s page-feed whitespace and with text bodies that
+        # carry a trailing newline. Interior whitespace is preserved
+        # because Markdown paragraph breaks and source-code indentation
+        # are load-bearing.
+        #
+        # @param url [String] absolute HTTP(S) URL of the page to download
+        # @return [String] full Markdown representation of the page with
+        #   leading/trailing whitespace trimmed, uncapped otherwise —
+        #   caller is responsible for any size limiting before feeding
+        #   the result back to the LLM
+        # @raise [FetchError] on HTTP non-2xx, network failure, redirect
+        #   loop, a 3xx without a +Location+ header, or a response whose
+        #   content-type the dispatcher does not recognize
+        def self.visit(url)
+          dispatch(fetch(url)).strip
+        end
+
+        # Download the body of +url+, manually following up to
+        # {MAX_REDIRECTS} redirects. Faraday is configured with no
+        # middleware so behavior here mirrors the rest of the codebase
+        # (see +Tool::Search::DuckDuckGo.search+).
+        #
+        # All recoverable failures — HTTP 4xx/5xx, +Faraday::Error+ network
+        # blips, exhausted redirect budget, 3xx without a +Location+ —
+        # surface as {FetchError} so the caller has a single exception type
+        # to rescue. Error bodies are trimmed to {ERROR_BODY_EXCERPT}
+        # characters with whitespace collapsed, so a Cloudflare-challenge
+        # response doesn't dump kilobytes of inline HTML into the next LLM
+        # observation.
+        #
+        # @param url [String] absolute HTTP(S) URL to fetch
+        # @param limit [Integer] redirects remaining; recurses with
+        #   +limit - 1+ on each 3xx
+        # @return [Fetched] body, normalized content-type, and final URL
+        #   after redirects
+        # @raise [FetchError] on non-2xx/3xx responses, network errors,
+        #   redirect-loop exhaustion, or 3xx without a +Location+ header
+        def self.fetch(url, limit: MAX_REDIRECTS)
+          raise FetchError, "too many redirects fetching #{url}" if limit.zero?
+
+          response = begin
+            Faraday.new(request: { open_timeout: OPEN_TIMEOUT, timeout: READ_TIMEOUT }).get(url) do |req|
+              req.headers['User-Agent'] = USER_AGENT
+              req.headers['Accept'] = ACCEPT
+            end
+          rescue Faraday::Error => e
+            raise FetchError, "#{e.class.name.split('::').last} fetching #{url}: #{e.message}"
+          end
+
+          case response.status
+          when 200..299
+            Fetched.new(body: response.body, content_type: normalize_content_type(response.headers['content-type']), url: url)
+          when 300..399
+            location = response.headers['location']
+            raise FetchError, "HTTP #{response.status} from #{url} with no Location header" if location.nil? || location.empty?
+
+            fetch(URI.join(url, location).to_s, limit: limit - 1)
+          else
+            raise FetchError, "HTTP #{response.status} fetching #{url}: #{excerpt(response.body)}"
+          end
+        end
+
+        # Route a {Fetched} response to the parser that matches its
+        # content-type. Unknown types raise {FetchError} so the LLM gets a
+        # legible observation instead of an empty string.
+        #
+        # @param fetched [Fetched]
+        # @return [String] Markdown representation produced by the matched
+        #   parser
+        # @raise [FetchError] when no parser matches the response's
+        #   content-type
+        def self.dispatch(fetched)
+          case fetched.content_type
+          when 'text/html', 'application/xhtml+xml'
+            HTML.extract(fetched.body)
+          when 'application/pdf'
+            PDF.extract(fetched.body)
+          when %r{\Atext/}
+            fetched.body
+          else
+            raise FetchError, "unsupported content-type #{fetched.content_type.inspect} for #{fetched.url}"
+          end
+        end
+
+        # Lower-case +raw+ and strip any +; charset=...+ parameters so the
+        # dispatcher can match on a canonical token.
+        #
+        # @param raw [String, nil] raw +Content-Type+ header value
+        # @return [String] normalized content-type, or +""+ when the
+        #   header was missing
+        def self.normalize_content_type(raw)
+          raw.to_s.split(';').first.to_s.strip.downcase
+        end
+        private_class_method :normalize_content_type
+
+        # Whitespace-collapse +body+ and clip to {ERROR_BODY_EXCERPT}
+        # characters, so the {FetchError} message stays a single readable
+        # line even when the server returned a multi-KB HTML challenge
+        # page.
+        #
+        # @param body [String, nil]
+        # @return [String]
+        def self.excerpt(body)
+          text = body.to_s.gsub(/\s+/, ' ').strip
+          text.length > ERROR_BODY_EXCERPT ? "#{text[0, ERROR_BODY_EXCERPT]}..." : text
+        end
+        private_class_method :excerpt
+      end
+    end
+  end
+end
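
For orientation, a minimal usage sketch of the scraper above (not part of the package diff). It assumes the gem's entry point is required as 'pikuri' (per data/lib/pikuri.rb in the file list), that the URL is illustrative, and that FetchError is the class defined in data/lib/pikuri/tool/scraper/fetch_error.rb:

    require 'pikuri'

    # One call runs the whole pipeline: GET with browser-like headers,
    # follow up to MAX_REDIRECTS redirects, dispatch on content-type,
    # strip the rendered output.
    begin
      markdown = Pikuri::Tool::Scraper::Simple.visit('https://example.com/article')
      puts markdown[0, 500] # the caller owns any size capping
    rescue Pikuri::Tool::Scraper::FetchError => e
      # Network errors, non-2xx statuses, redirect loops, and unknown
      # content-types all surface as this one exception type.
      warn "scrape failed: #{e.message}"
    end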
data/lib/pikuri/tool/search/brave.rb
@@ -0,0 +1,184 @@
+# frozen_string_literal: true
+
+require 'faraday'
+require 'json'
+require 'nokogiri'
+
+module Pikuri
+  class Tool
+    module Search
+      # Performs a Brave Search via the official Web Search API and returns
+      # the hits as a list of {Result} rows. Split into a thin HTTP fetch
+      # (#search) and a pure parser (#parse) so tests can exercise the
+      # parser against fixture JSON without hitting the network. The
+      # cascade in {Engines.search} owns the final Markdown rendering.
+      #
+      # Requires a Brave Search API key. Get one at
+      # https://api-dashboard.search.brave.com — the free "Data for Search"
+      # tier allows 1 query/sec and ~2k queries/month.
+      #
+      # == Privacy posture
+      #
+      # Brave's API Privacy Notice retains Search Query Logs for 90 days
+      # (billing / troubleshooting) and states +Brave does not collect any
+      # identifiers that can link a search query to an individual or their
+      # devices+. Brave publicly commits that the Search API does not use
+      # query data to train its own models, and offers Zero Data Retention
+      # — but only on the Enterprise plan, not on the free "Data for
+      # Search" tier pikuri defaults to.
+      #
+      # Bottom line: of pikuri's three providers Brave has the cleanest
+      # API-level posture — no training-on-queries, no IP linkage, capped
+      # 90-day retention by default, real ZDR if you pay for it. Still a
+      # logged 90-day window on the cheap tier, so not a substitute for
+      # ZDR for genuinely sensitive queries.
+      module Brave
+        # @return [String] Web Search endpoint
+        ENDPOINT = 'https://api.search.brave.com/res/v1/web/search'
+        # @return [Integer] default number of results returned, matching
+        #   {DuckDuckGo::DEFAULT_MAX_RESULTS}
+        DEFAULT_MAX_RESULTS = 10
+        # @return [String] env var holding the API key; sent as the +X-Subscription-Token+ header
+        ENV_KEY = 'BRAVE_SEARCH_API_KEY'
+        # @return [RateLimiter] free-tier Brave caps at 1 req/sec; the
+        #   5-minute cooldown protects the limited monthly quota from
+        #   being burned on doomed retries when a 429 hits.
+        LIMITER = RateLimiter.new(min_interval: 1.0, cooldown: 300.0)
+
+        # Fetch results for +query+ and return them as an +Array<Result>+.
+        # Calls are throttled to one per second and circuit-broken for 5
+        # minutes on rate-limit / quota-exhausted responses; see {LIMITER}.
+        # The caller (typically {Engines.search}) is expected to have
+        # already normalized the query and to wrap this in a result cache.
+        #
+        # @param query [String] search query (already normalized)
+        # @param max_results [Integer] maximum number of result entries;
+        #   passed through as Brave's +count+ (1..20)
+        # @param api_key [String] Brave Search subscription token; defaults to
+        #   the {ENV_KEY} environment variable
+        # @return [Array<Result>] hits, possibly empty when Brave ran the
+        #   query and matched nothing
+        # @raise [ArgumentError] if no API key is available
+        # @raise [Engines::Unavailable] when Brave returns HTTP 429
+        #   (rate limit / quota exhausted) or 5xx — "try again later"
+        #   responses the cascade in {Engines.search} can fall back
+        #   from. Also raised immediately if {LIMITER} is in cooldown.
+        #   Other non-2xx (e.g. 401/403 from a bad API key) bubble up as
+        #   +RuntimeError+ so config problems stay visible.
+        # @raise [RuntimeError] for non-rate-limit HTTP failures or when the
+        #   response shape contains no results.
+        def self.search(query, max_results: DEFAULT_MAX_RESULTS, api_key: ENV.fetch(ENV_KEY, nil))
+          raise ArgumentError, "Brave Search API key not set (#{ENV_KEY})" if api_key.to_s.strip.empty?
+
+          LIMITER.call do
+            response = Faraday.get(
+              ENDPOINT,
+              { q: query, count: max_results },
+              { 'X-Subscription-Token' => api_key, 'Accept' => 'application/json' }
+            )
+            unless response.success?
+              if response.status == 429 || response.status >= 500
+                raise Engines::Unavailable, "HTTP #{response.status}"
+              end
+
+              raise "Brave Search request failed: #{response.status} #{response.body}"
+            end
+
+            parse(response.body, max_results: max_results)
+          end
+        end
+
+        # Parse a Brave Web Search JSON response into a list of {Result} rows.
+        # HTML highlight tags (+<strong>+) inside +title+ and +description+
+        # are stripped via Nokogiri so the output is plain text.
+        #
+        # When the response yields zero result nodes, two cases are
+        # distinguished: a genuine "no results" payload (recognized search
+        # shape with empty +mixed.main+/+top+/+side+ — typically a too-narrow
+        # query Brave couldn't match) returns an empty array instead of
+        # raising, so {Engines.search} can render its standard no-results
+        # stub. Anything else (unknown layout, structured error) raises
+        # with a diagnostic so the failure surfaces.
+        #
+        # @param json [String] response body from {ENDPOINT}
+        # @param max_results [Integer] maximum number of result entries
+        # @return [Array<Result>] hits, possibly empty on a recognized
+        #   empty-results payload
+        # @raise [RuntimeError] when the response yields no result entries and
+        #   is not recognized as a genuine empty-results payload
+        def self.parse(json, max_results: DEFAULT_MAX_RESULTS)
+          data = JSON.parse(json)
+          results = Array(data.dig('web', 'results')).take(max_results).filter_map do |r|
+            href = r['url'].to_s
+            next nil if href.empty?
+
+            Result.new(
+              url: href,
+              title: strip_html(r['title']),
+              body: strip_html(r['description'])
+            )
+          end
+
+          if results.empty?
+            return [] if genuine_no_results?(data)
+
+            raise diagnose_empty(data, json)
+          end
+
+          results
+        end
+
+        # Strip HTML markup (notably +<strong>+ highlights Brave wraps around
+        # query terms in titles and descriptions) and collapse whitespace.
+        #
+        # @param html [String, nil] raw text from a Brave result field
+        # @return [String] plain text with tags removed; empty string for nil
+        def self.strip_html(html)
+          return '' if html.nil?
+
+          Nokogiri::HTML.fragment(html).text.gsub(/\s+/, ' ').strip
+        end
+        private_class_method :strip_html
+
+        # True when a parsed response with zero +web.results+ entries looks
+        # like Brave's own "search ran, nothing matched" payload (typically a
+        # too-narrow query) rather than a malformed or error response. The
+        # markers are the recognized +type: "search"+ envelope and an empty
+        # +mixed+ block — Brave populates +mixed.main/top/side+ with whichever
+        # verticals matched, so all three being empty arrays is the canonical
+        # signal that the search itself succeeded but found nothing.
+        #
+        # @param data [Hash, Object] parsed response
+        # @return [Boolean]
+        def self.genuine_no_results?(data)
+          return false unless data.is_a?(Hash) && data['type'] == 'search'
+
+          mixed = data['mixed']
+          return false unless mixed.is_a?(Hash)
+
+          %w[main top side].all? { |k| Array(mixed[k]).empty? }
+        end
+        private_class_method :genuine_no_results?
+
+        # Build an error message for a parsed response that yielded zero
+        # results. Quotes Brave's +error.detail+ if present, otherwise
+        # truncates the raw body so the caller can see the actual payload.
+        #
+        # @param data [Hash, Object] parsed response
+        # @param raw [String] raw response body
+        # @return [String] human-readable diagnostic to feed to +raise+
+        def self.diagnose_empty(data, raw)
+          if data.is_a?(Hash) && data['error'].is_a?(Hash)
+            err = data['error']
+            return "Brave Search returned an error: #{[err['code'], err['detail'] || err['meta']].compact.join(' — ')}"
+          end
+
+          snippet = raw.to_s[0, 800]
+          snippet += '…' if raw.to_s.length > 800
+          "Brave Search returned no results. Body: #{snippet}"
+        end
+        private_class_method :diagnose_empty
+      end
+    end
+  end
+end
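
A hedged usage sketch for the Brave provider above; the query is made up, and the rescue mirrors how the cascade in data/lib/pikuri/tool/search/engines.rb is described as falling back on Engines::Unavailable. Result is assumed to expose url/title/body readers, matching the keyword construction shown in parse:

    require 'pikuri'

    # Assumes BRAVE_SEARCH_API_KEY is exported; with no key, search raises
    # ArgumentError before any HTTP request is made.
    begin
      results = Pikuri::Tool::Search::Brave.search('faraday redirect handling', max_results: 5)
      results.each { |r| puts "#{r.title} - #{r.url}" }
    rescue Pikuri::Tool::Search::Engines::Unavailable => e
      # HTTP 429/5xx or an open LIMITER cooldown; a cascade caller would
      # try the next engine rather than retry Brave inside the window.
      warn "Brave unavailable: #{e.message}"
    end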
data/lib/pikuri/tool/search/duckduckgo.rb
@@ -0,0 +1,196 @@
+# frozen_string_literal: true
+
+require 'faraday'
+require 'nokogiri'
+require 'uri'
+
+module Pikuri
+  class Tool
+    module Search
+      # Performs a DuckDuckGo search by scraping +html.duckduckgo.com+ and
+      # returns the hits as a list of {Result} rows. Split into a thin HTTP
+      # fetch (#search) and a pure parser (#parse) so tests can exercise
+      # the parser against fixture HTML without hitting the network. The
+      # cascade in {Engines.search} owns the final Markdown rendering.
+      #
+      # == Privacy posture
+      #
+      # DuckDuckGo's privacy policy states +We don't save your IP address
+      # or any unique identifiers alongside your searches+ and +We have
+      # never sold any personal information+, and they proxy requests on
+      # the user's behalf so downstream content providers can't build a
+      # per-user search history. That part is real — but DDG is mainly a
+      # relay over Bing for web results, so the *query content* still
+      # reaches Microsoft for fulfillment even though DDG strips
+      # identifying info on the way out.
+      #
+      # Bottom line: DDG is a genuine privacy improvement over hitting
+      # Bing directly (your IP isn't tied to the query, no per-user
+      # profile is built on DDG's side), but query content still lands at
+      # Microsoft, which has no comparable no-training pledge. Better than
+      # Exa for sensitive queries, worse than Brave; for anything
+      # genuinely embarrassing, don't search the web at all.
+      module DuckDuckGo
+        # @return [String] HTML search endpoint
+        ENDPOINT = 'https://html.duckduckgo.com/html/'
+        # @return [String] User-Agent sent with each request; DDG often rejects
+        #   requests with no UA or an obvious bot UA
+        USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \
+                     '(KHTML, like Gecko) Chrome/120.0 Safari/537.36'
+        # @return [Integer] default number of results returned, matching smolagents
+        DEFAULT_MAX_RESULTS = 10
+        # @return [RateLimiter] paces calls (DDG bans IPs that hammer the HTML
+        #   endpoint) and circuit-breaks on {Engines::Unavailable} so a
+        #   soft-block response doesn't get retried for the next 5 minutes
+        LIMITER = RateLimiter.new(min_interval: 2.0, cooldown: 300.0)
+
+        # Fetch results for +query+ and return them as an +Array<Result>+.
+        # Calls are throttled to one every 2s and circuit-broken for 5 minutes
+        # after a soft-block; see {LIMITER}. The caller (typically
+        # {Engines.search}) is expected to have already normalized the
+        # query and to wrap this in a result cache.
+        #
+        # @param query [String] search query (already normalized)
+        # @param max_results [Integer] maximum number of result entries
+        # @return [Array<Result>] hits, possibly empty when DDG ran the
+        #   query and matched nothing
+        # @raise [Engines::Unavailable] when DDG soft-blocks the IP
+        #   (anomaly/CAPTCHA page) or returns HTTP 429/5xx — i.e. "try again
+        #   later" responses the cascade in {Engines.search} can fall
+        #   back from. Also raised immediately if {LIMITER} is in cooldown.
+        # @raise [RuntimeError] if the HTTP call fails for other reasons or
+        #   the empty-results page is in an unrecognized layout. A genuine
+        #   empty-results page is *not* an error; see {.parse}.
+        def self.search(query, max_results: DEFAULT_MAX_RESULTS)
+          LIMITER.call do
+            response = Faraday.get(ENDPOINT, { q: query }, { 'User-Agent' => USER_AGENT })
+            unless response.success?
+              if response.status == 429 || response.status >= 500
+                raise Engines::Unavailable, "HTTP #{response.status}"
+              end
+
+              raise "DuckDuckGo request failed: #{response.status} #{response.body}"
+            end
+
+            parse(response.body, max_results: max_results)
+          end
+        end
+
+        # Parse a +html.duckduckgo.com+ result page into a list of {Result}
+        # rows. +<b>+ highlights inside snippets are stripped.
+        #
+        # When the page has zero result nodes, two cases are distinguished:
+        # a genuine "no results" page (narrow query, DDG's own "No results
+        # found" indicator) returns an empty array instead of raising, so
+        # {Engines.search} can render its standard no-results stub.
+        # Anything else (anomaly modal, CAPTCHA, service-unavailable page,
+        # unknown layout) raises with the diagnostic text extracted from
+        # the body, so an IP soft-block is surfaced rather than silently
+        # masquerading as an empty search.
+        #
+        # @param html [String] HTML document body from html.duckduckgo.com
+        # @param max_results [Integer] maximum number of result entries
+        # @return [Array<Result>] hits, possibly empty on a genuine
+        #   no-results page
+        # @raise [Engines::Unavailable] when the page is the DDG
+        #   anomaly/CAPTCHA modal (IP soft-block) — a "try again later" the
+        #   cascade can fall back from.
+        # @raise [RuntimeError] when the page contains no result nodes and is
+        #   not recognized as either a genuine no-results page or the
+        #   anomaly modal (likely a layout change worth surfacing loudly).
+        def self.parse(html, max_results: DEFAULT_MAX_RESULTS)
+          doc = Nokogiri::HTML(html)
+          results = doc.css('div.result.web-result').take(max_results).filter_map do |node|
+            title_link = node.at_css('a.result__a')
+            next nil if title_link.nil?
+
+            snippet = node.at_css('a.result__snippet')
+            Result.new(
+              url: extract_url(title_link['href']),
+              title: title_link.text.strip,
+              body: snippet&.text&.strip.to_s
+            )
+          end
+
+          if results.empty?
+            return [] if genuine_no_results?(doc)
+
+            message = diagnose_empty(doc)
+            raise(anomaly_modal?(doc) ? Engines::Unavailable : RuntimeError, message)
+          end
+
+          results
+        end
+
+        # True when the page contains DDG's anomaly/CAPTCHA modal — i.e. the
+        # IP has been soft-blocked. Used by {.parse} to pick between
+        # {Engines::Unavailable} (recoverable, fall back to another
+        # provider) and {RuntimeError} (unknown layout, surface loudly).
+        #
+        # @param doc [Nokogiri::HTML::Document] parsed result page
+        # @return [Boolean]
+        def self.anomaly_modal?(doc)
+          !!(doc.at_css('.anomaly-modal__title') || doc.at_css('.anomaly-modal__description'))
+        end
+        private_class_method :anomaly_modal?
+
+        # True when a results page with zero result nodes looks like DDG's own
+        # "no results found" page (narrow query) rather than an anomaly/CAPTCHA
+        # or other non-results layout. Anomaly modal wins: if the modal divs
+        # are present we never treat the page as a genuine empty result, even
+        # if the surrounding text happens to mention "No results".
+        #
+        # @param doc [Nokogiri::HTML::Document] parsed result page
+        # @return [Boolean]
+        def self.genuine_no_results?(doc)
+          return false if doc.at_css('.anomaly-modal__title') || doc.at_css('.anomaly-modal__description')
+          return true if doc.at_css('div.no-results')
+
+          doc.text.include?('No results found')
+        end
+        private_class_method :genuine_no_results?
+
+        # Build an error message for a results page that yielded zero matches.
+        # Recognizes the DDG anomaly modal explicitly and quotes its title and
+        # description; otherwise extracts visible text from the body (with
+        # +<script>+/+<style>+/+<noscript>+ stripped and whitespace collapsed)
+        # and includes a truncated copy so the caller can see what came back.
+        #
+        # @param doc [Nokogiri::HTML::Document] parsed result page
+        # @return [String] human-readable diagnostic to feed to +raise+
+        def self.diagnose_empty(doc)
+          title = doc.at_css('.anomaly-modal__title')&.text&.strip
+          desc = doc.at_css('.anomaly-modal__description')&.text&.strip
+          if title || desc
+            return "DuckDuckGo anomaly check (likely IP soft-block): #{[title, desc].compact.reject(&:empty?).join(' — ')}"
+          end
+
+          doc.css('script, style, noscript').remove
+          text = doc.text.gsub(/\s+/, ' ').strip
+          snippet = text.empty? ? '<empty body>' : text[0, 1500]
+          snippet += '…' if text.length > 1500
+          "DuckDuckGo returned no results. Page text: #{snippet}"
+        end
+        private_class_method :diagnose_empty
+
+        # Decode DuckDuckGo's +//duckduckgo.com/l/?uddg=<encoded>+ redirect wrapper
+        # back to the real target URL.
+        #
+        # @param href [String, nil] href as found on the search-result page
+        # @return [String] the decoded target URL, or +href+ unchanged when it is
+        #   not a recognized DDG redirect or cannot be parsed
+        def self.extract_url(href)
+          return href if href.nil? || href.empty?
+
+          uri = URI.parse(href.start_with?('//') ? "https:#{href}" : href)
+          return href unless uri.host&.end_with?('duckduckgo.com') && uri.path == '/l/'
+
+          params = URI.decode_www_form(uri.query.to_s).to_h
+          params['uddg'] || href
+        rescue URI::InvalidURIError
+          href
+        end
+      end
+    end
+  end
+end
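
To make the redirect decoding above concrete, a small sketch (not part of the diff). The wrapped URL is a fabricated example of DDG's uddg format; the live search call assumes network access and an IP that is not currently soft-blocked:

    require 'pikuri'

    ddg = Pikuri::Tool::Search::DuckDuckGo

    # extract_url is a pure function, so it can be exercised offline:
    # the uddg parameter carries the percent-encoded target URL.
    wrapped = '//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fpage&rut=abc'
    puts ddg.extract_url(wrapped) # => "https://example.com/page"

    # A live search; Engines::Unavailable signals a soft-block, 429, or 5xx.
    begin
      ddg.search('nokogiri css selectors').first(3).each { |r| puts r.url }
    rescue Pikuri::Tool::Search::Engines::Unavailable => e
      warn "DuckDuckGo unavailable: #{e.message}"
    end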