pikuri-core 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +67 -0
  3. data/lib/pikuri/agent/chat_transport.rb +41 -0
  4. data/lib/pikuri/agent/configurator.rb +270 -0
  5. data/lib/pikuri/agent/context_window_detector.rb +111 -0
  6. data/lib/pikuri/agent/control/cancellable.rb +128 -0
  7. data/lib/pikuri/agent/control/interloper.rb +167 -0
  8. data/lib/pikuri/agent/control/step_limit.rb +93 -0
  9. data/lib/pikuri/agent/control.rb +45 -0
  10. data/lib/pikuri/agent/event.rb +190 -0
  11. data/lib/pikuri/agent/extension.rb +82 -0
  12. data/lib/pikuri/agent/listener/in_memory_event_list.rb +34 -0
  13. data/lib/pikuri/agent/listener/rate_limited.rb +172 -0
  14. data/lib/pikuri/agent/listener/terminal.rb +264 -0
  15. data/lib/pikuri/agent/listener/token_log.rb +216 -0
  16. data/lib/pikuri/agent/listener.rb +54 -0
  17. data/lib/pikuri/agent/listener_list.rb +102 -0
  18. data/lib/pikuri/agent/synthesizer.rb +145 -0
  19. data/lib/pikuri/agent.rb +731 -0
  20. data/lib/pikuri/subprocess.rb +166 -0
  21. data/lib/pikuri/tool/calculator.rb +82 -0
  22. data/lib/pikuri/tool/fetch.rb +171 -0
  23. data/lib/pikuri/tool/parameters.rb +314 -0
  24. data/lib/pikuri/tool/scraper/fetch_error.rb +16 -0
  25. data/lib/pikuri/tool/scraper/html.rb +285 -0
  26. data/lib/pikuri/tool/scraper/pdf.rb +54 -0
  27. data/lib/pikuri/tool/scraper/simple.rb +183 -0
  28. data/lib/pikuri/tool/search/brave.rb +184 -0
  29. data/lib/pikuri/tool/search/duckduckgo.rb +196 -0
  30. data/lib/pikuri/tool/search/engines.rb +163 -0
  31. data/lib/pikuri/tool/search/exa.rb +217 -0
  32. data/lib/pikuri/tool/search/rate_limiter.rb +92 -0
  33. data/lib/pikuri/tool/search/result.rb +29 -0
  34. data/lib/pikuri/tool/sub_agent.rb +150 -0
  35. data/lib/pikuri/tool/web_scrape.rb +121 -0
  36. data/lib/pikuri/tool/web_search.rb +38 -0
  37. data/lib/pikuri/tool.rb +118 -0
  38. data/lib/pikuri/url_cache.rb +112 -0
  39. data/lib/pikuri/version.rb +10 -0
  40. data/lib/pikuri-core.rb +177 -0
  41. data/prompts/pikuri-chat.txt +15 -0
  42. metadata +251 -0
@@ -0,0 +1,183 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'faraday'
4
+ require 'uri'
5
+
6
+ module Pikuri
7
+ class Tool
8
+ # Namespace for the URL-to-Markdown scraping stack used by
9
+ # {Tool::WEB_SCRAPE} and {Tool::FETCH}: a content-type-dispatching
10
+ # fetcher ({Simple}), pure content extractors ({HTML}, {PDF}), and a
11
+ # shared error type ({FetchError}). Nothing here knows about the LLM
12
+ # — the tools that wrap these layers turn rendered Markdown (or
13
+ # +FetchError+) into the next observation.
14
+ module Scraper
15
+ # Plain HTTP scraper: GET the URL with a real-browser User-Agent,
16
+ # follow redirects, and dispatch the response body to the parser
17
+ # matching its +Content-Type+. HTML and XHTML route to
18
+ # {HTML.extract}; +application/pdf+ routes to {PDF.extract}; any
19
+ # other +text/*+ type (plain text, Markdown, source files, …) is
20
+ # passed through verbatim since the LLM can already read it; the
21
+ # remaining types raise {FetchError} so the LLM observes the
22
+ # failure instead of receiving an empty rendering.
23
+ #
24
+ # Split into a thin HTTP fetch ({.fetch}) and a content-type
25
+ # dispatcher ({.visit}) so tests can drive each piece in isolation.
26
+ # "Simple" because everything happens in one Faraday GET — no
27
+ # headless browser, no JS execution.
28
+ module Simple
29
+ # @return [String] User-Agent sent with each request; many sites
30
+ # reject requests with no UA or an obvious bot UA
31
+ USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \
32
+ '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
33
+ # @return [String] +Accept+ header sent with each request. Lists
34
+ # every content-type the dispatcher in {.visit} knows how to
35
+ # render, so servers that content-negotiate hand back something
36
+ # we can use. The trailing +text/*;q=0.8+ covers the verbatim
37
+ # pass-through arm (plain text, Markdown, source files, …) at a
38
+ # lower preference than rendered HTML/PDF.
39
+ ACCEPT = 'text/html,application/xhtml+xml,application/pdf,text/*;q=0.8'
40
+ # @return [Integer] maximum number of HTTP redirects to follow
41
+ # before giving up
42
+ MAX_REDIRECTS = 5
43
+ # @return [Integer] connect timeout in seconds for the underlying
44
+ # Faraday request
45
+ OPEN_TIMEOUT = 10
46
+ # @return [Integer] read timeout in seconds for the underlying
47
+ # Faraday request
48
+ READ_TIMEOUT = 20
49
+
50
+ # @return [Integer] maximum number of characters of an error
51
+ # response body to include in a {FetchError} message. The body is
52
+ # often a multi-kilobyte HTML challenge page (Cloudflare, WAF
53
+ # interstitial, etc.); a short excerpt tells the LLM what kind of
54
+ # page came back without flooding the next observation.
55
+ ERROR_BODY_EXCERPT = 200
56
+
57
+ # Result of a successful {Simple.fetch}: the response body, the
58
+ # normalized content-type (lower-cased, with any +; charset=...+
59
+ # parameters stripped), and the final URL after redirects. The
60
+ # final URL is kept so future scrapers can resolve relative links
61
+ # against the actual landing page rather than the originally
62
+ # requested one.
63
+ Fetched = Data.define(:body, :content_type, :url)
64
+
65
# Download +url+ and return its main content rendered as Markdown.
#
# This is the composition of {.fetch} and {.dispatch}. Nothing is
# cached at this layer, so every call performs a fresh network
# request; callers that want memoization wrap this method themselves
# (see {Tool::WebScrape.visit}).
#
# The rendered text is passed through +String#strip+ so the result
# never opens or closes with blank lines. Whitespace inside the
# document is left untouched, because Markdown paragraph breaks and
# source-code indentation carry meaning.
#
# @param url [String] absolute HTTP(S) URL of the page to download
# @return [String] rendered page with leading/trailing whitespace
#   trimmed and no size cap applied — limiting the size before the
#   text reaches the LLM is the caller's job
# @raise [FetchError] propagated from {.fetch} (HTTP or network
#   failure, redirect problems) or from {.dispatch} (unrecognized
#   content-type)
def self.visit(url)
  fetched = fetch(url)
  rendered = dispatch(fetched)
  rendered.strip
end
89
+
90
# Download the body of +url+, following redirects by hand. Faraday is
# used without middleware, so redirect handling lives entirely in
# this method.
#
# Every recoverable failure surfaces as {FetchError} so the caller
# has a single exception type to rescue: HTTP 4xx/5xx,
# +Faraday::Error+ network failures, URLs that cannot be parsed
# (+URI::Error+, either for the requested URL or for a redirect's
# +Location+ value), an exhausted redirect budget, and a 3xx without
# a +Location+ header. Error bodies are run through +excerpt+ so a
# multi-kilobyte challenge page does not flood the message.
#
# Redirect budget: +limit+ is decremented on each 3xx and a call
# with +limit+ equal to zero raises before any request is made.
#
# @param url [String] absolute HTTP(S) URL to fetch
# @param limit [Integer] remaining budget; recurses with
#   +limit - 1+ on each 3xx
# @return [Fetched] body, normalized content-type, and the URL that
#   finally answered with a 2xx
# @raise [FetchError] on non-2xx/3xx responses, network errors,
#   unparseable URLs or +Location+ values, redirect-budget
#   exhaustion, or a 3xx without a +Location+ header
def self.fetch(url, limit: MAX_REDIRECTS)
  raise FetchError, "too many redirects fetching #{url}" if limit.zero?

  response = begin
    connection = Faraday.new(request: { open_timeout: OPEN_TIMEOUT, timeout: READ_TIMEOUT })
    connection.get(url) do |req|
      req.headers['User-Agent'] = USER_AGENT
      req.headers['Accept'] = ACCEPT
    end
  rescue Faraday::Error, URI::Error => e
    # URI::Error is rescued as well: a URL that cannot be parsed is
    # presumably reported by the URI library rather than wrapped in
    # Faraday::Error — either way the caller should only see FetchError.
    raise FetchError, "#{e.class.name.split('::').last} fetching #{url}: #{e.message}"
  end

  case response.status
  when 200..299
    Fetched.new(body: response.body, content_type: normalize_content_type(response.headers['content-type']), url: url)
  when 300..399
    location = response.headers['location']
    raise FetchError, "HTTP #{response.status} from #{url} with no Location header" if location.nil? || location.empty?

    # Location may be relative, so resolve it against the current URL.
    # URI.join raises on malformed values (for example an unescaped
    # space); convert that into the documented error type.
    target = begin
      URI.join(url, location).to_s
    rescue URI::Error => e
      raise FetchError, "HTTP #{response.status} from #{url} with unusable Location #{location.inspect}: #{e.message}"
    end

    fetch(target, limit: limit - 1)
  else
    raise FetchError, "HTTP #{response.status} fetching #{url}: #{excerpt(response.body)}"
  end
end
134
+
135
# Pick the renderer for a {Fetched} response based on its normalized
# content-type:
#
# * +text/html+ and +application/xhtml+xml+ go to {HTML.extract}
# * +application/pdf+ goes to {PDF.extract}
# * any other type starting with +text/+ is returned unchanged
# * everything else raises, so the LLM observes a legible failure
#   instead of an empty string
#
# @param fetched [Fetched]
# @return [String] output of the matched parser, or the raw body for
#   the +text/*+ pass-through
# @raise [FetchError] when no branch matches the response's
#   content-type
def self.dispatch(fetched)
  type = fetched.content_type
  payload = fetched.body

  return HTML.extract(payload) if %w[text/html application/xhtml+xml].include?(type)
  return PDF.extract(payload) if type == 'application/pdf'
  return payload if %r{\Atext/}.match?(type)

  raise FetchError, "unsupported content-type #{type.inspect} for #{fetched.url}"
end
156
+
157
# Reduce a raw +Content-Type+ header to its bare media type: keep
# only what precedes the first +;+ (dropping parameters such as
# +charset+), trim surrounding whitespace and lower-case the result,
# so the dispatcher can compare against a canonical token.
#
# @param raw [String, nil] raw +Content-Type+ header value
# @return [String] normalized content-type, or +""+ when the header
#   was missing
def self.normalize_content_type(raw)
  media_type, = raw.to_s.partition(';')
  media_type.strip.downcase
end
166
+ private_class_method :normalize_content_type
167
+
168
# Turn an error response body into a short single-line excerpt: runs
# of whitespace become one space, the ends are trimmed, and anything
# beyond {ERROR_BODY_EXCERPT} characters is replaced by +...+. This
# keeps a {FetchError} message readable even when the server sent a
# multi-KB HTML challenge page.
#
# @param body [String, nil]
# @return [String]
def self.excerpt(body)
  collapsed = body.to_s.gsub(/\s+/, ' ').strip
  return collapsed if collapsed.length <= ERROR_BODY_EXCERPT

  "#{collapsed[0, ERROR_BODY_EXCERPT]}..."
end
179
+ private_class_method :excerpt
180
+ end
181
+ end
182
+ end
183
+ end
@@ -0,0 +1,184 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'faraday'
4
+ require 'json'
5
+ require 'nokogiri'
6
+
7
+ module Pikuri
8
+ class Tool
9
+ module Search
10
+ # Performs a Brave Search via the official Web Search API and returns
11
+ # the hits as a list of {Result} rows. Split into a thin HTTP fetch
12
+ # (#search) and a pure parser (#parse) so tests can exercise the
13
+ # parser against fixture JSON without hitting the network. The
14
+ # cascade in {Engines.search} owns the final Markdown rendering.
15
+ #
16
+ # Requires a Brave Search API key. Get one at
17
+ # https://api-dashboard.search.brave.com — the free "Data for Search"
18
+ # tier allows 1 query/sec and ~2k queries/month.
19
+ #
20
+ # == Privacy posture
21
+ #
22
+ # Brave's API Privacy Notice retains Search Query Logs for 90 days
23
+ # (billing / troubleshooting) and states +Brave does not collect any
24
+ # identifiers that can link a search query to an individual or their
25
+ # devices+. Brave publicly commits that the Search API does not use
26
+ # query data to train its own models, and offers Zero Data Retention
27
+ # — but only on the Enterprise plan, not on the free "Data for
28
+ # Search" tier pikuri defaults to.
29
+ #
30
+ # Bottom line: of pikuri's three providers Brave has the cleanest
31
+ # API-level posture — no training-on-queries, no IP linkage, capped
32
+ # 90-day retention by default, real ZDR if you pay for it. Still a
33
+ # logged 90-day window on the cheap tier, so not a substitute for
34
+ # ZDR for genuinely sensitive queries.
35
+ module Brave
36
+ # @return [String] Web Search endpoint
37
+ ENDPOINT = 'https://api.search.brave.com/res/v1/web/search'
38
+ # @return [Integer] default number of results returned, matching
39
+ # {DuckDuckGo::DEFAULT_MAX_RESULTS}
40
+ DEFAULT_MAX_RESULTS = 10
41
+ # @return [String] name of the environment variable holding the API key, which is sent as the +X-Subscription-Token+ request header
42
+ ENV_KEY = 'BRAVE_SEARCH_API_KEY'
43
+ # @return [RateLimiter] free-tier Brave caps at 1 req/sec; the
44
+ # 5-minute cooldown protects the limited monthly quota from
45
+ # being burned on doomed retries when a 429 hits.
46
+ LIMITER = RateLimiter.new(min_interval: 1.0, cooldown: 300.0)
47
+
48
# Run +query+ against the Brave Web Search endpoint and return the
# hits as an +Array<Result>+. The HTTP call happens inside
# {LIMITER}, which throttles calls and applies a cooldown after
# rate-limit / quota responses. The caller (typically
# {Engines.search}) is expected to have normalized the query already
# and to wrap this call in a result cache.
#
# @param query [String] search query (already normalized)
# @param max_results [Integer] maximum number of result entries;
#   passed through as Brave's +count+ (1..20)
# @param api_key [String] Brave Search subscription token; defaults
#   to the {ENV_KEY} environment variable
# @return [Array<Result>] hits, possibly empty when Brave ran the
#   query and matched nothing
# @raise [ArgumentError] if no API key is available
# @raise [Engines::Unavailable] when Brave answers HTTP 429 or 5xx —
#   "try again later" responses the cascade in {Engines.search} can
#   fall back from. Also raised immediately if {LIMITER} is in
#   cooldown.
# @raise [RuntimeError] for any other non-2xx (e.g. 401/403 from a
#   bad API key, so config problems stay visible), or when {.parse}
#   rejects the response shape
def self.search(query, max_results: DEFAULT_MAX_RESULTS, api_key: ENV.fetch(ENV_KEY, nil))
  key_missing = api_key.to_s.strip.empty?
  raise ArgumentError, "Brave Search API key not set (#{ENV_KEY})" if key_missing

  LIMITER.call do
    params = { q: query, count: max_results }
    headers = { 'X-Subscription-Token' => api_key, 'Accept' => 'application/json' }
    response = Faraday.get(ENDPOINT, params, headers)

    # Happy path first; everything below handles a non-2xx answer.
    next parse(response.body, max_results: max_results) if response.success?

    status = response.status
    raise Engines::Unavailable, "HTTP #{status}" if status == 429 || status >= 500

    raise "Brave Search request failed: #{status} #{response.body}"
  end
end
90
+
91
# Parse a Brave Web Search JSON response into a list of {Result}
# rows. +title+ and +description+ are passed through +strip_html+ so
# highlight markup does not reach the output. Entries without a
# +url+ are dropped.
#
# The payload is read defensively: a body that is valid JSON but not
# an object, a +web+ value that is not an object, or result entries
# that are not objects are all treated as "no usable results" rather
# than crashing, so they reach the diagnostics below.
#
# When no result rows come out, two cases are distinguished: a
# payload recognized by +genuine_no_results?+ returns an empty array
# so {Engines.search} can render its standard no-results stub;
# anything else raises with the message built by +diagnose_empty+ so
# the failure surfaces.
#
# @param json [String] response body from {ENDPOINT}
# @param max_results [Integer] maximum number of result entries
# @return [Array<Result>] hits, possibly empty on a recognized
#   empty-results payload
# @raise [RuntimeError] when the response yields no result entries
#   and is not recognized as a genuine empty-results payload
# @raise [JSON::ParserError] when +json+ is not valid JSON
def self.parse(json, max_results: DEFAULT_MAX_RESULTS)
  data = JSON.parse(json)

  # Only descend into shapes we understand; anything else counts as
  # zero entries and is reported through diagnose_empty below.
  web = data.is_a?(Hash) ? data['web'] : nil
  entries = web.is_a?(Hash) ? Array(web['results']) : []

  results = entries.take(max_results).filter_map do |entry|
    next unless entry.is_a?(Hash)

    href = entry['url'].to_s
    next if href.empty?

    Result.new(
      url: href,
      title: strip_html(entry['title']),
      body: strip_html(entry['description'])
    )
  end

  return results unless results.empty?
  return [] if genuine_no_results?(data)

  raise diagnose_empty(data, json)
end
130
+
131
# Reduce a Brave result field to plain text: parse it as an HTML
# fragment, keep only the text content (dropping markup such as the
# +<strong>+ highlights Brave wraps around query terms), then
# collapse whitespace runs to single spaces and trim the ends.
#
# @param html [String, nil] raw text from a Brave result field
# @return [String] plain text with tags removed; empty string for nil
def self.strip_html(html)
  if html.nil?
    ''
  else
    visible = Nokogiri::HTML.fragment(html).text
    visible.gsub(/\s+/, ' ').strip
  end
end
141
+ private_class_method :strip_html
142
+
143
# Decide whether a parsed response that produced zero +web.results+
# entries is Brave's own "search ran, nothing matched" payload
# rather than a malformed or error response. Both markers must hold:
# the envelope is a Hash with +type: "search"+, and its +mixed+
# block is a Hash whose +main+, +top+ and +side+ lists are all empty
# or absent.
#
# @param data [Hash, Object] parsed response
# @return [Boolean]
def self.genuine_no_results?(data)
  search_envelope = data.is_a?(Hash) && data['type'] == 'search'
  sections = search_envelope ? data['mixed'] : nil
  return false unless sections.is_a?(Hash)

  # Array() makes a missing section count as empty.
  sections.values_at('main', 'top', 'side').flat_map { |entries| Array(entries) }.empty?
end
161
+ private_class_method :genuine_no_results?
162
+
163
# Compose the diagnostic for a parsed response that yielded zero
# results. When the payload carries an +error+ object, its +code+
# and +detail+ (falling back to +meta+) are quoted; otherwise the
# first 800 characters of the raw body are included, followed by an
# ellipsis if the body was longer, so the caller can see what came
# back.
#
# @param data [Hash, Object] parsed response
# @param raw [String] raw response body
# @return [String] human-readable diagnostic to feed to +raise+
def self.diagnose_empty(data, raw)
  error = data.is_a?(Hash) ? data['error'] : nil
  if error.is_a?(Hash)
    parts = [error['code'], error['detail'] || error['meta']].compact
    return "Brave Search returned an error: #{parts.join(' — ')}"
  end

  body = raw.to_s
  ellipsis = body.length > 800 ? '…' : ''
  "Brave Search returned no results. Body: #{body[0, 800]}#{ellipsis}"
end
180
+ private_class_method :diagnose_empty
181
+ end
182
+ end
183
+ end
184
+ end
@@ -0,0 +1,196 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'faraday'
4
+ require 'nokogiri'
5
+ require 'uri'
6
+
7
+ module Pikuri
8
+ class Tool
9
+ module Search
10
+ # Performs a DuckDuckGo search by scraping +html.duckduckgo.com+ and
11
+ # returns the hits as a list of {Result} rows. Split into a thin HTTP
12
+ # fetch (#search) and a pure parser (#parse) so tests can exercise
13
+ # the parser against fixture HTML without hitting the network. The
14
+ # cascade in {Engines.search} owns the final Markdown rendering.
15
+ #
16
+ # == Privacy posture
17
+ #
18
+ # DuckDuckGo's privacy policy states +We don't save your IP address
19
+ # or any unique identifiers alongside your searches+ and +We have
20
+ # never sold any personal information+, and they proxy requests on
21
+ # the user's behalf so downstream content providers can't build a
22
+ # per-user search history. That part is real — but DDG is mainly a
23
+ # relay over Bing for web results, so the *query content* still
24
+ # reaches Microsoft for fulfillment even though DDG strips
25
+ # identifying info on the way out.
26
+ #
27
+ # Bottom line: DDG is a genuine privacy improvement over hitting
28
+ # Bing directly (your IP isn't tied to the query, no per-user
29
+ # profile is built on DDG's side), but query content still lands at
30
+ # Microsoft, who has no comparable no-training pledge. Better than
31
+ # Exa for sensitive queries, worse than Brave; for anything
32
+ # genuinely embarrassing, don't search the web at all.
33
+ module DuckDuckGo
34
+ # @return [String] HTML search endpoint
35
+ ENDPOINT = 'https://html.duckduckgo.com/html/'
36
+ # @return [String] User-Agent sent with each request; DDG often rejects
37
+ # requests with no UA or an obvious bot UA
38
+ USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \
39
+ '(KHTML, like Gecko) Chrome/120.0 Safari/537.36'
40
+ # @return [Integer] default number of results returned, matching smolagents
41
+ DEFAULT_MAX_RESULTS = 10
42
+ # @return [RateLimiter] paces calls (DDG bans IPs that hammer the HTML
43
+ # endpoint) and circuit-breaks on {Engines::Unavailable} so a
44
+ # soft-block response doesn't get retried for the next 5 minutes
45
+ LIMITER = RateLimiter.new(min_interval: 2.0, cooldown: 300.0)
46
+
47
# Run +query+ against the DuckDuckGo HTML endpoint and return the
# hits as an +Array<Result>+. The HTTP call happens inside
# {LIMITER}, which paces calls and applies a cooldown after a
# soft-block. The caller (typically {Engines.search}) is expected to
# have normalized the query already and to wrap this call in a
# result cache.
#
# @param query [String] search query (already normalized)
# @param max_results [Integer] maximum number of result entries
# @return [Array<Result>] hits, possibly empty when DDG ran the
#   query and matched nothing
# @raise [Engines::Unavailable] when DDG answers HTTP 429/5xx or
#   {.parse} recognizes the anomaly/CAPTCHA page — "try again later"
#   responses the cascade in {Engines.search} can fall back from.
#   Also raised immediately if {LIMITER} is in cooldown.
# @raise [RuntimeError] for any other non-2xx response, or when the
#   empty-results page is in an unrecognized layout. A genuine
#   empty-results page is *not* an error; see {.parse}.
def self.search(query, max_results: DEFAULT_MAX_RESULTS)
  LIMITER.call do
    response = Faraday.get(ENDPOINT, { q: query }, { 'User-Agent' => USER_AGENT })

    # Happy path first; everything below handles a non-2xx answer.
    next parse(response.body, max_results: max_results) if response.success?

    status = response.status
    raise Engines::Unavailable, "HTTP #{status}" if status == 429 || status >= 500

    raise "DuckDuckGo request failed: #{status} #{response.body}"
  end
end
78
+
79
+ # Parse a +html.duckduckgo.com+ result page into a list of {Result}
80
+ # rows. +<b>+ highlights inside snippets are stripped.
81
+ #
82
+ # When the page has zero result nodes, two cases are distinguished:
83
+ # a genuine "no results" page (narrow query, DDG's own "No results
84
+ # found" indicator) returns an empty array instead of raising, so
85
+ # {Engines.search} can render its standard no-results stub.
86
+ # Anything else (anomaly modal, CAPTCHA, service-unavailable page,
87
+ # unknown layout) raises with the diagnostic text extracted from
88
+ # the body, so an IP soft-block is surfaced rather than silently
89
+ # masquerading as an empty search.
90
+ #
91
+ # @param html [String] HTML document body from html.duckduckgo.com
92
+ # @param max_results [Integer] maximum number of result entries
93
+ # @return [Array<Result>] hits, possibly empty on a genuine
94
+ # no-results page
95
+ # @raise [Engines::Unavailable] when the page is the DDG
96
+ # anomaly/CAPTCHA modal (IP soft-block) — a "try again later" the
97
+ # cascade can fall back from.
98
+ # @raise [RuntimeError] when the page contains no result nodes and is
99
+ # not recognized as either a genuine no-results page or the
100
+ # anomaly modal (likely a layout change worth surfacing loudly).
101
+ def self.parse(html, max_results: DEFAULT_MAX_RESULTS)
102
+ doc = Nokogiri::HTML(html)
103
+ results = doc.css('div.result.web-result').take(max_results).filter_map do |node|
104
+ title_link = node.at_css('a.result__a')
105
+ next nil if title_link.nil?
106
+
107
+ snippet = node.at_css('a.result__snippet')
108
+ Result.new(
109
+ url: extract_url(title_link['href']),
110
+ title: title_link.text.strip,
111
+ body: snippet&.text&.strip.to_s
112
+ )
113
+ end
114
+
115
+ if results.empty?
116
+ return [] if genuine_no_results?(doc)
117
+
118
+ message = diagnose_empty(doc)
119
+ raise(anomaly_modal?(doc) ? Engines::Unavailable : RuntimeError, message)
120
+ end
121
+
122
+ results
123
+ end
124
+
125
+ # True when the page contains DDG's anomaly/CAPTCHA modal — i.e. the
126
+ # IP has been soft-blocked. Used by {.parse} to pick between
127
+ # {Engines::Unavailable} (recoverable, fall back to another
128
+ # provider) and {RuntimeError} (unknown layout, surface loudly).
129
+ #
130
+ # @param doc [Nokogiri::HTML::Document] parsed result page
131
+ # @return [Boolean]
132
+ def self.anomaly_modal?(doc)
133
+ !!(doc.at_css('.anomaly-modal__title') || doc.at_css('.anomaly-modal__description'))
134
+ end
135
+ private_class_method :anomaly_modal?
136
+
137
# Decide whether a page with zero result nodes is DDG's own "no
# results found" page (narrow query) rather than an anomaly/CAPTCHA
# or some other non-results layout.
#
# The anomaly modal takes precedence: when either modal element is
# present the page is never treated as a genuine empty result, even
# if its text mentions "No results". Otherwise the page qualifies
# when it contains a +div.no-results+ element or its text includes
# the phrase "No results found".
#
# @param doc [Nokogiri::HTML::Document] parsed result page
# @return [Boolean]
def self.genuine_no_results?(doc)
  modal_present = %w[.anomaly-modal__title .anomaly-modal__description].any? { |selector| doc.at_css(selector) }
  return false if modal_present

  doc.at_css('div.no-results') ? true : doc.text.include?('No results found')
end
151
+ private_class_method :genuine_no_results?
152
+
153
# Compose the diagnostic for a results page that yielded zero
# matches.
#
# If the anomaly modal is present, its title and description are
# quoted (blank parts omitted). Otherwise +<script>+, +<style>+ and
# +<noscript>+ nodes are removed from +doc+ — note this modifies the
# document passed in — and the remaining text, whitespace-collapsed
# and clipped to 1500 characters, is included so the caller can see
# what came back.
#
# @param doc [Nokogiri::HTML::Document] parsed result page
# @return [String] human-readable diagnostic to feed to +raise+
def self.diagnose_empty(doc)
  modal_parts = %w[.anomaly-modal__title .anomaly-modal__description].map do |selector|
    doc.at_css(selector)&.text&.strip
  end
  if modal_parts.any?
    quoted = modal_parts.compact.reject(&:empty?).join(' — ')
    return "DuckDuckGo anomaly check (likely IP soft-block): #{quoted}"
  end

  doc.css('script, style, noscript').remove
  visible = doc.text.gsub(/\s+/, ' ').strip
  excerpt = if visible.empty?
              '<empty body>'
            elsif visible.length > 1500
              "#{visible[0, 1500]}…"
            else
              visible
            end
  "DuckDuckGo returned no results. Page text: #{excerpt}"
end
174
+ private_class_method :diagnose_empty
175
+
176
# Decode DuckDuckGo's +//duckduckgo.com/l/?uddg=<encoded>+ redirect
# wrapper back to the real target URL.
#
# A link is treated as a wrapper only when its host is exactly
# +duckduckgo.com+ or a subdomain of it (compared
# case-insensitively) and its path is +/l/+. The host check is
# anchored on a dot boundary so that look-alike hosts such as
# +evilduckduckgo.com+ are not unwrapped.
#
# @param href [String, nil] href as found on the search-result page
# @return [String, nil] the decoded target URL, or +href+ unchanged
#   when it is nil/empty, is not a recognized DDG redirect, carries
#   no usable +uddg+ value, or cannot be parsed
def self.extract_url(href)
  return href if href.nil? || href.empty?

  # Protocol-relative links need a scheme before URI can parse a host.
  uri = URI.parse(href.start_with?('//') ? "https:#{href}" : href)
  host = uri.host.to_s.downcase
  ddg_host = host == 'duckduckgo.com' || host.end_with?('.duckduckgo.com')
  return href unless ddg_host && uri.path == '/l/'

  target = URI.decode_www_form(uri.query.to_s).to_h['uddg']
  target.nil? || target.empty? ? href : target
rescue URI::InvalidURIError, ArgumentError
  # ArgumentError covers a query string that URI.decode_www_form rejects.
  href
end
193
+ end
194
+ end
195
+ end
196
+ end