pikuri-core 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,63 +1,235 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'dentaku'
4
-
5
3
  module Pikuri
6
4
  class Tool
7
- # Evaluates a basic arithmetic expression using Dentaku, with light
8
- # preprocessing so the LLM can emit Python-flavored syntax (notably
9
- # +**+ for exponentiation) instead of learning Dentaku's dialect.
5
+ # Evaluates a basic arithmetic expression with Python operator
6
+ # syntax and semantics, via the hand-rolled recursive-descent
7
+ # {Parser} below.
8
+ #
9
+ # Why hand-rolled rather than a gem: the previous backend, dentaku,
10
+ # pulled in concurrent-ruby (~16k lines of Ruby — the single
11
+ # heaviest audit item in pikuri's whole dependency closure) plus
12
+ # bigdecimal and tsort, all to evaluate four-function arithmetic.
13
+ # The ~100 lines here implement Python's expression grammar
14
+ # directly, which also retires the old preprocessing step that
15
+ # rewrote Python's +**+ into dentaku's +^+ dialect — the model's
16
+ # native syntax is now simply the grammar.
10
17
  #
11
- # Scope is intentionally narrow: operators (+, -, *, /, **, %),
12
- # parentheses, and decimal numbers. No variables, functions, or
13
- # booleans — those would mean teaching the model a dialect, which we
14
- # specifically want to avoid for this tool.
18
+ # Scope is intentionally narrow: operators (+, -, *, /, //, %, **),
19
+ # unary minus, parentheses, and integer / decimal / e-notation
20
+ # literals. No variables, functions, or booleans — those would mean
21
+ # teaching the model a dialect, which we specifically want to avoid
22
+ # for this tool.
15
23
  module Calculator
16
- # Translate the operator differences between Python and Dentaku. In
17
- # practice that is only +**+ +^+; everything else in the supported
18
- # subset is byte-identical.
19
- #
20
- # @param expression [String] raw expression as the model wrote it
21
- # @return [String] expression with Python-style operators rewritten
22
- def self.normalize(expression)
23
- expression.gsub('**', '^')
24
- end
24
+ # Raised internally for anything {.calculate} should hand back to
25
+ # the model as an +"Error: ..."+ observation rather than crash
26
+ # the agent loop: parse failures, division by zero, complex or
27
+ # non-finite results. The message always names the offending
28
+ # token or operands.
29
+ class Error < StandardError; end
25
30
 
26
- # Evaluate +expression+ and return the result formatted as a String.
27
- # Parse, unbound-variable, and division-by-zero failures are caught
28
- # and returned as +"Error: ..."+ strings so the model can read the
29
- # failure as the next observation and self-correct rather than
30
- # crashing the agent loop.
31
+ # Evaluate +expression+ and return the result formatted as a
32
+ # String. Parse and arithmetic failures (division by zero,
33
+ # overflow to infinity, complex results) are caught and returned
34
+ # as +"Error: ..."+ strings so the model can read the failure as
35
+ # the next observation and self-correct rather than crashing the
36
+ # agent loop.
31
37
  #
32
- # @param expression [String]
38
+ # @param expression [String] Python-syntax arithmetic expression
33
39
  # @return [String] numeric result, or +"Error: ..."+ on failure
34
40
  def self.calculate(expression)
35
- result = Dentaku::Calculator.new.evaluate!(normalize(expression))
41
+ result = Parser.new(expression).parse
42
+ if result.is_a?(Float) && !result.finite?
43
+ raise Error, "result of #{expression.inspect} is not a finite number"
44
+ end
45
+
36
46
  format_result(result)
37
- rescue Dentaku::ZeroDivisionError, ZeroDivisionError
38
- 'Error: division by zero'
39
- rescue Dentaku::Error => e
47
+ rescue Error => e
40
48
  "Error: #{e.message}"
41
49
  end
42
50
 
43
- # Dentaku returns BigDecimal for any expression that touches division
44
- # or a decimal literal, with full BigDecimal precision (47-digit tails
45
- # for the leopard expression). Round to 3 places and strip the
46
- # default scientific-notation formatting so the model sees a short
47
- # readable number; integer/other results pass through unchanged.
51
+ # Integers (never produced by division — +/+ is Python-3-style
52
+ # true division) pass through exact. Floats are rounded to 3
53
+ # places so the model sees a short readable number, and
54
+ # whole-valued floats drop the trailing +.0+ (+4 / 2+ renders as
55
+ # +"2"+, not +"2.0"+).
48
56
  def self.format_result(result)
49
- case result
50
- when BigDecimal then result.round(3).to_s('F')
51
- else result.to_s
52
- end
57
+ return result.to_s if result.is_a?(Integer)
58
+
59
+ rounded = result.round(3)
60
+ rounded == rounded.truncate ? rounded.truncate.to_s : rounded.to_s
53
61
  end
54
62
  private_class_method :format_result
63
+
64
+ # Recursive-descent parser-evaluator for Python's arithmetic
65
+ # expression grammar:
66
+ #
67
+ # additive := multiplicative (('+' | '-') multiplicative)*
68
+ # multiplicative := unary (('*' | '/' | '//' | '%') unary)*
69
+ # unary := ('+' | '-') unary | power
70
+ # power := atom ('**' unary)?
71
+ # atom := NUMBER | '(' additive ')'
72
+ #
73
+ # The +power+ → +unary+ recursion on the right operand is what
74
+ # makes +**+ right-associative (+2**3**2+ is 512) and lets a sign
75
+ # follow it (+2**-3+); +unary+ sitting *above* +power+ on the
76
+ # left is what makes +-2**2+ evaluate to -4 — both exactly as
77
+ # Python parses them.
78
+ #
79
+ # Semantics follow Python 3 where Ruby differs: +/+ is always
80
+ # true (float) division, +//+ floors, +2**-1+ is the Float 0.5
81
+ # (Ruby would return a Rational), and a negative base under a
82
+ # fractional exponent is rejected (Ruby would return a Complex).
83
class Parser
  # One number or operator, optionally preceded by whitespace. +**+ and
  # +//+ are listed before their single-character prefixes so the
  # two-character operators win; number literals cover +42+, +4.2+,
  # +5.+, +.5+, and e-notation on any of them. +\G+ anchors each match
  # at the scan position so nothing between tokens goes unnoticed.
  TOKEN_RE = %r{\G\s*(\*\*|//|\d+(?:\.\d*)?(?:[eE][+-]?\d+)?|\.\d+(?:[eE][+-]?\d+)?|[-+*/%()])}

  # @param expression [String] raw expression as the model wrote it
  # @raise [Error] when +expression+ contains a character no token matches
  def initialize(expression)
    @tokens = tokenize(expression)
    @pos = 0
  end

  # Parse and evaluate the whole token stream.
  #
  # @return [Integer, Float] the value of the expression
  # @raise [Error] on syntax errors, division by zero, a non-finite
  #   floor-division quotient, or a complex result
  def parse
    value = additive
    raise Error, "unexpected #{peek.inspect} after expression" if peek

    value
  end

  private

  # Split +expression+ into tokens by repeatedly matching {TOKEN_RE} at
  # the current scan position. Scanning stops at the first position no
  # token matches; anything other than whitespace left there is an error.
  #
  # @param expression [String]
  # @return [Array<String>] token strings in source order
  # @raise [Error] naming the first character no token matches
  def tokenize(expression)
    tokens = []
    pos = 0
    while (match = TOKEN_RE.match(expression, pos))
      tokens << match[1]
      pos = match.end(0)
    end
    rest = expression[pos..].to_s.strip
    raise Error, "unexpected character #{rest[0].inspect} in #{expression.inspect}" unless rest.empty?

    tokens
  end

  # additive := multiplicative (('+' | '-') multiplicative)*
  def additive
    value = multiplicative
    while (op = accept('+', '-'))
      rhs = multiplicative
      value = op == '+' ? value + rhs : value - rhs
    end
    value
  end

  # multiplicative := unary (('*' | '/' | '//' | '%') unary)*
  def multiplicative
    value = unary
    while (op = accept('*', '/', '//', '%'))
      value = apply_multiplicative(op, value, unary)
    end
    value
  end

  # unary := ('+' | '-') unary | power
  def unary
    op = accept('+', '-')
    return power unless op

    value = unary
    op == '-' ? -value : value
  end

  # power := atom ('**' unary)?
  #
  # The right operand recurses through +unary+, which makes +**+
  # right-associative and allows a sign after it (+2**-3+).
  def power
    base = atom
    return base unless accept('**')

    apply_power(base, unary)
  end

  # atom := NUMBER | '(' additive ')'
  #
  # @return [Integer, Float] Float when the literal contains +.+ or an
  #   exponent marker, Integer otherwise
  # @raise [Error] when the next token is not a number or +(+
  def atom
    return parenthesized if accept('(')

    token = peek
    unless token&.match?(/\A[.\d]/)
      raise Error, token ? "unexpected #{token.inspect}" : 'unexpected end of expression'
    end

    @pos += 1
    token.match?(/[.eE]/) ? token.to_f : token.to_i
  end

  # Evaluate the inside of a parenthesized group; the opening +(+ has
  # already been consumed by {#atom}.
  def parenthesized
    value = additive
    raise Error, 'missing closing parenthesis' unless accept(')')

    value
  end

  # @return [String, nil] the next token without consuming it
  def peek
    @tokens[@pos]
  end

  # Consume and return the next token if it is one of +expected+.
  #
  # @return [String, nil] the consumed token, or nil on no match
  def accept(*expected)
    token = peek
    return nil unless expected.include?(token)

    @pos += 1
    token
  end

  # +/+ is Python-3 true division (always Float); +//+ floors (see
  # {#floor_divide}); +%+ delegates to Ruby's +%+, whose
  # sign-of-divisor semantics match Python's.
  #
  # @raise [Error] on a zero divisor, or a non-finite +//+ quotient
  def apply_multiplicative(op, lhs, rhs)
    return lhs * rhs if op == '*'
    raise Error, 'division by zero' if rhs.zero?

    case op
    when '/' then lhs.fdiv(rhs)
    when '//' then floor_divide(lhs, rhs)
    when '%' then lhs % rhs
    end
  end

  # Floor division. Two Integers stay exact via Ruby's +Integer#/+,
  # which already floors. Anything involving a Float goes through
  # +fdiv+ and +floor+.
  #
  # +Float#floor+ raises +FloatDomainError+ on Infinity / NaN (for
  # example +1e308 // 1e-308+, or an out-of-range literal such as
  # +1e999+ as an operand). That exception is not an {Error}, so it is
  # converted here to keep every arithmetic failure inside the one
  # exception type the caller rescues.
  #
  # @return [Integer]
  # @raise [Error] when the quotient is not a finite number
  def floor_divide(lhs, rhs)
    return lhs / rhs if lhs.is_a?(Integer) && rhs.is_a?(Integer)

    quotient = lhs.fdiv(rhs)
    raise Error, "#{lhs} // #{rhs} is not a finite number" unless quotient.finite?

    quotient.floor
  end

  # Python-compatibility shims over Ruby's +**+: an Integer raised to
  # a negative Integer yields a Float (Ruby would return a Rational),
  # and a Complex result — negative base under a fractional exponent —
  # is rejected loudly.
  #
  # @raise [Error] on +0+ raised to a negative Integer, a Complex
  #   result, or a power Ruby refuses to compute
  def apply_power(base, exponent)
    if base.is_a?(Integer) && exponent.is_a?(Integer) && exponent.negative?
      raise Error, 'division by zero' if base.zero?

      return base.to_f**exponent
    end

    result = begin
      base**exponent
    rescue ArgumentError
      # NOTE(review): Ruby 3.4+ is understood to raise ArgumentError for
      # extremely large Integer powers (older versions return Infinity).
      # Operands here are always Integer/Float, so no other ArgumentError
      # source is expected — confirm against the targeted Ruby versions.
      raise Error, "(#{base})**(#{exponent}) is too large to compute"
    end
    if result.is_a?(Complex)
      raise Error, "(#{base})**(#{exponent}) is a complex number; only real arithmetic is supported"
    end

    result
  end
end
55
227
  end
56
228
 
57
229
  # Arithmetic-evaluation tool backed by {Tool::Calculator.calculate}.
58
- # Accepts Python-flavored operator syntax (+, -, *, /, ** for
59
- # exponentiation, %, parentheses, decimals) so the model can emit the
60
- # syntax it already knows.
230
+ # Accepts Python expression syntax (+, -, *, /, //, %, ** for
231
+ # exponentiation, unary minus, parentheses, decimals) so the model
232
+ # can emit the syntax it already knows.
61
233
  #
62
234
  # @return [Tool]
63
235
  CALCULATOR = new(
@@ -67,7 +239,7 @@ module Pikuri
67
239
 
68
240
  Usage:
69
241
  - Use this for any arithmetic beyond simple mental math — do not eyeball multi-digit work.
70
- - Operators supported: +, -, *, /, ** (exponentiation), %, parentheses, decimal numbers.
242
+ - Python expression syntax: +, -, *, / (true division), // (floor division), % (modulo), ** (exponentiation), unary minus, parentheses, decimal numbers.
71
243
  - Decimal results are rounded to 3 places; integer results are exact.
72
244
  - Failures (parse error, division by zero) come back as "Error: ..." — read the message and re-call with a corrected expression.
73
245
  DESC
@@ -3,13 +3,14 @@
3
3
  module Pikuri
4
4
  class Tool
5
5
  # Truncation policy and Tool spec for the +fetch+ tool. The HTTP work
6
- # lives in {Tool::Scraper::Simple.fetch}; this module is a thin
6
+ # lives in {Tool::Scraper.fetch}; this module is a thin
7
7
  # wrapper that accepts only textual content-types, applies a character
8
8
  # cap so the LLM doesn't drown in long-form bodies, and exposes the
9
9
  # result to the agent loop in OpenAI tool-call shape.
10
10
  #
11
- # Sister of {Tool::WebScrape}, but without HTML→Markdown or PDF→text
12
- # extraction: bodies are returned verbatim. Useful for raw textual
11
+ # Sister of {Tool::WebScrape}, but with no extraction pass
12
+ # (HTML→Markdown, or whatever plug-in extractors are registered):
13
+ # bodies are returned verbatim. Useful for raw textual
13
14
  # data — JSON APIs, CSV files, +robots.txt+, sitemaps, source files —
14
15
  # where any rendering pass would corrupt the payload.
15
16
  module Fetch
@@ -56,7 +57,7 @@ module Pikuri
56
57
  CACHE
57
58
  end
58
59
 
59
- # Download +url+ via {Tool::Scraper::Simple.fetch} and return the
60
+ # Download +url+ via {Tool::Scraper.fetch} and return the
60
61
  # response body verbatim, provided the content-type is one we deem
61
62
  # textual (any +text/*+, plus the formats listed in
62
63
  # {TEXTUAL_APPLICATION_TYPES}). Anything else — PDFs, images, other
@@ -100,16 +101,16 @@ module Pikuri
100
101
  # redirect-loop exhaustion, missing +Location+ on a 3xx, or a
101
102
  # non-textual content-type
102
103
  def self.download(url)
103
- fetched = Scraper::Simple.fetch(url)
104
+ fetched = Scraper.fetch(url)
104
105
  return fetched.body if textual?(fetched.content_type)
105
106
 
106
107
  raise Scraper::FetchError,
107
108
  "refused to fetch #{url}: content-type #{fetched.content_type.inspect} " \
108
- 'is not textual (use web_scrape for PDFs or rendered pages)'
109
+ 'is not textual (use web_scrape for rendered pages)'
109
110
  end
110
111
 
111
112
  # @param content_type [String] normalized content-type (no +charset+
112
- # parameter, lowercased) as produced by {Scraper::Simple.fetch}
113
+ # parameter, lowercased) as produced by {Scraper.fetch}
113
114
  # @return [Boolean] true when the content-type is +text/*+ or one
114
115
  # of {TEXTUAL_APPLICATION_TYPES}
115
116
  def self.textual?(content_type)
@@ -138,7 +139,7 @@ module Pikuri
138
139
  # Verbatim URL download tool. Thin wrapper over {Tool::Fetch.fetch}
139
140
  # that exposes it to the agent loop in OpenAI tool-call shape. Use for
140
141
  # raw textual payloads (JSON APIs, CSV files, +robots.txt+, source
141
- # files); use {Tool::WEB_SCRAPE} for rendered web pages or PDFs where
142
+ # files); use {Tool::WEB_SCRAPE} for rendered web pages where
142
143
  # readability extraction makes the result usable.
143
144
  #
144
145
  # @return [Tool]
@@ -149,7 +150,7 @@ module Pikuri
149
150
 
150
151
  Usage:
151
152
  - Use for raw textual payloads: JSON APIs, CSV files, robots.txt, sitemaps, source files — anywhere a rendering pass would corrupt the data.
152
- - For rendered HTML pages or PDFs, use web_scrape — it extracts readable content; fetch returns the raw HTML/PDF bytes unchanged.
153
+ - For rendered HTML pages, use web_scrape — it extracts readable content; fetch returns the raw HTML bytes unchanged.
153
154
  - Accepts text/* and common textual application/* types (JSON, XML, JS, XHTML, RSS, Atom). Refuses PDFs, images, and other binaries.
154
155
  DESC
155
156
  parameters: Parameters.build { |p|
@@ -0,0 +1,186 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'faraday'
4
+ require 'stringio'
5
+ require 'uri'
6
+
7
+ module Pikuri
8
+ class Tool
9
+ # HTTP side of the web tools ({Tool::WEB_SCRAPE} and {Tool::FETCH}):
10
+ # GET the URL with a real-browser User-Agent, follow redirects, and
11
+ # hand the response body to {Pikuri::Extractor.extract} with the
12
+ # response's +Content-Type+ as the hint. HTML/XHTML render via
13
+ # {Extractor::HTML}, any other +text/*+ type passes through
14
+ # verbatim, and plug-in extractors extend the set (with pikuri-pdf
15
+ # registered, +application/pdf+ extracts — by header or by +%PDF-+
16
+ # magic, so a PDF served under a lying header still works); the
17
+ # remaining types raise {FetchError} so the LLM observes the
18
+ # failure instead of receiving an empty rendering.
19
+ #
20
+ # Split into a thin HTTP fetch ({.fetch}) and the extraction
21
+ # wrapper ({.visit}) so tests can drive each piece in isolation and
22
+ # {Tool::Fetch} can reuse the HTTP half without the extraction
23
+ # pass. Nothing here knows about the LLM; the tools that wrap this
24
+ # module own caching and truncation and turn rendered Markdown (or
25
+ # {FetchError}) into the next observation.
26
+ module Scraper
27
+ # Raised when a URL cannot be rendered into Markdown text — HTTP
28
+ # non-2xx, network failure, redirect-loop, missing +Location+,
29
+ # unsupported content-type, or a parse failure that reads as "try
30
+ # a different URL" to the LLM. Catching this in
31
+ # {Tool::WEB_SCRAPE} / {Tool::FETCH} turns the failure into an
32
+ # +"Error: ..."+ observation; anything else bubbles up so genuine
33
+ # bugs stay visible.
34
+ class FetchError < StandardError; end
35
+
36
+ # @return [String] User-Agent sent with each request; many sites
37
+ # reject requests with no UA or an obvious bot UA
38
+ USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \
39
+ '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
40
+ # @return [String] +Accept+ header sent with each request, so
41
+ # servers that content-negotiate hand back something we can use:
42
+ # rendered HTML first, +application/pdf+ for hosts with a PDF
43
+ # extractor registered, then any +text/*+ for the verbatim
44
+ # pass-through arm.
45
+ ACCEPT = 'text/html,application/xhtml+xml,application/pdf,text/*;q=0.8'
46
+ # @return [Integer] maximum number of HTTP redirects to follow
47
+ # before giving up
48
+ MAX_REDIRECTS = 5
49
+ # @return [Integer] connect timeout in seconds for the underlying
50
+ # Faraday request
51
+ OPEN_TIMEOUT = 10
52
+ # @return [Integer] read timeout in seconds for the underlying
53
+ # Faraday request
54
+ READ_TIMEOUT = 20
55
+
56
+ # @return [Integer] maximum number of characters of an error
57
+ # response body to include in a {FetchError} message. The body is
58
+ # often a multi-kilobyte HTML challenge page (Cloudflare, WAF
59
+ # interstitial, etc.); a short excerpt tells the LLM what kind of
60
+ # page came back without flooding the next observation.
61
+ ERROR_BODY_EXCERPT = 200
62
+
63
+ # Result of a successful {Scraper.fetch}: the response body, the
64
+ # normalized content-type (lower-cased, with any +; charset=...+
65
+ # parameters stripped), and the final URL after redirects.
66
+ Fetched = Data.define(:body, :content_type, :url)
67
+
68
+ # Fetch +url+ and render its main content as Markdown.
69
+ #
70
+ # No caching here — every call hits the network. Callers that want
71
+ # to memoize results should wrap this method themselves (see
72
+ # {Tool::WebScrape.visit}, which does exactly that).
73
+ #
74
+ # The extracted output is +String#strip+'d so the LLM never sees
75
+ # a body that opens or closes with blank lines — common with
76
+ # extracted PDFs' page-feed whitespace and with text bodies that
77
+ # carry a trailing newline. Interior whitespace is preserved
78
+ # because Markdown paragraph breaks and source-code indentation
79
+ # are load-bearing.
80
+ #
81
+ # @param url [String] absolute HTTP(S) URL of the page to download
82
+ # @return [String] full Markdown representation of the page with
83
+ # leading/trailing whitespace trimmed, uncapped otherwise —
84
+ # caller is responsible for any size limiting before feeding
85
+ # the result back to the LLM
86
+ # @raise [FetchError] on HTTP non-2xx, network failure, redirect
87
+ # loop, a 3xx without a +Location+ header, a response no
88
+ # extractor recognizes, or an extraction failure (malformed
89
+ # PDF, ...)
90
def self.visit(url)
  # Download first, then hand the response to the extraction pass.
  response = fetch(url)
  rendered = extract(response)
  # Trim only the outer whitespace; interior whitespace is left alone.
  rendered.strip
end
93
+
94
+ # Render a {Fetched} response as Markdown via
95
+ # {Pikuri::Extractor.extract}, re-raising both extraction failure
96
+ # modes as {FetchError} — the single exception type the web tools
97
+ # rescue. The content-type is passed verbatim (including the +""+
98
+ # of a missing header, which matches no text arm — a body without
99
+ # transport metadata is refused, not sniffed; only a strong magic
100
+ # sniff like pikuri-pdf's +%PDF-+ overrides a wrong or missing
101
+ # header, because such a sniff never misfires on text).
102
+ #
103
+ # @param fetched [Fetched]
104
+ # @return [String] Markdown representation produced by the
105
+ # matched extractor
106
+ # @raise [FetchError] when no extractor matches the response's
107
+ # content-type, or when extraction fails
108
def self.extract(fetched)
  # The extractor reads from an IO, so wrap the in-memory body.
  source = StringIO.new(fetched.body)
  Pikuri::Extractor.extract(source, content_type: fetched.content_type)
rescue Pikuri::Extractor::Unsupported
  # No extractor claimed this response: report the content-type and URL.
  raise FetchError, "unsupported content-type #{fetched.content_type.inspect} for #{fetched.url}"
rescue Pikuri::Extractor::Error => e
  # Extraction itself failed: pass the extractor's message through.
  raise FetchError, e.message
end
115
+
116
+ # Download the body of +url+, manually following up to
117
+ # {MAX_REDIRECTS} redirects. Faraday is configured with no
118
+ # middleware so behavior here mirrors the rest of the codebase
119
+ # (see +Tool::Search::DuckDuckGo.search+).
120
+ #
121
+ # All recoverable failures — HTTP 4xx/5xx, +Faraday::Error+ network
122
+ # blips, exhausted redirect budget, 3xx without a +Location+ —
123
+ # surface as {FetchError} so the caller has a single exception type
124
+ # to rescue. Error bodies are trimmed to {ERROR_BODY_EXCERPT}
125
+ # characters with whitespace collapsed, so a Cloudflare-challenge
126
+ # response doesn't dump kilobytes of inline HTML into the next LLM
127
+ # observation.
128
+ #
129
+ # @param url [String] absolute HTTP(S) URL to fetch
130
+ # @param limit [Integer] redirects remaining; recurses with
131
+ # +limit - 1+ on each 3xx
132
+ # @return [Fetched] body, normalized content-type, and final URL
133
+ # after redirects
134
+ # @raise [FetchError] on non-2xx/3xx responses, network errors,
135
+ # redirect-loop exhaustion, or 3xx without a +Location+ header
136
def self.fetch(url, limit: MAX_REDIRECTS)
  # NOTE(review): the budget is checked before the request is sent, so the
  # initial request consumes one unit of +limit+. With the default budget
  # the fifth consecutive 3xx raises, i.e. at most MAX_REDIRECTS - 1
  # redirects are actually followed. Confirm this is the intended meaning.
  raise FetchError, "too many redirects fetching #{url}" if limit.zero?

  response = begin
    connection = Faraday.new(request: { open_timeout: OPEN_TIMEOUT, timeout: READ_TIMEOUT })
    connection.get(url) do |req|
      req.headers['User-Agent'] = USER_AGENT
      req.headers['Accept'] = ACCEPT
    end
  rescue Faraday::Error => e
    # Report the bare error class name (namespace dropped) with the URL.
    raise FetchError, "#{e.class.name.split('::').last} fetching #{url}: #{e.message}"
  end

  case response.status
  when 200..299
    Fetched.new(body: response.body, content_type: normalize_content_type(response.headers['content-type']), url: url)
  when 300..399
    location = response.headers['location']
    raise FetchError, "HTTP #{response.status} from #{url} with no Location header" if location.nil? || location.empty?

    # The Location value is server-controlled and may be relative or
    # malformed. URI.join raises URI::Error subclasses on bad input;
    # surface those as FetchError so callers keep a single exception
    # type to rescue.
    target = begin
      URI.join(url, location).to_s
    rescue URI::Error => e
      raise FetchError, "HTTP #{response.status} from #{url} with invalid Location #{location.inspect}: #{e.message}"
    end

    fetch(target, limit: limit - 1)
  else
    raise FetchError, "HTTP #{response.status} fetching #{url}: #{excerpt(response.body)}"
  end
end
160
+
161
+ # Lower-case +raw+ and strip any +; charset=...+ parameters so the
162
+ # extractors can match on a canonical token.
163
+ #
164
+ # @param raw [String, nil] raw +Content-Type+ header value
165
+ # @return [String] normalized content-type, or +""+ when the
166
+ # header was missing
167
def self.normalize_content_type(raw)
  # Keep only the media type in front of the first ';' (nil when the
  # header is missing or empty), then canonicalize spacing and case.
  media_type, = raw.to_s.split(';')
  media_type.to_s.strip.downcase
end
170
+ private_class_method :normalize_content_type
171
+
172
+ # Whitespace-collapse +body+ and clip to {ERROR_BODY_EXCERPT}
173
+ # characters, so the {FetchError} message stays a single readable
174
+ # line even when the server returned a multi-KB HTML challenge
175
+ # page.
176
+ #
177
+ # @param body [String, nil]
178
+ # @return [String]
179
def self.excerpt(body)
  # Collapse every whitespace run to one space so the excerpt is one line.
  flattened = body.to_s.gsub(/\s+/, ' ').strip
  return flattened if flattened.length <= ERROR_BODY_EXCERPT

  # Over the cap: keep the head and mark the cut with an ellipsis.
  "#{flattened[0, ERROR_BODY_EXCERPT]}..."
end
183
+ private_class_method :excerpt
184
+ end
185
+ end
186
+ end
@@ -3,7 +3,7 @@
3
3
  module Pikuri
4
4
  class Tool
5
5
  # Truncation policy and Tool spec for the +web_scrape+ tool. The actual
6
- # scraping lives in {Tool::Scraper::Simple}; this module is a thin
6
+ # scraping lives in {Tool::Scraper}; this module is a thin
7
7
  # wrapper that picks the scraper, applies a character cap so the LLM
8
8
  # doesn't drown in long-form content, and exposes the result to the
9
9
  # agent loop in OpenAI tool-call shape.
@@ -37,7 +37,7 @@ module Pikuri
37
37
  CACHE
38
38
  end
39
39
 
40
- # Fetch +url+ via {Tool::Scraper::Simple} and truncate the rendered
40
+ # Fetch +url+ via {Tool::Scraper} and truncate the rendered
41
41
  # Markdown to +max_chars+ characters.
42
42
  #
43
43
  # The full extracted Markdown is cached on disk via {.cache}, keyed
@@ -65,7 +65,7 @@ module Pikuri
65
65
  # truncated, or +"Error: ..."+ on a recoverable fetch failure
66
66
  def self.visit(url, max_chars: DEFAULT_MAX_CHARS)
67
67
  max_chars = max_chars.clamp(1, MAX_MAX_CHARS)
68
- markdown = cache.fetch(url) { Scraper::Simple.visit(url) }
68
+ markdown = cache.fetch(url) { Scraper.visit(url) }
69
69
  truncate(markdown, max_chars)
70
70
  rescue Scraper::FetchError => e
71
71
  "Error: #{e.message}"
@@ -95,10 +95,10 @@ module Pikuri
95
95
  WEB_SCRAPE = new(
96
96
  name: 'web_scrape',
97
97
  description: <<~DESC,
98
- Scrapes the rendered webpage, PDF, or text file at the given URL and returns its main content as Markdown.
98
+ Scrapes the rendered webpage or text file at the given URL and returns its main content as Markdown.
99
99
 
100
100
  Usage:
101
- - Use for HTML pages or PDFs where you want readable content — readability extraction strips nav, sidebars, and boilerplate.
101
+ - Use for HTML pages where you want readable content — readability extraction strips nav, sidebars, and boilerplate.
102
102
  - For raw textual payloads (JSON, CSV, robots.txt, source files), use fetch instead — it returns bytes verbatim, while web_scrape would corrupt them with a Markdown pass.
103
103
  - A Single Page App may return very little or no content. Do NOT retry with a larger max_chars; try a different URL instead.
104
104
  DESC
@@ -6,5 +6,5 @@ module Pikuri
6
6
  # additions to the public surface (+Pikuri::Tool+ / +Pikuri::Agent+ /
7
7
  # listeners / bundled tools), major for breaking changes to that
8
8
  # surface or to the +bin/pikuri-*+ CLIs.
9
- VERSION = '0.0.5'
9
+ VERSION = '0.0.6'
10
10
  end
data/lib/pikuri-core.rb CHANGED
@@ -169,7 +169,6 @@ module Pikuri
169
169
  Loader.ignore(File.expand_path('pikuri/version.rb', __dir__))
170
170
  Loader.inflector.inflect(
171
171
  'html' => 'HTML',
172
- 'pdf' => 'PDF',
173
172
  'duckduckgo' => 'DuckDuckGo'
174
173
  )
175
174
  Loader.setup