pikuri-core 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/pikuri/agent/listener/terminal.rb +18 -36
- data/lib/pikuri/extractor/html.rb +303 -0
- data/lib/pikuri/extractor/passthrough.rb +64 -0
- data/lib/pikuri/extractor.rb +314 -0
- data/lib/pikuri/file_type.rb +74 -266
- data/lib/pikuri/subprocess.rb +73 -2
- data/lib/pikuri/tool/calculator.rb +213 -41
- data/lib/pikuri/tool/fetch.rb +10 -9
- data/lib/pikuri/tool/scraper.rb +186 -0
- data/lib/pikuri/tool/web_scrape.rb +5 -5
- data/lib/pikuri/version.rb +1 -1
- data/lib/pikuri-core.rb +0 -1
- metadata +5 -61
- data/lib/pikuri/tool/scraper/fetch_error.rb +0 -16
- data/lib/pikuri/tool/scraper/html.rb +0 -285
- data/lib/pikuri/tool/scraper/pdf.rb +0 -54
- data/lib/pikuri/tool/scraper/simple.rb +0 -183
|
@@ -1,63 +1,235 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require 'dentaku'
|
|
4
|
-
|
|
5
3
|
module Pikuri
|
|
6
4
|
class Tool
|
|
7
|
-
# Evaluates a basic arithmetic expression
|
|
8
|
-
#
|
|
9
|
-
#
|
|
5
|
+
# Evaluates a basic arithmetic expression with Python operator
|
|
6
|
+
# syntax and semantics, via the hand-rolled recursive-descent
|
|
7
|
+
# {Parser} below.
|
|
8
|
+
#
|
|
9
|
+
# Why hand-rolled rather than a gem: the previous backend, dentaku,
|
|
10
|
+
# pulled in concurrent-ruby (~16k lines of Ruby — the single
|
|
11
|
+
# heaviest audit item in pikuri's whole dependency closure) plus
|
|
12
|
+
# bigdecimal and tsort, all to evaluate four-function arithmetic.
|
|
13
|
+
# The ~100 lines here implement Python's expression grammar
|
|
14
|
+
# directly, which also retires the old preprocessing step that
|
|
15
|
+
# rewrote Python's +**+ into dentaku's +^+ dialect — the model's
|
|
16
|
+
# native syntax is now simply the grammar.
|
|
10
17
|
#
|
|
11
|
-
# Scope is intentionally narrow: operators (+, -, *, /,
|
|
12
|
-
# parentheses, and
|
|
13
|
-
# booleans — those would mean
|
|
14
|
-
# specifically want to avoid
|
|
18
|
+
# Scope is intentionally narrow: operators (+, -, *, /, //, %, **),
|
|
19
|
+
# unary minus, parentheses, and integer / decimal / e-notation
|
|
20
|
+
# literals. No variables, functions, or booleans — those would mean
|
|
21
|
+
# teaching the model a dialect, which we specifically want to avoid
|
|
22
|
+
# for this tool.
|
|
15
23
|
module Calculator
  # Raised internally for anything {.calculate} should hand back to
  # the model as an +"Error: ..."+ observation rather than crash the
  # agent loop: parse failures, division by zero, complex or
  # non-finite results.
  class Error < StandardError; end

  # Evaluate +expression+ and return the result formatted as a String.
  # Parse and arithmetic failures (division by zero, overflow to
  # infinity, complex results) are caught and returned as
  # +"Error: ..."+ strings so the model can read the failure as the
  # next observation and self-correct.
  #
  # @param expression [String] Python-syntax arithmetic expression
  # @return [String] numeric result, or +"Error: ..."+ on failure
  def self.calculate(expression)
    result = Parser.new(expression).parse
    if result.is_a?(Float) && !result.finite?
      raise Error, "result of #{expression.inspect} is not a finite number"
    end

    format_result(result)
  rescue Error => e
    "Error: #{e.message}"
  end

  # Integers pass through exact. Floats are rounded to 3 places, and
  # whole-valued floats drop the trailing +.0+ (+4 / 2+ renders as
  # +"2"+, not +"2.0"+).
  #
  # @param result [Integer, Float] a finite value
  # @return [String]
  def self.format_result(result)
    return result.to_s if result.is_a?(Integer)

    rounded = result.round(3)
    rounded == rounded.truncate ? rounded.truncate.to_s : rounded.to_s
  end
  private_class_method :format_result

  # Recursive-descent parser-evaluator for Python's arithmetic
  # expression grammar:
  #
  #   additive       := multiplicative (('+' | '-') multiplicative)*
  #   multiplicative := unary (('*' | '/' | '//' | '%') unary)*
  #   unary          := ('+' | '-') unary | power
  #   power          := atom ('**' unary)?
  #   atom           := NUMBER | '(' additive ')'
  #
  # Every failure the model can provoke is raised as {Error}, the
  # only exception {Calculator.calculate} rescues.
  class Parser
    # One number or operator. +**+ / +//+ are listed before their
    # single-character prefixes so the two-character operators win;
    # +\G+ anchors each match at the scan position so nothing between
    # tokens goes unnoticed.
    TOKEN_RE = %r{\G\s*(\*\*|//|\d+(?:\.\d*)?(?:[eE][+-]?\d+)?|\.\d+(?:[eE][+-]?\d+)?|[-+*/%()])}

    # @param expression [String] raw expression as the model wrote it
    # @raise [Error] when +expression+ contains a character no token matches
    def initialize(expression)
      @tokens = tokenize(expression)
      @pos = 0
    end

    # Parse and evaluate the whole token stream.
    #
    # @return [Integer, Float] the value of the expression
    # @raise [Error] on syntax errors, division by zero, or a complex result
    def parse
      value = additive
      raise Error, "unexpected #{peek.inspect} after expression" if peek

      value
    end

    private

    # @param expression [String]
    # @return [Array<String>] token strings in source order
    # @raise [Error] naming the first character no token matches
    def tokenize(expression)
      tokens = []
      pos = 0
      while (match = TOKEN_RE.match(expression, pos))
        tokens << match[1]
        pos = match.end(0)
      end
      rest = expression[pos..].to_s.strip
      raise Error, "unexpected character #{rest[0].inspect} in #{expression.inspect}" unless rest.empty?

      tokens
    end

    def additive
      value = multiplicative
      while (op = accept('+', '-'))
        rhs = multiplicative
        value = op == '+' ? value + rhs : value - rhs
      end
      value
    end

    def multiplicative
      value = unary
      while (op = accept('*', '/', '//', '%'))
        value = apply_multiplicative(op, value, unary)
      end
      value
    end

    # Sits above +power+, which is what makes +-2**2+ evaluate to -4.
    def unary
      op = accept('+', '-')
      return power unless op

      value = unary
      op == '-' ? -value : value
    end

    # The right operand recurses into +unary+, making +**+
    # right-associative and letting a sign follow it (+2**-3+).
    def power
      base = atom
      return base unless accept('**')

      apply_power(base, unary)
    end

    def atom
      return parenthesized if accept('(')

      token = peek
      unless token&.match?(/\A[.\d]/)
        raise Error, token ? "unexpected #{token.inspect}" : 'unexpected end of expression'
      end

      @pos += 1
      number_value(token)
    end

    # Convert a NUMBER token to its value: Float when it carries a
    # decimal point or exponent, Integer otherwise.
    #
    # The token is first padded to a fully spelled-out literal
    # (+.5+ becomes +0.5+, +5.e3+ becomes +5.0e3+). +String#to_f+
    # stops reading at a dot that is not followed by a digit, so an
    # unpadded +5.e3+ would lose its exponent.
    #
    # @param token [String] a token matched by the number arms of {TOKEN_RE}
    # @return [Integer, Float]
    def number_value(token)
      return token.to_i unless token.match?(/[.eE]/)

      token.sub(/\A\./, '0.').sub(/\.(?!\d)/, '.0').to_f
    end

    def parenthesized
      value = additive
      raise Error, 'missing closing parenthesis' unless accept(')')

      value
    end

    # @return [String, nil] the next token without consuming it
    def peek
      @tokens[@pos]
    end

    # Consume and return the next token if it is one of +expected+.
    #
    # @return [String, nil] the consumed token, or nil on no match
    def accept(*expected)
      token = peek
      return nil unless expected.include?(token)

      @pos += 1
      token
    end

    # +/+ is true division (always Float), +//+ floors, and +%+
    # delegates to Ruby's +%+. A zero divisor is rejected up front for
    # all three so Integer and Float operands fail the same way.
    #
    # @raise [Error] on division by zero or a non-finite floor division
    def apply_multiplicative(op, lhs, rhs)
      return lhs * rhs if op == '*'
      raise Error, 'division by zero' if rhs.zero?

      case op
      when '/' then lhs.fdiv(rhs)
      when '//' then floor_divide(lhs, rhs)
      when '%' then lhs % rhs
      end
    end

    # Floor division. Two Integers stay exact via +Integer#/+, which
    # already floors. With a Float operand the quotient comes from
    # +divmod+ rather than from flooring the rounded true quotient:
    # +1 // 0.1+ is 9, whereas +(1 / 0.1).floor+ is 10 because the
    # division itself rounds up to 10.0.
    #
    # @return [Integer]
    # @raise [Error] when an operand is infinite or NaN, which
    #   +divmod+ reports as FloatDomainError
    def floor_divide(lhs, rhs)
      return lhs / rhs if lhs.is_a?(Integer) && rhs.is_a?(Integer)

      lhs.divmod(rhs).first
    rescue FloatDomainError
      raise Error, "(#{lhs}) // (#{rhs}) is not a finite number"
    end

    # Python-compatibility shims over Ruby's +**+:
    #
    # * a zero base under a negative exponent is a division by zero
    #   (Ruby would quietly return Infinity for a Float operand);
    # * an Integer raised to a negative Integer yields a Float (Ruby
    #   would return a Rational);
    # * a Complex result — negative base under a fractional exponent —
    #   is rejected.
    #
    # @raise [Error] for each rejected case above, and when Ruby
    #   refuses to compute an oversized Integer power
    def apply_power(base, exponent)
      raise Error, 'division by zero' if base.zero? && exponent.negative?
      return base.to_f**exponent if base.is_a?(Integer) && exponent.is_a?(Integer) && exponent.negative?

      result = begin
        base**exponent
      rescue ArgumentError
        # NOTE(review): newer Rubies are believed to raise here for
        # extremely large Integer powers instead of returning
        # Infinity — confirm the exception class against the Ruby
        # versions this gem supports.
        raise Error, "(#{base})**(#{exponent}) is too large to compute"
      end
      if result.is_a?(Complex)
        raise Error, "(#{base})**(#{exponent}) is a complex number; only real arithmetic is supported"
      end

      result
    end
  end
end
|
|
56
228
|
|
|
57
229
|
# Arithmetic-evaluation tool backed by {Tool::Calculator.calculate}.
|
|
58
|
-
# Accepts Python
|
|
59
|
-
# exponentiation,
|
|
60
|
-
# syntax it already knows.
|
|
230
|
+
# Accepts Python expression syntax (+, -, *, /, //, %, ** for
|
|
231
|
+
# exponentiation, unary minus, parentheses, decimals) so the model
|
|
232
|
+
# can emit the syntax it already knows.
|
|
61
233
|
#
|
|
62
234
|
# @return [Tool]
|
|
63
235
|
CALCULATOR = new(
|
|
@@ -67,7 +239,7 @@ module Pikuri
|
|
|
67
239
|
|
|
68
240
|
Usage:
|
|
69
241
|
- Use this for any arithmetic beyond simple mental math — do not eyeball multi-digit work.
|
|
70
|
-
-
|
|
242
|
+
- Python expression syntax: +, -, *, / (true division), // (floor division), % (modulo), ** (exponentiation), unary minus, parentheses, decimal numbers.
|
|
71
243
|
- Decimal results are rounded to 3 places; integer results are exact.
|
|
72
244
|
- Failures (parse error, division by zero) come back as "Error: ..." — read the message and re-call with a corrected expression.
|
|
73
245
|
DESC
|
data/lib/pikuri/tool/fetch.rb
CHANGED
|
@@ -3,13 +3,14 @@
|
|
|
3
3
|
module Pikuri
|
|
4
4
|
class Tool
|
|
5
5
|
# Truncation policy and Tool spec for the +fetch+ tool. The HTTP work
|
|
6
|
-
# lives in {Tool::Scraper
|
|
6
|
+
# lives in {Tool::Scraper.fetch}; this module is a thin
|
|
7
7
|
# wrapper that accepts only textual content-types, applies a character
|
|
8
8
|
# cap so the LLM doesn't drown in long-form bodies, and exposes the
|
|
9
9
|
# result to the agent loop in OpenAI tool-call shape.
|
|
10
10
|
#
|
|
11
|
-
# Sister of {Tool::WebScrape}, but
|
|
12
|
-
#
|
|
11
|
+
# Sister of {Tool::WebScrape}, but with no extraction pass
|
|
12
|
+
# (HTML→Markdown, or whatever plug-in extractors are registered):
|
|
13
|
+
# bodies are returned verbatim. Useful for raw textual
|
|
13
14
|
# data — JSON APIs, CSV files, +robots.txt+, sitemaps, source files —
|
|
14
15
|
# where any rendering pass would corrupt the payload.
|
|
15
16
|
module Fetch
|
|
@@ -56,7 +57,7 @@ module Pikuri
|
|
|
56
57
|
CACHE
|
|
57
58
|
end
|
|
58
59
|
|
|
59
|
-
# Download +url+ via {Tool::Scraper
|
|
60
|
+
# Download +url+ via {Tool::Scraper.fetch} and return the
|
|
60
61
|
# response body verbatim, provided the content-type is one we deem
|
|
61
62
|
# textual (any +text/*+, plus the formats listed in
|
|
62
63
|
# {TEXTUAL_APPLICATION_TYPES}). Anything else — PDFs, images, other
|
|
@@ -100,16 +101,16 @@ module Pikuri
|
|
|
100
101
|
# redirect-loop exhaustion, missing +Location+ on a 3xx, or a
|
|
101
102
|
# non-textual content-type
|
|
102
103
|
# Download +url+ and hand back its body untouched, but only when the
# response's normalized content-type passes {.textual?}.
#
# @param url [String] URL to download
# @return [String] the response body, verbatim
# @raise [Scraper::FetchError] when the fetch itself fails or the
#   content-type is not textual
def self.download(url)
  response = Scraper.fetch(url)
  content_type = response.content_type
  unless textual?(content_type)
    raise Scraper::FetchError,
          "refused to fetch #{url}: content-type #{content_type.inspect} " \
          'is not textual (use web_scrape for rendered pages)'
  end

  response.body
end
|
|
110
111
|
|
|
111
112
|
# @param content_type [String] normalized content-type (no +charset+
|
|
112
|
-
# parameter, lowercased) as produced by {Scraper
|
|
113
|
+
# parameter, lowercased) as produced by {Scraper.fetch}
|
|
113
114
|
# @return [Boolean] true when the content-type is +text/*+ or one
|
|
114
115
|
# of {TEXTUAL_APPLICATION_TYPES}
|
|
115
116
|
def self.textual?(content_type)
|
|
@@ -138,7 +139,7 @@ module Pikuri
|
|
|
138
139
|
# Verbatim URL download tool. Thin wrapper over {Tool::Fetch.fetch}
|
|
139
140
|
# that exposes it to the agent loop in OpenAI tool-call shape. Use for
|
|
140
141
|
# raw textual payloads (JSON APIs, CSV files, +robots.txt+, source
|
|
141
|
-
# files); use {Tool::WEB_SCRAPE} for rendered web pages
|
|
142
|
+
# files); use {Tool::WEB_SCRAPE} for rendered web pages where
|
|
142
143
|
# readability extraction makes the result usable.
|
|
143
144
|
#
|
|
144
145
|
# @return [Tool]
|
|
@@ -149,7 +150,7 @@ module Pikuri
|
|
|
149
150
|
|
|
150
151
|
Usage:
|
|
151
152
|
- Use for raw textual payloads: JSON APIs, CSV files, robots.txt, sitemaps, source files — anywhere a rendering pass would corrupt the data.
|
|
152
|
-
- For rendered HTML pages
|
|
153
|
+
- For rendered HTML pages, use web_scrape — it extracts readable content; fetch returns the raw HTML bytes unchanged.
|
|
153
154
|
- Accepts text/* and common textual application/* types (JSON, XML, JS, XHTML, RSS, Atom). Refuses PDFs, images, and other binaries.
|
|
154
155
|
DESC
|
|
155
156
|
parameters: Parameters.build { |p|
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'faraday'
|
|
4
|
+
require 'stringio'
|
|
5
|
+
require 'uri'
|
|
6
|
+
|
|
7
|
+
module Pikuri
|
|
8
|
+
class Tool
|
|
9
|
+
# HTTP side of the web tools ({Tool::WEB_SCRAPE} and {Tool::FETCH}):
|
|
10
|
+
# GET the URL with a real-browser User-Agent, follow redirects, and
|
|
11
|
+
# hand the response body to {Pikuri::Extractor.extract} with the
|
|
12
|
+
# response's +Content-Type+ as the hint. HTML/XHTML render via
|
|
13
|
+
# {Extractor::HTML}, any other +text/*+ type passes through
|
|
14
|
+
# verbatim, and plug-in extractors extend the set (with pikuri-pdf
|
|
15
|
+
# registered, +application/pdf+ extracts — by header or by +%PDF-+
|
|
16
|
+
# magic, so a PDF served under a lying header still works); the
|
|
17
|
+
# remaining types raise {FetchError} so the LLM observes the
|
|
18
|
+
# failure instead of receiving an empty rendering.
|
|
19
|
+
#
|
|
20
|
+
# Split into a thin HTTP fetch ({.fetch}) and the extraction
|
|
21
|
+
# wrapper ({.visit}) so tests can drive each piece in isolation and
|
|
22
|
+
# {Tool::Fetch} can reuse the HTTP half without the extraction
|
|
23
|
+
# pass. Nothing here knows about the LLM; the tools that wrap this
|
|
24
|
+
# module own caching and truncation and turn rendered Markdown (or
|
|
25
|
+
# {FetchError}) into the next observation.
|
|
26
|
+
module Scraper
  # Raised when a URL cannot be rendered into Markdown text — HTTP
  # non-2xx, network failure, redirect-loop, missing or unusable
  # +Location+, malformed URL, unsupported content-type, or an
  # extraction failure. This is the single exception type callers of
  # this module need to rescue.
  class FetchError < StandardError; end

  # @return [String] User-Agent sent with each request
  USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \
               '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
  # @return [String] +Accept+ header sent with each request
  ACCEPT = 'text/html,application/xhtml+xml,application/pdf,text/*;q=0.8'
  # @return [Integer] redirect budget handed to {.fetch} by default
  MAX_REDIRECTS = 5
  # @return [Integer] connect timeout in seconds
  OPEN_TIMEOUT = 10
  # @return [Integer] read timeout in seconds
  READ_TIMEOUT = 20

  # @return [Integer] maximum number of characters of an error
  #   response body to include in a {FetchError} message
  ERROR_BODY_EXCERPT = 200

  # Result of a successful {.fetch}: the response body, the
  # normalized content-type (lower-cased, parameters stripped), and
  # the final URL after redirects.
  Fetched = Data.define(:body, :content_type, :url)

  # Fetch +url+ and render its content as Markdown, with leading and
  # trailing whitespace stripped. No caching and no size cap here.
  #
  # @param url [String] absolute HTTP(S) URL of the page to download
  # @return [String] extracted text, stripped at both ends
  # @raise [FetchError] on any failure of {.fetch} or {.extract}
  def self.visit(url)
    extract(fetch(url)).strip
  end

  # Render a {Fetched} response via {Pikuri::Extractor.extract},
  # re-raising both extractor failure modes as {FetchError}. The
  # content-type is passed through exactly as {.fetch} normalized it.
  #
  # @param fetched [Fetched]
  # @return [String] whatever the matched extractor produced
  # @raise [FetchError] when no extractor matches the response's
  #   content-type, or when extraction fails
  def self.extract(fetched)
    Pikuri::Extractor.extract(StringIO.new(fetched.body), content_type: fetched.content_type)
  rescue Pikuri::Extractor::Unsupported
    raise FetchError, "unsupported content-type #{fetched.content_type.inspect} for #{fetched.url}"
  rescue Pikuri::Extractor::Error => e
    raise FetchError, e.message
  end

  # Download the body of +url+, following redirects by hand. Each 3xx
  # recurses with +limit - 1+, and a call that arrives with
  # +limit+ at zero raises instead of issuing a request.
  #
  # @param url [String] absolute HTTP(S) URL to fetch
  # @param limit [Integer] remaining budget; decremented on each 3xx
  # @return [Fetched] body, normalized content-type, and final URL
  # @raise [FetchError] on non-2xx/3xx responses, network errors, a
  #   malformed URL, an exhausted redirect budget, or a 3xx whose
  #   +Location+ is missing or cannot be resolved
  def self.fetch(url, limit: MAX_REDIRECTS)
    raise FetchError, "too many redirects fetching #{url}" if limit.zero?

    response = http_get(url)
    case response.status
    when 200..299
      Fetched.new(body: response.body, content_type: normalize_content_type(response.headers['content-type']), url: url)
    when 300..399
      location = response.headers['location']
      raise FetchError, "HTTP #{response.status} from #{url} with no Location header" if location.nil? || location.empty?

      fetch(resolve_redirect(url, location), limit: limit - 1)
    else
      raise FetchError, "HTTP #{response.status} fetching #{url}: #{excerpt(response.body)}"
    end
  end

  # Issue one GET with the module's headers and timeouts.
  #
  # URI parse errors are translated alongside Faraday's own errors:
  # they are not +Faraday::Error+ subclasses, so a malformed URL
  # would otherwise escape the callers' +rescue FetchError+.
  #
  # @param url [String]
  # @return [Object] the Faraday response (status, headers, body)
  # @raise [FetchError] on a network failure or a URL that cannot be parsed
  def self.http_get(url)
    Faraday.new(request: { open_timeout: OPEN_TIMEOUT, timeout: READ_TIMEOUT }).get(url) do |req|
      req.headers['User-Agent'] = USER_AGENT
      req.headers['Accept'] = ACCEPT
    end
  rescue Faraday::Error => e
    raise FetchError, "#{e.class.name.split('::').last} fetching #{url}: #{e.message}"
  rescue URI::Error => e
    raise FetchError, "invalid URL #{url.inspect}: #{e.message}"
  end
  private_class_method :http_get

  # Resolve a +Location+ header against the URL that produced it.
  # Servers do send unescaped or otherwise malformed values; those
  # make +URI.join+ raise, which is reported as a {FetchError}.
  #
  # @param url [String] URL that answered with the redirect
  # @param location [String] raw +Location+ header value
  # @return [String] absolute redirect target
  # @raise [FetchError] when +location+ cannot be resolved
  def self.resolve_redirect(url, location)
    URI.join(url, location).to_s
  rescue URI::Error => e
    raise FetchError, "HTTP redirect from #{url} has an unusable Location #{location.inspect}: #{e.message}"
  end
  private_class_method :resolve_redirect

  # Lower-case +raw+ and drop everything from the first +;+ on.
  #
  # @param raw [String, nil] raw +Content-Type+ header value
  # @return [String] normalized content-type, or +""+ when the
  #   header was missing
  def self.normalize_content_type(raw)
    raw.to_s.split(';').first.to_s.strip.downcase
  end
  private_class_method :normalize_content_type

  # Whitespace-collapse +body+ and clip it to {ERROR_BODY_EXCERPT}
  # characters so a {FetchError} message stays one readable line.
  #
  # Invalid byte sequences are scrubbed first: running the regexp
  # over a string that is not valid in its encoding raises
  # ArgumentError, which would replace the HTTP error being reported.
  #
  # @param body [String, nil]
  # @return [String]
  def self.excerpt(body)
    text = body.to_s.scrub.gsub(/\s+/, ' ').strip
    text.length > ERROR_BODY_EXCERPT ? "#{text[0, ERROR_BODY_EXCERPT]}..." : text
  end
  private_class_method :excerpt
end
|
|
185
|
+
end
|
|
186
|
+
end
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
module Pikuri
|
|
4
4
|
class Tool
|
|
5
5
|
# Truncation policy and Tool spec for the +web_scrape+ tool. The actual
|
|
6
|
-
# scraping lives in {Tool::Scraper
|
|
6
|
+
# scraping lives in {Tool::Scraper}; this module is a thin
|
|
7
7
|
# wrapper that picks the scraper, applies a character cap so the LLM
|
|
8
8
|
# doesn't drown in long-form content, and exposes the result to the
|
|
9
9
|
# agent loop in OpenAI tool-call shape.
|
|
@@ -37,7 +37,7 @@ module Pikuri
|
|
|
37
37
|
CACHE
|
|
38
38
|
end
|
|
39
39
|
|
|
40
|
-
# Fetch +url+ via {Tool::Scraper
|
|
40
|
+
# Fetch +url+ via {Tool::Scraper} and truncate the rendered
|
|
41
41
|
# Markdown to +max_chars+ characters.
|
|
42
42
|
#
|
|
43
43
|
# The full extracted Markdown is cached on disk via {.cache}, keyed
|
|
@@ -65,7 +65,7 @@ module Pikuri
|
|
|
65
65
|
# truncated, or +"Error: ..."+ on a recoverable fetch failure
|
|
66
66
|
def self.visit(url, max_chars: DEFAULT_MAX_CHARS)
|
|
67
67
|
max_chars = max_chars.clamp(1, MAX_MAX_CHARS)
|
|
68
|
-
markdown = cache.fetch(url) { Scraper
|
|
68
|
+
markdown = cache.fetch(url) { Scraper.visit(url) }
|
|
69
69
|
truncate(markdown, max_chars)
|
|
70
70
|
rescue Scraper::FetchError => e
|
|
71
71
|
"Error: #{e.message}"
|
|
@@ -95,10 +95,10 @@ module Pikuri
|
|
|
95
95
|
WEB_SCRAPE = new(
|
|
96
96
|
name: 'web_scrape',
|
|
97
97
|
description: <<~DESC,
|
|
98
|
-
Scrapes the rendered webpage
|
|
98
|
+
Scrapes the rendered webpage or text file at the given URL and returns its main content as Markdown.
|
|
99
99
|
|
|
100
100
|
Usage:
|
|
101
|
-
- Use for HTML pages
|
|
101
|
+
- Use for HTML pages where you want readable content — readability extraction strips nav, sidebars, and boilerplate.
|
|
102
102
|
- For raw textual payloads (JSON, CSV, robots.txt, source files), use fetch instead — it returns bytes verbatim, while web_scrape would corrupt them with a Markdown pass.
|
|
103
103
|
- A Single Page App may return very little or no content. Do NOT retry with a larger max_chars; try a different URL instead.
|
|
104
104
|
DESC
|
data/lib/pikuri/version.rb
CHANGED