pikuri-core 0.0.4 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/pikuri/agent/configurator.rb +9 -2
- data/lib/pikuri/agent/context_window_detector.rb +70 -10
- data/lib/pikuri/agent/control/interloper.rb +10 -2
- data/lib/pikuri/agent/event.rb +15 -0
- data/lib/pikuri/agent/extension.rb +37 -9
- data/lib/pikuri/agent/listener/terminal.rb +22 -36
- data/lib/pikuri/agent.rb +174 -73
- data/lib/pikuri/extractor/html.rb +303 -0
- data/lib/pikuri/extractor/passthrough.rb +64 -0
- data/lib/pikuri/extractor.rb +314 -0
- data/lib/pikuri/file_type.rb +87 -59
- data/lib/pikuri/finalizers.rb +118 -0
- data/lib/pikuri/paths.rb +29 -0
- data/lib/pikuri/subprocess.rb +109 -12
- data/lib/pikuri/tool/calculator.rb +213 -41
- data/lib/pikuri/tool/fetch.rb +10 -9
- data/lib/pikuri/tool/scraper.rb +186 -0
- data/lib/pikuri/tool/web_scrape.rb +5 -5
- data/lib/pikuri/version.rb +1 -1
- data/lib/pikuri-core.rb +0 -1
- metadata +8 -62
- data/lib/pikuri/tool/scraper/fetch_error.rb +0 -16
- data/lib/pikuri/tool/scraper/html.rb +0 -285
- data/lib/pikuri/tool/scraper/pdf.rb +0 -54
- data/lib/pikuri/tool/scraper/simple.rb +0 -183
|
@@ -1,54 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'pdf-reader'
|
|
4
|
-
require 'stringio'
|
|
5
|
-
|
|
6
|
-
module Pikuri
  class Tool
    module Scraper
      # Text extractor for PDF documents, invoked by {Simple.visit} when a
      # fetched response is served as +application/pdf+. It is a thin
      # wrapper over the +pdf-reader+ gem that visits each page, collects
      # the page text, and returns everything as one string the LLM can
      # read.
      #
      # Extraction is best-effort. Digitally produced PDFs (LaTeX, Word
      # export, ...) yield clean text; scanned documents yield nothing
      # useful because this path performs no OCR. When nothing can be
      # extracted the result is an empty string rather than an exception,
      # so the caller's cache stores a consistent value and the LLM sees
      # an empty observation it can react to.
      #
      # The module is a pure parser with no I/O: {.extract} maps PDF bytes
      # to text, so tests can feed it an in-memory fixture without
      # touching the network.
      module PDF
        # Convert +bytes+ to plain text, one paragraph per non-empty page.
        #
        # The typed +pdf-reader+ failures for documents it cannot parse —
        # broken xrefs ({::PDF::Reader::MalformedPDFError}), invalid page
        # references ({::PDF::Reader::InvalidPageError}) and encrypted/XFA
        # files ({::PDF::Reader::UnsupportedFeatureError}) — describe a
        # property of the document the LLM can act on ("try a different
        # URL"), so they are translated into {FetchError}, matching the
        # HTTP layer in {Simple.fetch}. Any other exception class is left
        # alone and propagates to the caller.
        #
        # @param bytes [String] raw PDF document (binary string)
        # @return [String] page texts joined by blank lines; possibly
        #   empty when the PDF carries no extractable text (scanned image,
        #   empty document)
        # @raise [FetchError] when +pdf-reader+ refuses the document
        def self.extract(bytes)
          document = ::PDF::Reader.new(StringIO.new(bytes))

          # Trim every page and keep only the ones that still have text.
          page_texts = document.pages.filter_map do |page|
            text = page.text.strip
            text unless text.empty?
          end

          page_texts.join("\n\n")
        rescue ::PDF::Reader::MalformedPDFError,
               ::PDF::Reader::InvalidPageError,
               ::PDF::Reader::UnsupportedFeatureError => e
          # Report only the unqualified class name to keep the message short.
          short_name = e.class.name.split('::').last
          raise FetchError, "PDF rendering failed: #{short_name}: #{e.message}"
        end
      end
    end
  end
end
|
|
@@ -1,183 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'faraday'
|
|
4
|
-
require 'uri'
|
|
5
|
-
|
|
6
|
-
module Pikuri
  class Tool
    # Namespace for the URL-to-Markdown scraping stack used by
    # {Tool::WEB_SCRAPE} and {Tool::FETCH}: a content-type-dispatching
    # fetcher ({Simple}), pure content extractors ({HTML}, {PDF}), and a
    # shared error type ({FetchError}). Nothing here knows about the LLM
    # — the tools that wrap these layers turn rendered Markdown (or
    # +FetchError+) into the next observation.
    module Scraper
      # Plain HTTP scraper: GET the URL with a real-browser User-Agent,
      # follow redirects, and dispatch the response body to the parser
      # matching its +Content-Type+. HTML and XHTML route to
      # {HTML.extract}; +application/pdf+ routes to {PDF.extract}; any
      # other +text/*+ type (plain text, Markdown, source files, …) is
      # passed through verbatim since the LLM can already read it; the
      # remaining types raise {FetchError} so the LLM observes the
      # failure instead of receiving an empty rendering.
      #
      # Split into a thin HTTP fetch ({.fetch}) and a content-type
      # dispatcher ({.visit}) so tests can drive each piece in isolation.
      # "Simple" because everything happens in one Faraday GET — no
      # headless browser, no JS execution.
      module Simple
        # @return [String] User-Agent sent with each request; many sites
        #   reject requests with no UA or an obvious bot UA
        USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \
                     '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        # @return [String] +Accept+ header sent with each request. Lists
        #   every content-type the dispatcher in {.visit} knows how to
        #   render, so servers that content-negotiate hand back something
        #   we can use. The trailing +text/*;q=0.8+ covers the verbatim
        #   pass-through arm (plain text, Markdown, source files, …) at a
        #   lower preference than rendered HTML/PDF.
        ACCEPT = 'text/html,application/xhtml+xml,application/pdf,text/*;q=0.8'
        # @return [Integer] maximum number of HTTP redirects to follow
        #   before giving up
        MAX_REDIRECTS = 5
        # @return [Integer] connect timeout in seconds for the underlying
        #   Faraday request
        OPEN_TIMEOUT = 10
        # @return [Integer] read timeout in seconds for the underlying
        #   Faraday request
        READ_TIMEOUT = 20

        # @return [Integer] maximum number of characters of an error
        #   response body to include in a {FetchError} message. The body is
        #   often a multi-kilobyte HTML challenge page (Cloudflare, WAF
        #   interstitial, etc.); a short excerpt tells the LLM what kind of
        #   page came back without flooding the next observation.
        ERROR_BODY_EXCERPT = 200

        # Result of a successful {Simple.fetch}: the response body, the
        # normalized content-type (lower-cased, with any +; charset=...+
        # parameters stripped), and the final URL after redirects. The
        # final URL is kept so future scrapers can resolve relative links
        # against the actual landing page rather than the originally
        # requested one.
        Fetched = Data.define(:body, :content_type, :url)

        # Fetch +url+ and render its main content as Markdown.
        #
        # No caching here — every call hits the network. Callers that want
        # to memoize results should wrap this method themselves (see
        # {Tool::WebScrape.visit}, which does exactly that).
        #
        # The dispatcher's output is +String#strip+'d so the LLM never
        # sees a body that opens or closes with blank lines — common with
        # +pdf-reader+'s page-feed whitespace and with text bodies that
        # carry a trailing newline. Interior whitespace is preserved
        # because Markdown paragraph breaks and source-code indentation
        # are load-bearing.
        #
        # @param url [String] absolute HTTP(S) URL of the page to download
        # @return [String] full Markdown representation of the page with
        #   leading/trailing whitespace trimmed, uncapped otherwise —
        #   caller is responsible for any size limiting before feeding
        #   the result back to the LLM
        # @raise [FetchError] on HTTP non-2xx, network failure, redirect
        #   loop, a 3xx with a missing or malformed +Location+ header, or
        #   a response whose content-type the dispatcher does not
        #   recognize
        def self.visit(url)
          dispatch(fetch(url)).strip
        end

        # Download the body of +url+, manually following up to
        # {MAX_REDIRECTS} redirects. Faraday is configured with no
        # middleware so behavior here mirrors the rest of the codebase
        # (see +Tool::Search::DuckDuckGo.search+).
        #
        # All recoverable failures — HTTP 4xx/5xx, +Faraday::Error+ network
        # blips, exhausted redirect budget, 3xx with a missing or
        # unparseable +Location+ — surface as {FetchError} so the caller
        # has a single exception type to rescue. Error bodies are trimmed
        # to {ERROR_BODY_EXCERPT} characters with whitespace collapsed, so
        # a Cloudflare-challenge response doesn't dump kilobytes of inline
        # HTML into the next LLM observation.
        #
        # @param url [String] absolute HTTP(S) URL to fetch
        # @param limit [Integer] redirects remaining; recurses with
        #   +limit - 1+ on each 3xx. Zero or any negative value means the
        #   budget is exhausted.
        # @return [Fetched] body, normalized content-type, and final URL
        #   after redirects
        # @raise [FetchError] on non-2xx/3xx responses, network errors,
        #   redirect-loop exhaustion, or 3xx with a missing or malformed
        #   +Location+ header
        def self.fetch(url, limit: MAX_REDIRECTS)
          # `positive?` rather than `zero?`: a negative budget passed by a
          # caller must not bypass the guard and follow redirects forever.
          raise FetchError, "too many redirects fetching #{url}" unless limit.positive?

          response = begin
            Faraday.new(request: { open_timeout: OPEN_TIMEOUT, timeout: READ_TIMEOUT }).get(url) do |req|
              req.headers['User-Agent'] = USER_AGENT
              req.headers['Accept'] = ACCEPT
            end
          rescue Faraday::Error => e
            raise FetchError, "#{e.class.name.split('::').last} fetching #{url}: #{e.message}"
          end

          case response.status
          when 200..299
            Fetched.new(body: response.body, content_type: normalize_content_type(response.headers['content-type']), url: url)
          when 300..399
            target = resolve_redirect(url, response.headers['location'], response.status)
            fetch(target, limit: limit - 1)
          else
            raise FetchError, "HTTP #{response.status} fetching #{url}: #{excerpt(response.body)}"
          end
        end

        # Route a {Fetched} response to the parser that matches its
        # content-type. Unknown types raise {FetchError} so the LLM gets a
        # legible observation instead of an empty string.
        #
        # @param fetched [Fetched]
        # @return [String] Markdown representation produced by the matched
        #   parser
        # @raise [FetchError] when no parser matches the response's
        #   content-type
        def self.dispatch(fetched)
          case fetched.content_type
          when 'text/html', 'application/xhtml+xml'
            HTML.extract(fetched.body)
          when 'application/pdf'
            PDF.extract(fetched.body)
          when %r{\Atext/}
            fetched.body
          else
            raise FetchError, "unsupported content-type #{fetched.content_type.inspect} for #{fetched.url}"
          end
        end

        # Resolve a redirect's +Location+ header against the URL that
        # produced it. Server-supplied headers are untrusted: a missing,
        # empty, or syntactically invalid value becomes a {FetchError}
        # instead of letting +URI::InvalidURIError+ escape the documented
        # single-exception contract of {.fetch}.
        #
        # @param url [String] URL whose response carried the redirect
        # @param location [String, nil] raw +Location+ header value
        # @param status [Integer] HTTP status of the redirect response
        # @return [String] absolute URL to request next
        # @raise [FetchError] when +location+ is missing, empty, or cannot
        #   be joined with +url+
        def self.resolve_redirect(url, location, status)
          raise FetchError, "HTTP #{status} from #{url} with no Location header" if location.nil? || location.empty?

          URI.join(url, location).to_s
        rescue URI::Error => e
          raise FetchError, "HTTP #{status} from #{url} with invalid Location header #{location.inspect}: #{e.message}"
        end
        private_class_method :resolve_redirect

        # Lower-case +raw+ and strip any +; charset=...+ parameters so the
        # dispatcher can match on a canonical token.
        #
        # @param raw [String, nil] raw +Content-Type+ header value
        # @return [String] normalized content-type, or +""+ when the
        #   header was missing
        def self.normalize_content_type(raw)
          raw.to_s.split(';').first.to_s.strip.downcase
        end
        private_class_method :normalize_content_type

        # Whitespace-collapse +body+ and clip to {ERROR_BODY_EXCERPT}
        # characters, so the {FetchError} message stays a single readable
        # line even when the server returned a multi-KB HTML challenge
        # page.
        #
        # The body is scrubbed first: bytes that are invalid for the
        # string's tagged encoding are replaced, because running the
        # whitespace regex over such a string would raise +ArgumentError+
        # while we are in the middle of reporting an HTTP failure.
        #
        # @param body [String, nil]
        # @return [String]
        def self.excerpt(body)
          text = body.to_s.scrub.gsub(/\s+/, ' ').strip
          text.length > ERROR_BODY_EXCERPT ? "#{text[0, ERROR_BODY_EXCERPT]}..." : text
        end
        private_class_method :excerpt
      end
    end
  end
end
|