pikuri-core 0.0.4 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,54 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'pdf-reader'
4
- require 'stringio'
5
-
6
- module Pikuri
7
- class Tool
8
- module Scraper
9
- # PDF → text extractor used by {Simple.visit} when the fetched
10
- # response carries +application/pdf+. Wraps the +pdf-reader+ gem:
11
- # walk every page, concatenate the extracted text, hand the result
12
- # back as a single string the LLM can read.
13
- #
14
- # Best-effort by design. +pdf-reader+ produces clean text from PDFs
15
- # generated from a digital source (LaTeX, Word export, ...) but
16
- # returns nothing useful from scanned documents — there is no OCR
17
- # in this path. When extraction yields no text we still return an
18
- # empty string rather than raising, so the caller's cache stores a
19
- # consistent result and the LLM sees an empty observation it can
20
- # react to.
21
- #
22
- # Pure parser — no I/O. {.extract} takes PDF bytes and returns text,
23
- # so tests can drive it against an in-memory fixture without
24
- # touching the network.
25
- module PDF
26
- # Render +bytes+ as plain text, one page per paragraph.
27
- #
28
- # +pdf-reader+ raises a handful of typed exceptions for documents
29
- # it cannot parse — broken xrefs ({::PDF::Reader::MalformedPDFError}),
30
- # invalid page references ({::PDF::Reader::InvalidPageError}),
31
- # encrypted/XFA files ({::PDF::Reader::UnsupportedFeatureError}).
32
- # All three describe a property of the PDF the LLM can react to
33
- # ("try a different URL"), so we re-raise them as {FetchError} —
34
- # same convention as the HTTP layer in {Simple.fetch}. Genuine
35
- # bugs in +pdf-reader+ itself surface as their own classes and
36
- # crash loud.
37
- #
38
- # @param bytes [String] raw PDF document (binary string)
39
- # @return [String] concatenated page text; possibly empty when
40
- # the PDF carries no extractable text (scanned image, empty
41
- # document)
42
- # @raise [FetchError] when +pdf-reader+ refuses the document
43
- def self.extract(bytes)
44
- reader = ::PDF::Reader.new(StringIO.new(bytes))
45
- reader.pages.map { |p| p.text.strip }.reject(&:empty?).join("\n\n")
46
- rescue ::PDF::Reader::MalformedPDFError,
47
- ::PDF::Reader::InvalidPageError,
48
- ::PDF::Reader::UnsupportedFeatureError => e
49
- raise FetchError, "PDF rendering failed: #{e.class.name.split('::').last}: #{e.message}"
50
- end
51
- end
52
- end
53
- end
54
- end
@@ -1,183 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'faraday'
4
- require 'uri'
5
-
6
- module Pikuri
7
- class Tool
8
- # Namespace for the URL-to-Markdown scraping stack used by
9
- # {Tool::WEB_SCRAPE} and {Tool::FETCH}: a content-type-dispatching
10
- # fetcher ({Simple}), pure content extractors ({HTML}, {PDF}), and a
11
- # shared error type ({FetchError}). Nothing here knows about the LLM
12
- # — the tools that wrap these layers turn rendered Markdown (or
13
- # +FetchError+) into the next observation.
14
- module Scraper
15
- # Plain HTTP scraper: GET the URL with a real-browser User-Agent,
16
- # follow redirects, and dispatch the response body to the parser
17
- # matching its +Content-Type+. HTML and XHTML route to
18
- # {HTML.extract}; +application/pdf+ routes to {PDF.extract}; any
19
- # other +text/*+ type (plain text, Markdown, source files, …) is
20
- # passed through verbatim since the LLM can already read it; the
21
- # remaining types raise {FetchError} so the LLM observes the
22
- # failure instead of receiving an empty rendering.
23
- #
24
- # Split into a thin HTTP fetch ({.fetch}) and a content-type
25
- # dispatcher ({.visit}) so tests can drive each piece in isolation.
26
- # "Simple" because everything happens in one Faraday GET — no
27
- # headless browser, no JS execution.
28
- module Simple
29
- # @return [String] User-Agent sent with each request; many sites
30
- # reject requests with no UA or an obvious bot UA
31
- USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \
32
- '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
33
- # @return [String] +Accept+ header sent with each request. Lists
34
- # every content-type the dispatcher in {.visit} knows how to
35
- # render, so servers that content-negotiate hand back something
36
- # we can use. The trailing +text/*;q=0.8+ covers the verbatim
37
- # pass-through arm (plain text, Markdown, source files, …) at a
38
- # lower preference than rendered HTML/PDF.
39
- ACCEPT = 'text/html,application/xhtml+xml,application/pdf,text/*;q=0.8'
40
- # @return [Integer] maximum number of HTTP redirects to follow
41
- # before giving up
42
- MAX_REDIRECTS = 5
43
- # @return [Integer] connect timeout in seconds for the underlying
44
- # Faraday request
45
- OPEN_TIMEOUT = 10
46
- # @return [Integer] read timeout in seconds for the underlying
47
- # Faraday request
48
- READ_TIMEOUT = 20
49
-
50
- # @return [Integer] maximum number of characters of an error
51
- # response body to include in a {FetchError} message. The body is
52
- # often a multi-kilobyte HTML challenge page (Cloudflare, WAF
53
- # interstitial, etc.); a short excerpt tells the LLM what kind of
54
- # page came back without flooding the next observation.
55
- ERROR_BODY_EXCERPT = 200
56
-
57
- # Result of a successful {Simple.fetch}: the response body, the
58
- # normalized content-type (lower-cased, with any +; charset=...+
59
- # parameters stripped), and the final URL after redirects. The
60
- # final URL is kept so future scrapers can resolve relative links
61
- # against the actual landing page rather than the originally
62
- # requested one.
63
- Fetched = Data.define(:body, :content_type, :url)
64
-
65
- # Fetch +url+ and render its main content as Markdown.
66
- #
67
- # No caching here — every call hits the network. Callers that want
68
- # to memoize results should wrap this method themselves (see
69
- # {Tool::WebScrape.visit}, which does exactly that).
70
- #
71
- # The dispatcher's output is +String#strip+'d so the LLM never
72
- # sees a body that opens or closes with blank lines — common with
73
- # +pdf-reader+'s page-feed whitespace and with text bodies that
74
- # carry a trailing newline. Interior whitespace is preserved
75
- # because Markdown paragraph breaks and source-code indentation
76
- # are load-bearing.
77
- #
78
- # @param url [String] absolute HTTP(S) URL of the page to download
79
- # @return [String] full Markdown representation of the page with
80
- # leading/trailing whitespace trimmed, uncapped otherwise —
81
- # caller is responsible for any size limiting before feeding
82
- # the result back to the LLM
83
- # @raise [FetchError] on HTTP non-2xx, network failure, redirect
84
- # loop, a 3xx without a +Location+ header, or a response whose
85
- # content-type the dispatcher does not recognize
86
- def self.visit(url)
87
- dispatch(fetch(url)).strip
88
- end
89
-
90
- # Download the body of +url+, manually following up to
91
- # {MAX_REDIRECTS} redirects. Faraday is configured with no
92
- # middleware so behavior here mirrors the rest of the codebase
93
- # (see +Tool::Search::DuckDuckGo.search+).
94
- #
95
- # All recoverable failures — HTTP 4xx/5xx, +Faraday::Error+ network
96
- # blips, exhausted redirect budget, 3xx without a +Location+ —
97
- # surface as {FetchError} so the caller has a single exception type
98
- # to rescue. Error bodies are trimmed to {ERROR_BODY_EXCERPT}
99
- # characters with whitespace collapsed, so a Cloudflare-challenge
100
- # response doesn't dump kilobytes of inline HTML into the next LLM
101
- # observation.
102
- #
103
- # @param url [String] absolute HTTP(S) URL to fetch
104
- # @param limit [Integer] redirects remaining; recurses with
105
- # +limit - 1+ on each 3xx
106
- # @return [Fetched] body, normalized content-type, and final URL
107
- # after redirects
108
- # @raise [FetchError] on non-2xx/3xx responses, network errors,
109
- # redirect-loop exhaustion, or 3xx without a +Location+ header
110
- def self.fetch(url, limit: MAX_REDIRECTS)
111
- raise FetchError, "too many redirects fetching #{url}" if limit.zero?
112
-
113
- response = begin
114
- Faraday.new(request: { open_timeout: OPEN_TIMEOUT, timeout: READ_TIMEOUT }).get(url) do |req|
115
- req.headers['User-Agent'] = USER_AGENT
116
- req.headers['Accept'] = ACCEPT
117
- end
118
- rescue Faraday::Error => e
119
- raise FetchError, "#{e.class.name.split('::').last} fetching #{url}: #{e.message}"
120
- end
121
-
122
- case response.status
123
- when 200..299
124
- Fetched.new(body: response.body, content_type: normalize_content_type(response.headers['content-type']), url: url)
125
- when 300..399
126
- location = response.headers['location']
127
- raise FetchError, "HTTP #{response.status} from #{url} with no Location header" if location.nil? || location.empty?
128
-
129
- fetch(URI.join(url, location).to_s, limit: limit - 1)
130
- else
131
- raise FetchError, "HTTP #{response.status} fetching #{url}: #{excerpt(response.body)}"
132
- end
133
- end
134
-
135
- # Route a {Fetched} response to the parser that matches its
136
- # content-type. Unknown types raise {FetchError} so the LLM gets a
137
- # legible observation instead of an empty string.
138
- #
139
- # @param fetched [Fetched]
140
- # @return [String] Markdown representation produced by the matched
141
- # parser
142
- # @raise [FetchError] when no parser matches the response's
143
- # content-type
144
- def self.dispatch(fetched)
145
- case fetched.content_type
146
- when 'text/html', 'application/xhtml+xml'
147
- HTML.extract(fetched.body)
148
- when 'application/pdf'
149
- PDF.extract(fetched.body)
150
- when %r{\Atext/}
151
- fetched.body
152
- else
153
- raise FetchError, "unsupported content-type #{fetched.content_type.inspect} for #{fetched.url}"
154
- end
155
- end
156
-
157
- # Lower-case +raw+ and strip any +; charset=...+ parameters so the
158
- # dispatcher can match on a canonical token.
159
- #
160
- # @param raw [String, nil] raw +Content-Type+ header value
161
- # @return [String] normalized content-type, or +""+ when the
162
- # header was missing
163
- def self.normalize_content_type(raw)
164
- raw.to_s.split(';').first.to_s.strip.downcase
165
- end
166
- private_class_method :normalize_content_type
167
-
168
- # Whitespace-collapse +body+ and clip to {ERROR_BODY_EXCERPT}
169
- # characters, so the {FetchError} message stays a single readable
170
- # line even when the server returned a multi-KB HTML challenge
171
- # page.
172
- #
173
- # @param body [String, nil]
174
- # @return [String]
175
- def self.excerpt(body)
176
- text = body.to_s.gsub(/\s+/, ' ').strip
177
- text.length > ERROR_BODY_EXCERPT ? "#{text[0, ERROR_BODY_EXCERPT]}..." : text
178
- end
179
- private_class_method :excerpt
180
- end
181
- end
182
- end
183
- end