pikuri-core 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +67 -0
  3. data/lib/pikuri/agent/chat_transport.rb +41 -0
  4. data/lib/pikuri/agent/configurator.rb +270 -0
  5. data/lib/pikuri/agent/context_window_detector.rb +111 -0
  6. data/lib/pikuri/agent/control/cancellable.rb +128 -0
  7. data/lib/pikuri/agent/control/interloper.rb +167 -0
  8. data/lib/pikuri/agent/control/step_limit.rb +93 -0
  9. data/lib/pikuri/agent/control.rb +45 -0
  10. data/lib/pikuri/agent/event.rb +190 -0
  11. data/lib/pikuri/agent/extension.rb +82 -0
  12. data/lib/pikuri/agent/listener/in_memory_event_list.rb +34 -0
  13. data/lib/pikuri/agent/listener/rate_limited.rb +172 -0
  14. data/lib/pikuri/agent/listener/terminal.rb +264 -0
  15. data/lib/pikuri/agent/listener/token_log.rb +216 -0
  16. data/lib/pikuri/agent/listener.rb +54 -0
  17. data/lib/pikuri/agent/listener_list.rb +102 -0
  18. data/lib/pikuri/agent/synthesizer.rb +145 -0
  19. data/lib/pikuri/agent.rb +731 -0
  20. data/lib/pikuri/subprocess.rb +166 -0
  21. data/lib/pikuri/tool/calculator.rb +82 -0
  22. data/lib/pikuri/tool/fetch.rb +171 -0
  23. data/lib/pikuri/tool/parameters.rb +314 -0
  24. data/lib/pikuri/tool/scraper/fetch_error.rb +16 -0
  25. data/lib/pikuri/tool/scraper/html.rb +285 -0
  26. data/lib/pikuri/tool/scraper/pdf.rb +54 -0
  27. data/lib/pikuri/tool/scraper/simple.rb +183 -0
  28. data/lib/pikuri/tool/search/brave.rb +184 -0
  29. data/lib/pikuri/tool/search/duckduckgo.rb +196 -0
  30. data/lib/pikuri/tool/search/engines.rb +163 -0
  31. data/lib/pikuri/tool/search/exa.rb +217 -0
  32. data/lib/pikuri/tool/search/rate_limiter.rb +92 -0
  33. data/lib/pikuri/tool/search/result.rb +29 -0
  34. data/lib/pikuri/tool/sub_agent.rb +150 -0
  35. data/lib/pikuri/tool/web_scrape.rb +121 -0
  36. data/lib/pikuri/tool/web_search.rb +38 -0
  37. data/lib/pikuri/tool.rb +118 -0
  38. data/lib/pikuri/url_cache.rb +112 -0
  39. data/lib/pikuri/version.rb +10 -0
  40. data/lib/pikuri-core.rb +177 -0
  41. data/prompts/pikuri-chat.txt +15 -0
  42. metadata +251 -0
@@ -0,0 +1,314 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'did_you_mean'
4
+
5
+ module Pikuri
6
+ # Loaded by +lib/tools.rb+ after {Tool} itself is defined; the +class Tool+
7
+ # reopening below assumes that order.
8
+ class Tool
9
+ # Schema for a {Tool}'s arguments. Built up via the fluent
10
+ # +<required|optional>_<type>+ methods, then frozen by {.build}; serializes
11
+ # to the OpenAI JSON-Schema shape via {#to_h} and validates LLM-supplied
12
+ # argument hashes via {#validate}.
13
+ #
14
+ # @example
15
+ # params = Tool::Parameters.build { |p| p.required_string :query, 'The query.' }
16
+ # params.to_h
17
+ # # => {type: 'object',
18
+ # # properties: {query: {type: 'string', description: 'The query.'}},
19
+ # # required: ['query']}
20
+ # params.validate('query' => 'cats') # => {query: 'cats'}
21
class Parameters
  # Raised by {Parameters#validate} when arguments do not match the declared
  # schema. The message lists every problem and reprints the schema, so it
  # can be fed back to the LLM verbatim as the next tool-call observation.
  class ValidationError < StandardError; end

  # Internal coercion failure. Caught by {#validate} and turned into a
  # {ValidationError} message — never escapes the class.
  class CoercionError < StandardError; end
  private_constant :CoercionError

  # Matches the decimal-numeric subset that JSON allows: optional sign,
  # mantissa (with optional fractional part), optional decimal exponent.
  # Rejects hex (+0x10+), underscores (+1_000+), +NaN+, +Infinity+.
  DECIMAL_NUMERIC = /\A[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?\z/
  private_constant :DECIMAL_NUMERIC

  # Matches a plain base-10 integer literal (optional sign, digits only).
  # Strings of this shape are parsed exactly with +Integer+ instead of
  # going through +Float+, which cannot represent every whole number
  # above 2**53.
  INTEGER_LITERAL = /\A[-+]?\d+\z/
  private_constant :INTEGER_LITERAL

  # Yield a fresh builder, freeze it, and return it.
  #
  # @yieldparam builder [Parameters]
  # @return [Parameters] frozen builder, safe to share between calls
  def self.build
    builder = new
    yield builder
    builder.freeze
  end

  # @return [Parameters]
  def initialize
    @properties = {}
    @required = []
  end

  # Freeze the builder along with its internal collections, so post-build
  # mutation attempts raise +FrozenError+ instead of silently succeeding.
  #
  # @return [self]
  def freeze
    @properties.freeze
    @required.freeze
    super
  end

  # Add a required +string+ property.
  #
  # @param name [Symbol] property name
  # @param description [String] human-readable description shown to the LLM
  # @return [self]
  def required_string(name, description)
    add(name, 'string', description, required: true)
  end

  # Add an optional +string+ property.
  #
  # @param name [Symbol] property name
  # @param description [String] human-readable description shown to the LLM
  # @return [self]
  def optional_string(name, description)
    add(name, 'string', description, required: false)
  end

  # Add a required +integer+ property. Accepts Integers, Floats with a
  # zero fractional part (e.g. +1.0+), and base-10 numeric Strings (after
  # trimming) that resolve to whole numbers; rejects everything else.
  #
  # @param name [Symbol] property name
  # @param description [String] human-readable description shown to the LLM
  # @return [self]
  def required_integer(name, description)
    add(name, 'integer', description, required: true)
  end

  # Add an optional +integer+ property. See {#required_integer} for
  # accepted shapes.
  #
  # @param name [Symbol] property name
  # @param description [String] human-readable description shown to the LLM
  # @return [self]
  def optional_integer(name, description)
    add(name, 'integer', description, required: false)
  end

  # Add a required +number+ property (JSON-Schema +number+: Integer or
  # finite Float). Numeric Strings (after trimming) are parsed; NaN and
  # Infinity are rejected.
  #
  # @param name [Symbol] property name
  # @param description [String] human-readable description shown to the LLM
  # @return [self]
  def required_number(name, description)
    add(name, 'number', description, required: true)
  end

  # Add an optional +number+ property. See {#required_number} for
  # accepted shapes.
  #
  # @param name [Symbol] property name
  # @param description [String] human-readable description shown to the LLM
  # @return [self]
  def optional_number(name, description)
    add(name, 'number', description, required: false)
  end

  # Add a required +boolean+ property. Accepts Ruby +true+/+false+
  # as-is, and the literal Strings +"true"+/+"false"+ (some models
  # surface JSON booleans as Strings) after trimming surrounding
  # whitespace. Other Strings, numbers, and +nil+ are rejected —
  # there is no truthy-coercion of +"yes"+ / +0+ / etc.
  #
  # @param name [Symbol] property name
  # @param description [String] human-readable description shown to the LLM
  # @return [self]
  def required_boolean(name, description)
    add(name, 'boolean', description, required: true)
  end

  # Add an optional +boolean+ property. See {#required_boolean} for
  # accepted shapes.
  #
  # @param name [Symbol] property name
  # @param description [String] human-readable description shown to the LLM
  # @return [self]
  def optional_boolean(name, description)
    add(name, 'boolean', description, required: false)
  end

  # Schema in OpenAI JSON-Schema shape.
  #
  # @return [Hash] +{type: 'object', properties: {...}, required: [...]}+
  def to_h
    { type: 'object', properties: @properties, required: @required }
  end

  # Validate a tool-call argument hash against the declared schema. Returns
  # a symbol-keyed hash safe to splat as kwargs into a tool's +execute+
  # Proc; raises {ValidationError} with an LLM-actionable message listing
  # every missing/unknown/mistyped field and reprinting the schema.
  #
  # Strict: unknown keys are rejected (with DidYouMean suggestions), wrong
  # types are rejected. All issues are collected and reported together so
  # the LLM can fix them in one round trip.
  #
  # @param args [Hash] arguments as decoded from the tool-call JSON; keys
  #   may be Strings or Symbols
  # @return [Hash{Symbol=>Object}] validated, symbol-keyed arguments
  # @raise [ValidationError] if +args+ is not a Hash, contains unknown
  #   keys, omits a required key, or has a value of the wrong type
  def validate(args)
    raise ValidationError, "Arguments must be an object, got #{args.class}." unless args.is_a?(Hash)

    # Go through +to_s+ so a key that is neither String nor Symbol (e.g. an
    # Integer) is reported as an unknown parameter instead of blowing up
    # with NoMethodError on +to_sym+.
    symbolized = args.transform_keys { |key| key.to_s.to_sym }
    errors = (symbolized.keys - @properties.keys).map { |unknown| unknown_key_error(unknown) }
    result = {}

    @properties.each do |name, schema|
      if symbolized.key?(name)
        begin
          result[name] = coerce(symbolized[name], schema[:type])
        rescue CoercionError => e
          errors << "Parameter `#{name}` #{e.message}."
        end
      elsif @required.include?(name.to_s)
        errors << "Missing required parameter `#{name}` (#{schema[:type]}): #{schema[:description]}"
      end
    end

    raise ValidationError, build_error_message(errors) unless errors.empty?

    result
  end

  private

  # Register one property. The name is normalized to a Symbol so that
  # {#validate}, which looks arguments up by Symbol, also finds
  # properties that were declared with a String name.
  #
  # @param name [Symbol, String] property name
  # @param type [String] JSON-Schema type name
  # @param description [String] human-readable description
  # @param required [Boolean] whether the property is mandatory
  # @return [self]
  def add(name, type, description, required:)
    key = name.to_sym
    @properties[key] = { type: type, description: description }
    @required << key.to_s if required
    self
  end

  # Coerce +value+ to a Ruby value matching the JSON-Schema +type+,
  # returning the coerced value. Raises {CoercionError} on failure.
  def coerce(value, type)
    case type
    when 'string'
      return value if value.is_a?(String)

      raise CoercionError, type_message('string', value)
    when 'integer'
      coerce_integer(value)
    when 'number'
      coerce_number(value)
    when 'boolean'
      coerce_boolean(value)
    end
  end

  # Accept +true+/+false+ and the trimmed Strings +"true"+/+"false"+;
  # raise {CoercionError} for anything else.
  def coerce_boolean(value)
    return value if value == true || value == false

    if value.is_a?(String)
      case value.strip
      when 'true' then return true
      when 'false' then return false
      end
    end

    raise CoercionError, type_message('boolean', value)
  end

  # Accept Integers, whole-valued finite Floats and whole-valued numeric
  # Strings; raise {CoercionError} for anything else.
  def coerce_integer(value)
    case value
    when Integer
      value
    when Float
      raise CoercionError, type_message('integer', value) unless value.finite? && value.modulo(1).zero?

      value.to_i
    when String
      trimmed = value.strip
      # Plain digit strings are parsed exactly; routing them through Float
      # would round whole numbers above 2**53 to a neighbouring value.
      return Integer(trimmed, 10) if trimmed.match?(INTEGER_LITERAL)

      # Remaining shapes ("1.0", "1e3") still need the Float parser.
      parsed = parse_numeric_string(trimmed)
      raise CoercionError, type_message('integer', value) unless parsed && parsed.modulo(1).zero?

      parsed.to_i
    else
      raise CoercionError, type_message('integer', value)
    end
  end

  # Accept Integers, finite Floats and numeric Strings (returned as
  # Float); raise {CoercionError} for anything else.
  def coerce_number(value)
    case value
    when Integer
      value
    when Float
      raise CoercionError, type_message('number', value) unless value.finite?

      value
    when String
      parsed = parse_numeric_string(value)
      raise CoercionError, type_message('number', value) unless parsed

      parsed
    else
      raise CoercionError, type_message('number', value)
    end
  end

  # Strict base-10 numeric-string parse. Returns a finite Float, or +nil+
  # for empty/whitespace/garbage/hex/NaN/Infinity input.
  def parse_numeric_string(str)
    trimmed = str.strip
    return nil unless trimmed.match?(DECIMAL_NUMERIC)

    parsed = Float(trimmed, exception: false)
    return nil unless parsed&.finite?

    parsed
  end

  # Build the "must be a <type> (got ...)" fragment used in error lines.
  def type_message(type, value)
    article = type == 'integer' ? 'an' : 'a'
    "must be #{article} #{type} (got #{value.class}: #{value.inspect})"
  end

  # Error line for a key that is not part of the schema: offers the
  # closest declared name when DidYouMean finds one, otherwise lists
  # every valid parameter.
  def unknown_key_error(unknown)
    suggestion = DidYouMean::SpellChecker
                 .new(dictionary: @properties.keys.map(&:to_s))
                 .correct(unknown.to_s).first
    hint = suggestion ? " Did you mean `#{suggestion}`?" : " Valid parameters: #{valid_keys_list}."
    "Unknown parameter `#{unknown}`.#{hint}"
  end

  # Comma-separated, backtick-quoted list of declared property names.
  def valid_keys_list
    @properties.keys.map { |k| "`#{k}`" }.join(', ')
  end

  # Assemble the full {ValidationError} message: one bullet per problem,
  # followed by a reprint of the expected schema.
  def build_error_message(errors)
    schema_lines = @properties.map do |name, prop|
      req = @required.include?(name.to_s) ? 'required' : 'optional'
      "  - `#{name}` (#{prop[:type]}, #{req}): #{prop[:description]}"
    end

    [
      'Invalid arguments:',
      *errors.map { |e| "- #{e}" },
      '',
      'Expected schema:',
      *schema_lines
    ].join("\n")
  end
end
313
+ end
314
+ end
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pikuri
4
+ class Tool
5
+ module Scraper
6
+ # Raised by anything in the scraper stack when a URL cannot be
7
+ # rendered into Markdown text — HTTP non-2xx, network failure,
8
+ # redirect-loop, missing +Location+, unsupported content-type, or a
9
+ # parse failure that reads as "try a different URL" to the LLM.
10
+ # Catching this in {Tool::WEB_SCRAPE} / {Tool::FETCH} turns the
11
+ # failure into an +"Error: ..."+ observation; anything else bubbles
12
+ # up so genuine bugs stay visible.
13
class FetchError < StandardError
end
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,285 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'nokogiri'
5
+ require 'readability'
6
+ require 'reverse_markdown'
7
+
8
+ module Pikuri
9
+ class Tool
10
+ module Scraper
11
+ # HTML → Markdown extractor used by {Simple.visit} when the fetched
12
+ # response carries an HTML content-type.
13
+ #
14
+ # Always renders both views of the page when available:
15
+ #
16
+ # 1. JSON-LD section. Any +<script type="application/ld+json">+ node
17
+ # whose +@type+ matches a substantive schema.org content type
18
+ # (Product, Article, Recipe, ...) is rendered as a header — title,
19
+ # metadata bullets (brand, SKU, price, rating, author, published),
20
+ # and the +articleBody+/+description+ copy when present.
21
+ # 2. Readability section. The page is run through +Readability+ +
22
+ # +reverse_markdown+, with a +<main>+/+<article>+ fallback for
23
+ # pages whose content sits mostly outside +<p>+ tags.
24
+ #
25
+ # Concatenated with a horizontal rule, so the LLM gets both the
26
+ # structured metadata and the rendered body and can pick whichever
27
+ # is more useful for the task. Trades some duplication (when a
28
+ # publisher embeds the article body in JSON-LD AND in HTML) for
29
+ # fewer type-based heuristics on which branch should win — the
30
+ # earlier "is this Article's +description+ a teaser or the real
31
+ # body?" carve-out is no longer needed because both end up in
32
+ # the output regardless.
33
+ #
34
+ # Pure parser — no I/O. {.extract} takes an HTML string and returns
35
+ # Markdown, so tests can drive it against fixture HTML without a
36
+ # network round-trip.
37
module HTML
  # @return [Array<String>] schema.org +@type+ values that we treat
  #   as "the primary entity of this page" when picking a JSON-LD
  #   node to render. Order does not matter — the first matching
  #   node wins. Skips noise nodes (Organization, BreadcrumbList,
  #   WebSite, ...) that ship on most pages but carry no page
  #   content.
  INTERESTING_TYPES = %w[
    Product Article NewsArticle BlogPosting Recipe Event Book Movie
  ].freeze

  # @return [Array<String>] HTML tags preserved by the readability
  #   pass. Anything outside this list is stripped before Markdown
  #   conversion.
  READABILITY_TAGS = %w[
    h1 h2 h3 h4 h5 h6 p div span ul ol li blockquote pre code a img
    strong em b i br hr table thead tbody tr td th
  ].freeze

  # @return [Array<String>] HTML attributes preserved by the
  #   readability pass; everything else (class, id, style, data-*)
  #   is dropped before Markdown conversion
  READABILITY_ATTRS = %w[href src alt title].freeze

  # @return [Float] minimum +<main>+/+<article>+ to Readability
  #   text-length ratio that triggers the semantic-container
  #   fallback in {.readability_to_markdown}. Picked low enough to
  #   catch the failure mode (Readability collapsing a page that
  #   uses divs/lists instead of +<p>+ — e.g. +vaadin.com/company+,
  #   ~5x) but high enough that pages where both produce
  #   comparable output keep Readability's noise filtering.
  MAIN_FALLBACK_RATIO = 2.0

  # @return [Integer] minimum text length the
  #   +<main>+/+<article>+ container must hold before the fallback
  #   in {.readability_to_markdown} can fire. Below this, the
  #   ratio comparison is dominated by noise and we'd swap on
  #   tiny pages where Readability is doing the right thing.
  MAIN_FALLBACK_MIN_CHARS = 500

  # Render +html+ as Markdown by emitting both the JSON-LD section
  # (when an interesting node is present) and the readability /
  # +<main>+ section, joined by a horizontal rule. Sections that are
  # +nil+ or blank are dropped, so either one may be missing from the
  # result.
  #
  # @param html [String] HTML document body
  # @return [String] Markdown representation
  def self.extract(html)
    sections = [jsonld_section(html), readability_to_markdown(html)]
    sections.reject! { |section| blank?(section) }
    sections.join("\n\n---\n\n")
  end

  # Pick the first JSON-LD node whose +@type+ matches one of
  # {INTERESTING_TYPES} and render it as Markdown. Returns +nil+
  # when no such node exists, in which case {.extract} emits only
  # the readability section.
  #
  # No content-field gating: a node carrying just +name+/+author+/
  # +datePublished+ still renders (as a metadata-only header),
  # because the readability pass independently produces the page
  # body.
  #
  # @param html [String] HTML document body
  # @return [String, nil] Markdown render of the picked JSON-LD
  #   node, or +nil+ when nothing matched
  def self.jsonld_section(html)
    node = parse_jsonld(html).find do |candidate|
      Array(candidate['@type']).any? { |type| INTERESTING_TYPES.include?(type) }
    end
    node ? jsonld_to_markdown(node) : nil
  end

  # Collect every JSON-LD payload embedded in +html+, flattening
  # +@graph+ wrappers so callers see one flat array of schema.org
  # nodes. Blocks that fail to parse as JSON are silently skipped, and
  # so are decoded values that are not JSON objects (numbers, strings,
  # booleans, non-object +@graph+ entries) — every element of the
  # result is guaranteed to be a Hash, so callers can index it by key
  # without type checks.
  #
  # @param html [String] HTML document body
  # @return [Array<Hash>] parsed JSON-LD nodes; possibly empty
  def self.parse_jsonld(html)
    doc = Nokogiri::HTML(html)
    doc.css('script[type="application/ld+json"]').flat_map do |script|
      parsed = begin
        JSON.parse(script.text)
      rescue JSON::ParserError
        nil
      end
      jsonld_nodes(parsed)
    end
  end

  # Render a single JSON-LD +node+ as Markdown: a top-level title
  # from +name+/+headline+, a bullet list of common useful fields
  # (brand, SKU, price, rating, author, published date, ...), the
  # body copy, and the lead image.
  #
  # When the node carries +articleBody+, that wins over
  # +description+. Blank titles and bodies are skipped rather than
  # rendered as an empty heading or paragraph, and a blank +name+
  # falls through to +headline+.
  #
  # @param node [Hash] JSON-LD node, typically picked by
  #   {.jsonld_section}
  # @return [String] Markdown representation
  def self.jsonld_to_markdown(node)
    out = +''
    title = [node['name'], node['headline']].find { |candidate| !blank?(candidate) }
    out << "# #{title}\n\n" if title

    offer = first_obj(node['offers'])
    rating = first_obj(node['aggregateRating'])
    brand = first_obj_or_string(node['brand'])
    author = first_obj_or_string(node['author'])

    fields = {
      'Brand' => brand.is_a?(Hash) ? brand['name'] : brand,
      'SKU' => node['sku'],
      'GTIN' => node['gtin13'] || node['gtin'],
      'Price' => [offer['price'], offer['priceCurrency']].compact.join(' '),
      'Availability' => offer['availability'],
      'Rating' => rating['ratingValue'],
      'Reviews' => rating['reviewCount'],
      'Author' => author.is_a?(Hash) ? author['name'] : author,
      'Published' => node['datePublished']
    }.reject { |_, value| blank?(value) }

    unless fields.empty?
      fields.each { |label, value| out << "- **#{label}:** #{value}\n" }
      out << "\n"
    end

    body = [node['articleBody'], node['description']].find { |candidate| !blank?(candidate) }
    out << "#{body}\n\n" if body

    if (img = node['image'])
      # +image+ may be a URL string, an ImageObject hash, or a list of either.
      img = img.first if img.is_a?(Array)
      img = img['url'] if img.is_a?(Hash)
      out << "![image](#{img})\n\n" if img
    end

    out
  end

  # Run +Readability+ over +html+ to isolate the main content node,
  # then convert that to Markdown via +reverse_markdown+. The page
  # title reported by Readability is rendered as a top-level heading
  # when it is not blank.
  #
  # When the page has a +<main>+ or +<article>+ container holding
  # substantially more text than Readability picked up (see
  # {MAIN_FALLBACK_RATIO} / {MAIN_FALLBACK_MIN_CHARS}), that container
  # is rendered instead; otherwise Readability's output is kept.
  #
  # @param html [String] HTML document body
  # @return [String] Markdown representation
  def self.readability_to_markdown(html)
    rdoc = Readability::Document.new(
      html,
      tags: READABILITY_TAGS,
      attributes: READABILITY_ATTRS,
      remove_empty_nodes: true
    )
    readability_html = rdoc.content
    title = rdoc.title

    body_html = main_fallback_html(html, readability_html) || readability_html
    body = ReverseMarkdown.convert(body_html, unknown_tags: :bypass, github_flavored: true)

    out = +''
    out << "# #{title.strip}\n\n" if title && !title.strip.empty?
    out << body
    out
  end

  # If +html+ has a +<main>+ or +<article>+ element holding
  # substantially more text than Readability extracted, return that
  # container's HTML so the caller can render it instead. Returns
  # +nil+ when the fallback should not fire — when there is no
  # semantic container, when it's too small to be meaningful, or
  # when Readability's output is already comparable.
  #
  # @param html [String] full HTML document body, used to locate
  #   the +<main>+/+<article>+ container
  # @param readability_html [String] HTML produced by
  #   +Readability::Document#content+, used as the comparison
  #   baseline
  # @return [String, nil] container HTML when the fallback should
  #   fire, +nil+ otherwise
  def self.main_fallback_html(html, readability_html)
    doc = Nokogiri::HTML(html)
    container = doc.at_css('main') || doc.at_css('article')
    return nil unless container

    container_text_len = collapsed_text_length(container)
    return nil if container_text_len < MAIN_FALLBACK_MIN_CHARS

    readability_text_len = collapsed_text_length(Nokogiri::HTML(readability_html))
    return nil if container_text_len < MAIN_FALLBACK_RATIO * readability_text_len

    container.to_html
  end
  private_class_method :main_fallback_html

  # Length of +node+'s text after collapsing whitespace runs to single
  # spaces and trimming, so markup indentation does not skew the
  # comparison in {.main_fallback_html}.
  #
  # @param node [#text] parsed document or element
  # @return [Integer]
  def self.collapsed_text_length(node)
    node.text.gsub(/\s+/, ' ').strip.length
  end
  private_class_method :collapsed_text_length

  # Flatten one decoded JSON-LD payload into a list of node hashes:
  # a top-level array is walked element by element, an object with an
  # Array +@graph+ is replaced by the graph's entries, and anything
  # that is not a Hash is dropped.
  #
  # @param parsed [Object] result of +JSON.parse+, or +nil+ when the
  #   block did not parse
  # @return [Array<Hash>]
  def self.jsonld_nodes(parsed)
    candidates = parsed.is_a?(Array) ? parsed : [parsed]
    candidates.flat_map do |candidate|
      next [] unless candidate.is_a?(Hash)

      graph = candidate['@graph']
      graph.is_a?(Array) ? graph.grep(Hash) : [candidate]
    end
  end
  private_class_method :jsonld_nodes

  # True when +value+ is +nil+ or its string form is empty/whitespace.
  #
  # @param value [Object]
  # @return [Boolean]
  def self.blank?(value)
    value.nil? || value.to_s.strip.empty?
  end
  private_class_method :blank?

  # JSON-LD fields can be a string, hash, or array of either.
  # Normalize to a single hash (the first one if it's a list) so
  # callers can index it safely.
  #
  # @param value [Object] raw JSON-LD field value
  # @return [Hash] empty hash when +value+ does not contain a hash
  def self.first_obj(value)
    value = value.first if value.is_a?(Array)
    value.is_a?(Hash) ? value : {}
  end
  private_class_method :first_obj

  # Same idea as {.first_obj} but preserves a bare string (e.g.
  # +brand: "Apple"+) instead of replacing it with +{}+.
  #
  # @param value [Object] raw JSON-LD field value
  # @return [String, Hash, nil]
  def self.first_obj_or_string(value)
    value = value.first if value.is_a?(Array)
    value
  end
  private_class_method :first_obj_or_string
end
283
+ end
284
+ end
285
+ end
@@ -0,0 +1,54 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pdf-reader'
4
+ require 'stringio'
5
+
6
+ module Pikuri
7
+ class Tool
8
+ module Scraper
9
+ # PDF → text extractor used by {Simple.visit} when the fetched
10
+ # response carries +application/pdf+. Wraps the +pdf-reader+ gem:
11
+ # walk every page, concatenate the extracted text, hand the result
12
+ # back as a single string the LLM can read.
13
+ #
14
+ # Best-effort by design. +pdf-reader+ produces clean text from PDFs
15
+ # generated from a digital source (LaTeX, Word export, ...) but
16
+ # returns nothing useful from scanned documents — there is no OCR
17
+ # in this path. When extraction yields no text we still return an
18
+ # empty string rather than raising, so the caller's cache stores a
19
+ # consistent result and the LLM sees an empty observation it can
20
+ # react to.
21
+ #
22
+ # Pure parser — no I/O. {.extract} takes PDF bytes and returns text,
23
+ # so tests can drive it against an in-memory fixture without
24
+ # touching the network.
25
module PDF
  # Turn a PDF document into plain text. Each page's text is stripped,
  # pages that end up empty are dropped, and the remaining pages are
  # joined with a blank line between them.
  #
  # The three +pdf-reader+ exception classes rescued below
  # ({::PDF::Reader::MalformedPDFError},
  # {::PDF::Reader::InvalidPageError},
  # {::PDF::Reader::UnsupportedFeatureError}) are re-raised as
  # {FetchError}, with the short class name and original message in the
  # text. Any other exception is left to propagate unchanged.
  #
  # @param bytes [String] raw PDF document (binary string)
  # @return [String] concatenated page text; empty when no page yields
  #   any text
  # @raise [FetchError] when +pdf-reader+ raises one of the rescued
  #   errors for the document
  def self.extract(bytes)
    document = ::PDF::Reader.new(StringIO.new(bytes))
    page_texts = document.pages.map { |page| page.text.strip }
    page_texts.reject(&:empty?).join("\n\n")
  rescue ::PDF::Reader::MalformedPDFError,
         ::PDF::Reader::InvalidPageError,
         ::PDF::Reader::UnsupportedFeatureError => error
    short_name = error.class.name.split('::').last
    raise FetchError, "PDF rendering failed: #{short_name}: #{error.message}"
  end
end
52
+ end
53
+ end
54
+ end