pikuri-core 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +67 -0
- data/lib/pikuri/agent/chat_transport.rb +41 -0
- data/lib/pikuri/agent/configurator.rb +270 -0
- data/lib/pikuri/agent/context_window_detector.rb +111 -0
- data/lib/pikuri/agent/control/cancellable.rb +128 -0
- data/lib/pikuri/agent/control/interloper.rb +167 -0
- data/lib/pikuri/agent/control/step_limit.rb +93 -0
- data/lib/pikuri/agent/control.rb +45 -0
- data/lib/pikuri/agent/event.rb +190 -0
- data/lib/pikuri/agent/extension.rb +82 -0
- data/lib/pikuri/agent/listener/in_memory_event_list.rb +34 -0
- data/lib/pikuri/agent/listener/rate_limited.rb +172 -0
- data/lib/pikuri/agent/listener/terminal.rb +264 -0
- data/lib/pikuri/agent/listener/token_log.rb +216 -0
- data/lib/pikuri/agent/listener.rb +54 -0
- data/lib/pikuri/agent/listener_list.rb +102 -0
- data/lib/pikuri/agent/synthesizer.rb +145 -0
- data/lib/pikuri/agent.rb +731 -0
- data/lib/pikuri/subprocess.rb +166 -0
- data/lib/pikuri/tool/calculator.rb +82 -0
- data/lib/pikuri/tool/fetch.rb +171 -0
- data/lib/pikuri/tool/parameters.rb +314 -0
- data/lib/pikuri/tool/scraper/fetch_error.rb +16 -0
- data/lib/pikuri/tool/scraper/html.rb +285 -0
- data/lib/pikuri/tool/scraper/pdf.rb +54 -0
- data/lib/pikuri/tool/scraper/simple.rb +183 -0
- data/lib/pikuri/tool/search/brave.rb +184 -0
- data/lib/pikuri/tool/search/duckduckgo.rb +196 -0
- data/lib/pikuri/tool/search/engines.rb +163 -0
- data/lib/pikuri/tool/search/exa.rb +217 -0
- data/lib/pikuri/tool/search/rate_limiter.rb +92 -0
- data/lib/pikuri/tool/search/result.rb +29 -0
- data/lib/pikuri/tool/sub_agent.rb +150 -0
- data/lib/pikuri/tool/web_scrape.rb +121 -0
- data/lib/pikuri/tool/web_search.rb +38 -0
- data/lib/pikuri/tool.rb +118 -0
- data/lib/pikuri/url_cache.rb +112 -0
- data/lib/pikuri/version.rb +10 -0
- data/lib/pikuri-core.rb +177 -0
- data/prompts/pikuri-chat.txt +15 -0
- metadata +251 -0
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'did_you_mean'
|
|
4
|
+
|
|
5
|
+
module Pikuri
|
|
6
|
+
# Loaded by +lib/tools.rb+ after {Tool} itself is defined; the +class Tool+
|
|
7
|
+
# reopening below assumes that order.
|
|
8
|
+
class Tool
|
|
9
|
+
# Schema for a {Tool}'s arguments. Built up via the fluent
|
|
10
|
+
# +<required|optional>_<type>+ methods, then frozen by {.build}; serializes
|
|
11
|
+
# to the OpenAI JSON-Schema shape via {#to_h} and validates LLM-supplied
|
|
12
|
+
# argument hashes via {#validate}.
|
|
13
|
+
#
|
|
14
|
+
# @example
|
|
15
|
+
# params = Tool::Parameters.build { |p| p.required_string :query, 'The query.' }
|
|
16
|
+
# params.to_h
|
|
17
|
+
# # => {type: 'object',
|
|
18
|
+
# # properties: {query: {type: 'string', description: 'The query.'}},
|
|
19
|
+
# # required: ['query']}
|
|
20
|
+
# params.validate('query' => 'cats') # => {query: 'cats'}
|
|
21
|
+
class Parameters
  # Raised by {Parameters#validate} when arguments do not match the declared
  # schema. The message lists every problem and reprints the schema, so it
  # can be fed back to the LLM verbatim as the next tool-call observation.
  class ValidationError < StandardError; end

  # Yield a fresh builder, freeze it, and return it.
  #
  # @yieldparam builder [Parameters]
  # @return [Parameters] frozen builder, safe to share between calls
  def self.build
    builder = new
    yield builder
    builder.freeze
  end

  # @return [Parameters]
  def initialize
    @properties = {}
    @required = []
  end

  # Freeze the builder along with its internal collections, so post-build
  # mutation attempts raise +FrozenError+ instead of silently succeeding.
  #
  # @return [self]
  def freeze
    @properties.freeze
    @required.freeze
    super
  end

  # Add a required +string+ property.
  #
  # @param name [Symbol] property name
  # @param description [String] human-readable description shown to the LLM
  # @return [self]
  def required_string(name, description)
    add(name, 'string', description, required: true)
  end

  # Add an optional +string+ property.
  #
  # @param name [Symbol] property name
  # @param description [String] human-readable description shown to the LLM
  # @return [self]
  def optional_string(name, description)
    add(name, 'string', description, required: false)
  end

  # Add a required +integer+ property. Accepts Integers, Floats with a
  # zero fractional part (e.g. +1.0+), and base-10 numeric Strings (after
  # trimming) that resolve to whole numbers; rejects everything else.
  #
  # @param name [Symbol] property name
  # @param description [String] human-readable description shown to the LLM
  # @return [self]
  def required_integer(name, description)
    add(name, 'integer', description, required: true)
  end

  # Add an optional +integer+ property. See {#required_integer} for
  # accepted shapes.
  #
  # @param name [Symbol] property name
  # @param description [String] human-readable description shown to the LLM
  # @return [self]
  def optional_integer(name, description)
    add(name, 'integer', description, required: false)
  end

  # Add a required +number+ property (JSON-Schema +number+: Integer or
  # finite Float). Numeric Strings (after trimming) are parsed; NaN and
  # Infinity are rejected.
  #
  # @param name [Symbol] property name
  # @param description [String] human-readable description shown to the LLM
  # @return [self]
  def required_number(name, description)
    add(name, 'number', description, required: true)
  end

  # Add an optional +number+ property. See {#required_number} for
  # accepted shapes.
  #
  # @param name [Symbol] property name
  # @param description [String] human-readable description shown to the LLM
  # @return [self]
  def optional_number(name, description)
    add(name, 'number', description, required: false)
  end

  # Add a required +boolean+ property. Accepts Ruby +true+/+false+
  # as-is, and the literal Strings +"true"+/+"false"+ (some models
  # surface JSON booleans as Strings) after trimming surrounding
  # whitespace. Other Strings, numbers, and +nil+ are rejected —
  # there is no truthy-coercion of +"yes"+ / +0+ / etc.
  #
  # @param name [Symbol] property name
  # @param description [String] human-readable description shown to the LLM
  # @return [self]
  def required_boolean(name, description)
    add(name, 'boolean', description, required: true)
  end

  # Add an optional +boolean+ property. See {#required_boolean} for
  # accepted shapes.
  #
  # @param name [Symbol] property name
  # @param description [String] human-readable description shown to the LLM
  # @return [self]
  def optional_boolean(name, description)
    add(name, 'boolean', description, required: false)
  end

  # Schema in OpenAI JSON-Schema shape.
  #
  # @return [Hash] +{type: 'object', properties: {...}, required: [...]}+
  def to_h
    { type: 'object', properties: @properties, required: @required }
  end

  # Validate a tool-call argument hash against the declared schema. Returns
  # a symbol-keyed hash safe to splat as kwargs into a tool's +execute+
  # Proc; raises {ValidationError} with an LLM-actionable message listing
  # every missing/unknown/mistyped field and reprinting the schema.
  #
  # Strict: unknown keys are rejected (with DidYouMean suggestions), wrong
  # types are rejected. All issues are collected and reported together so
  # the LLM can fix them in one round trip.
  #
  # @param args [Hash] arguments as decoded from the tool-call JSON; keys
  #   may be Strings or Symbols
  # @return [Hash{Symbol=>Object}] validated, symbol-keyed arguments
  # @raise [ValidationError] if +args+ is not a Hash, contains unknown
  #   keys, omits a required key, or has a value of the wrong type
  def validate(args)
    raise ValidationError, "Arguments must be an object, got #{args.class}." unless args.is_a?(Hash)

    # Keys that cannot be symbolized (neither String nor Symbol) are kept
    # as-is so they are reported as unknown parameters below instead of
    # escaping as a NoMethodError.
    symbolized = args.transform_keys { |key| key.respond_to?(:to_sym) ? key.to_sym : key }
    errors = []
    result = {}

    (symbolized.keys - @properties.keys).each do |unknown|
      errors << unknown_key_error(unknown)
    end

    @properties.each do |name, schema|
      if symbolized.key?(name)
        begin
          result[name] = coerce(symbolized[name], schema[:type])
        rescue CoercionError => e
          errors << "Parameter `#{name}` #{e.message}."
        end
      elsif @required.include?(name.to_s)
        errors << "Missing required parameter `#{name}` (#{schema[:type]}): #{schema[:description]}"
      end
    end

    return result if errors.empty?

    raise ValidationError, build_error_message(errors)
  end

  private

  # Internal coercion failure. Caught by {#validate} and turned into a
  # {ValidationError} message — never escapes the class.
  class CoercionError < StandardError; end
  private_constant :CoercionError

  # Register one property. The name is normalized to a Symbol because
  # {#validate} symbolizes incoming keys before looking them up; a
  # String-named property would otherwise never match. Re-declaring a
  # property replaces the earlier declaration, including its
  # required/optional status.
  #
  # @param name [Symbol, String] property name
  # @param type [String] JSON-Schema type name
  # @param description [String] human-readable description shown to the LLM
  # @param required [Boolean] whether the property is mandatory
  # @return [self]
  def add(name, type, description, required:)
    key = name.to_sym
    @properties[key] = { type: type, description: description }
    @required.delete(key.to_s)
    @required << key.to_s if required
    self
  end

  # Coerce +value+ to a Ruby value matching the JSON-Schema +type+,
  # returning the coerced value. Raises {CoercionError} on failure.
  #
  # @param value [Object] raw argument value
  # @param type [String] one of +string+, +integer+, +number+, +boolean+
  # @return [Object] coerced value
  # @raise [ArgumentError] when +type+ is not a supported schema type
  def coerce(value, type)
    case type
    when 'string'
      return value if value.is_a?(String)

      raise CoercionError, type_message('string', value)
    when 'integer'
      coerce_integer(value)
    when 'number'
      coerce_number(value)
    when 'boolean'
      coerce_boolean(value)
    else
      # Programmer error, not an LLM mistake: fail loudly rather than
      # silently handing +nil+ to the tool.
      raise ArgumentError, "unsupported schema type: #{type.inspect}"
    end
  end

  # @param value [Object] raw argument value
  # @return [Boolean]
  # @raise [CoercionError] unless +value+ is a boolean or "true"/"false"
  def coerce_boolean(value)
    return value if value == true || value == false

    if value.is_a?(String)
      case value.strip
      when 'true' then return true
      when 'false' then return false
      end
    end

    raise CoercionError, type_message('boolean', value)
  end

  # Matches a plain base-10 integer literal: optional sign, digits only.
  INTEGER_LITERAL = /\A[-+]?\d+\z/
  private_constant :INTEGER_LITERAL

  # @param value [Object] raw argument value
  # @return [Integer]
  # @raise [CoercionError] unless +value+ resolves to a whole number
  def coerce_integer(value)
    case value
    when Integer
      value
    when Float
      raise CoercionError, type_message('integer', value) unless value.finite? && value.modulo(1).zero?

      value.to_i
    when String
      trimmed = value.strip
      # Plain digit strings are parsed as Integers directly. Going through
      # Float would silently corrupt anything above 2**53
      # (e.g. "9007199254740993" -> 9007199254740992).
      return Integer(trimmed, 10) if trimmed.match?(INTEGER_LITERAL)

      # Remaining shapes ("3.0", "1e2") still go through the Float parser.
      parsed = parse_numeric_string(trimmed)
      raise CoercionError, type_message('integer', value) unless parsed && parsed.modulo(1).zero?

      parsed.to_i
    else
      raise CoercionError, type_message('integer', value)
    end
  end

  # @param value [Object] raw argument value
  # @return [Integer, Float]
  # @raise [CoercionError] unless +value+ is a finite number or numeric String
  def coerce_number(value)
    case value
    when Integer
      value
    when Float
      raise CoercionError, type_message('number', value) unless value.finite?

      value
    when String
      parsed = parse_numeric_string(value)
      raise CoercionError, type_message('number', value) unless parsed

      parsed
    else
      raise CoercionError, type_message('number', value)
    end
  end

  # Matches the decimal-numeric subset that JSON allows: optional sign,
  # mantissa (with optional fractional part), optional decimal exponent.
  # Rejects hex (+0x10+), underscores (+1_000+), +NaN+, +Infinity+.
  DECIMAL_NUMERIC = /\A[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?\z/
  private_constant :DECIMAL_NUMERIC

  # Strict base-10 numeric-string parse. Returns a finite Float, or +nil+
  # for empty/whitespace/garbage/hex/NaN/Infinity input.
  #
  # @param str [String] candidate numeric string
  # @return [Float, nil]
  def parse_numeric_string(str)
    trimmed = str.strip
    return nil unless trimmed.match?(DECIMAL_NUMERIC)

    parsed = Float(trimmed, exception: false)
    return nil unless parsed&.finite?

    parsed
  end

  # Build the "must be a/an <type>" fragment used in coercion errors.
  #
  # @param type [String] expected JSON-Schema type
  # @param value [Object] offending value
  # @return [String]
  def type_message(type, value)
    article = type == 'integer' ? 'an' : 'a'
    "must be #{article} #{type} (got #{value.class}: #{value.inspect})"
  end

  # Describe an unknown key, suggesting the closest declared name when
  # DidYouMean finds one and listing all valid names otherwise.
  #
  # @param unknown [Object] the unrecognized key
  # @return [String]
  def unknown_key_error(unknown)
    suggestion = DidYouMean::SpellChecker
                 .new(dictionary: @properties.keys.map(&:to_s))
                 .correct(unknown.to_s).first
    msg = "Unknown parameter `#{unknown}`."
    msg += suggestion ? " Did you mean `#{suggestion}`?" : " Valid parameters: #{valid_keys_list}."
    msg
  end

  # @return [String] comma-separated, backtick-quoted list of declared names
  def valid_keys_list
    @properties.keys.map { |k| "`#{k}`" }.join(', ')
  end

  # Assemble the full {ValidationError} message: the individual problems
  # followed by a reprint of the expected schema.
  #
  # @param errors [Array<String>] one entry per problem found
  # @return [String]
  def build_error_message(errors)
    [
      'Invalid arguments:',
      *errors.map { |e| "- #{e}" },
      '',
      'Expected schema:',
      *@properties.map { |name, prop|
        req = @required.include?(name.to_s) ? 'required' : 'optional'
        "  - `#{name}` (#{prop[:type]}, #{req}): #{prop[:description]}"
      }
    ].join("\n")
  end
end
|
|
313
|
+
end
|
|
314
|
+
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pikuri
|
|
4
|
+
class Tool
|
|
5
|
+
module Scraper
|
|
6
|
+
# Raised by anything in the scraper stack when a URL cannot be
|
|
7
|
+
# rendered into Markdown text — HTTP non-2xx, network failure,
|
|
8
|
+
# redirect-loop, missing +Location+, unsupported content-type, or a
|
|
9
|
+
# parse failure that reads as "try a different URL" to the LLM.
|
|
10
|
+
# Catching this in {Tool::WEB_SCRAPE} / {Tool::FETCH} turns the
|
|
11
|
+
# failure into an +"Error: ..."+ observation; anything else bubbles
|
|
12
|
+
# up so genuine bugs stay visible.
|
|
13
|
+
# Scraper-level failure that is reported back to the LLM rather than
# treated as a bug. Carries no state beyond the StandardError message.
class FetchError < StandardError
end
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
end
|
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
require 'nokogiri'
|
|
5
|
+
require 'readability'
|
|
6
|
+
require 'reverse_markdown'
|
|
7
|
+
|
|
8
|
+
module Pikuri
|
|
9
|
+
class Tool
|
|
10
|
+
module Scraper
|
|
11
|
+
# HTML → Markdown extractor used by {Simple.visit} when the fetched
|
|
12
|
+
# response carries an HTML content-type.
|
|
13
|
+
#
|
|
14
|
+
# Always renders both views of the page when available:
|
|
15
|
+
#
|
|
16
|
+
# 1. JSON-LD section. Any +<script type="application/ld+json">+ node
|
|
17
|
+
# whose +@type+ matches a substantive schema.org content type
|
|
18
|
+
# (Product, Article, Recipe, ...) is rendered as a header — title,
|
|
19
|
+
# metadata bullets (brand, SKU, price, rating, author, published),
|
|
20
|
+
# and the +articleBody+/+description+ copy when present.
|
|
21
|
+
# 2. Readability section. The page is run through +Readability+ +
|
|
22
|
+
# +reverse_markdown+, with a +<main>+/+<article>+ fallback for
|
|
23
|
+
# pages whose content sits mostly outside +<p>+ tags.
|
|
24
|
+
#
|
|
25
|
+
# Concatenated with a horizontal rule, so the LLM gets both the
|
|
26
|
+
# structured metadata and the rendered body and can pick whichever
|
|
27
|
+
# is more useful for the task. Trades some duplication (when a
|
|
28
|
+
# publisher embeds the article body in JSON-LD AND in HTML) for
|
|
29
|
+
# fewer type-based heuristics on which branch should win — the
|
|
30
|
+
# earlier "is this Article's +description+ a teaser or the real
|
|
31
|
+
# body?" carve-out is no longer needed because both end up in
|
|
32
|
+
# the output regardless.
|
|
33
|
+
#
|
|
34
|
+
# Pure parser — no I/O. {.extract} takes an HTML string and returns
|
|
35
|
+
# Markdown, so tests can drive it against fixture HTML without a
|
|
36
|
+
# network round-trip.
|
|
37
|
+
module HTML
  # @return [Array<String>] schema.org +@type+ values that we treat
  #   as "the primary entity of this page" when picking a JSON-LD
  #   node to render. Order does not matter — the first matching
  #   node wins. Skips noise nodes (Organization, BreadcrumbList,
  #   WebSite, ...) that ship on most pages but carry no page
  #   content.
  INTERESTING_TYPES = %w[
    Product Article NewsArticle BlogPosting Recipe Event Book Movie
  ].freeze

  # @return [Array<String>] HTML tags preserved by the readability
  #   pass. Anything outside this list is stripped before Markdown
  #   conversion.
  READABILITY_TAGS = %w[
    h1 h2 h3 h4 h5 h6 p div span ul ol li blockquote pre code a img
    strong em b i br hr table thead tbody tr td th
  ].freeze

  # @return [Array<String>] HTML attributes preserved by the
  #   readability pass; everything else (class, id, style, data-*)
  #   is dropped before Markdown conversion
  READABILITY_ATTRS = %w[href src alt title].freeze

  # @return [Float] minimum +<main>+/+<article>+ to Readability
  #   text-length ratio that triggers the semantic-container
  #   fallback in {.readability_to_markdown}. Picked low enough to
  #   catch the failure mode (Readability collapsing a page that
  #   uses divs/lists instead of +<p>+ — e.g. +vaadin.com/company+,
  #   ~5x) but high enough that pages where both produce
  #   comparable output keep Readability's noise filtering.
  MAIN_FALLBACK_RATIO = 2.0

  # @return [Integer] minimum text length the
  #   +<main>+/+<article>+ container must hold before the fallback
  #   in {.readability_to_markdown} can fire. Below this, the
  #   ratio comparison is dominated by noise and we'd swap on
  #   tiny pages where Readability is doing the right thing.
  MAIN_FALLBACK_MIN_CHARS = 500

  # Render +html+ as Markdown by emitting both the JSON-LD section
  # (when an interesting node is present) and the readability /
  # +<main>+ section, joined by a horizontal rule. Either section
  # may be missing — pages with no JSON-LD return only the
  # readability output, and a malformed page with no extractable
  # body returns only the JSON-LD render.
  #
  # @param html [String] HTML document body
  # @return [String] Markdown representation
  def self.extract(html)
    sections = [jsonld_section(html), readability_to_markdown(html)]
    sections.reject! { |s| s.nil? || s.strip.empty? }
    sections.join("\n\n---\n\n")
  end

  # Pick the first JSON-LD node whose +@type+ matches one of
  # {INTERESTING_TYPES} and render it as Markdown. Returns +nil+
  # when no such node exists, in which case {.extract} emits only
  # the readability section.
  #
  # No content-field gating: a node carrying just +name+/+author+/
  # +datePublished+ still renders (as a metadata-only header),
  # because the readability pass independently produces the page
  # body. Duplication is acceptable when both views are available,
  # and the LLM can pick whichever it needs.
  #
  # @param html [String] HTML document body
  # @return [String, nil] Markdown render of the picked JSON-LD
  #   node, or +nil+ when nothing matched
  def self.jsonld_section(html)
    node = parse_jsonld(html).find do |n|
      Array(n['@type']).any? { |t| INTERESTING_TYPES.include?(t) }
    end
    node ? jsonld_to_markdown(node) : nil
  end

  # Collect every JSON-LD payload embedded in +html+, flattening
  # +@graph+ wrappers so callers see one flat array of schema.org
  # nodes. Malformed JSON blocks are silently skipped — sites
  # frequently ship broken JSON-LD and we only need at least one
  # parseable block.
  #
  # Only object (Hash) nodes are returned. Valid-JSON-but-useless
  # payloads such as +null+, bare numbers, or scalars inside an
  # array or +@graph+ are dropped here, so callers can index into
  # every node without a +NoMethodError+/+TypeError+.
  #
  # @param html [String] HTML document body
  # @return [Array<Hash>] parsed JSON-LD nodes; possibly empty
  def self.parse_jsonld(html)
    doc = Nokogiri::HTML(html)
    blobs = doc.css('script[type="application/ld+json"]').map(&:text)

    blobs.flat_map do |raw|
      parsed = begin
        JSON.parse(raw)
      rescue JSON::ParserError
        nil
      end
      next [] unless parsed

      top_level = parsed.is_a?(Array) ? parsed : [parsed]
      top_level.grep(Hash).flat_map do |n|
        n['@graph'].is_a?(Array) ? n['@graph'].grep(Hash) : [n]
      end
    end
  end

  # Render a single JSON-LD +node+ as Markdown: a top-level title
  # from +name+/+headline+, a bullet list of common useful fields
  # (brand, SKU, price, rating, author, published date, ...), the
  # body copy, and the lead image.
  #
  # When the node carries +articleBody+ (the full publisher-supplied
  # article text), that wins over +description+ — the description
  # is typically a lede teaser and would just repeat the article's
  # opening lines.
  #
  # @param node [Hash] JSON-LD node, typically picked by
  #   {.jsonld_section}
  # @return [String] Markdown representation
  def self.jsonld_to_markdown(node)
    out = +''
    name = node['name'] || node['headline']
    out << "# #{name}\n\n" if name

    offer  = first_obj(node['offers'])
    rating = first_obj(node['aggregateRating'])
    brand  = first_obj_or_string(node['brand'])
    author = first_obj_or_string(node['author'])

    brand_name  = brand.is_a?(Hash) ? brand['name'] : brand
    author_name = author.is_a?(Hash) ? author['name'] : author

    # Blank values (nil, empty, whitespace-only) are dropped so the
    # bullet list only shows fields the publisher actually filled in.
    fields = {
      'Brand' => brand_name,
      'SKU' => node['sku'],
      'GTIN' => node['gtin13'] || node['gtin'],
      'Price' => [offer['price'], offer['priceCurrency']].compact.join(' '),
      'Availability' => offer['availability'],
      'Rating' => rating['ratingValue'],
      'Reviews' => rating['reviewCount'],
      'Author' => author_name,
      'Published' => node['datePublished']
    }.reject { |_, v| v.nil? || v.to_s.strip.empty? }

    unless fields.empty?
      fields.each { |k, v| out << "- **#{k}:** #{v}\n" }
      out << "\n"
    end

    if (body = node['articleBody'] || node['description'])
      out << "#{body}\n\n"
    end

    # +image+ may be a URL string, an ImageObject hash, or a list of
    # either; reduce it to a single URL and emit it as a Markdown
    # image. Previously the URL was resolved but never written out.
    img = node['image']
    img = img.first if img.is_a?(Array)
    img = img['url'] if img.is_a?(Hash)
    out << "![](#{img})\n\n" if img.is_a?(String) && !img.strip.empty?

    out
  end

  # Run +Readability+ over +html+ to isolate the main content node,
  # then convert that to Markdown via +reverse_markdown+. The page
  # +<title>+ is rendered as a top-level heading.
  #
  # When the page uses semantic HTML5 (+<main>+ or +<article>+) but
  # leaves most of its content outside +<p>+ tags — divs, lists,
  # spans — Readability's paragraph-density scoring collapses the
  # extraction to a sliver of the page. In that case we render the
  # +<main>+/+<article>+ container directly. The fallback only
  # fires when the container holds substantially more text than
  # Readability picked up (see {MAIN_FALLBACK_RATIO} /
  # {MAIN_FALLBACK_MIN_CHARS}); on pages where both agree we keep
  # Readability so its noise filtering still strips nav/ads/etc.
  #
  # @param html [String] HTML document body
  # @return [String] Markdown representation
  def self.readability_to_markdown(html)
    rdoc = Readability::Document.new(
      html,
      tags: READABILITY_TAGS,
      attributes: READABILITY_ATTRS,
      remove_empty_nodes: true
    )
    readability_html = rdoc.content
    title = rdoc.title

    body_html = main_fallback_html(html, readability_html) || readability_html
    body = ReverseMarkdown.convert(body_html, unknown_tags: :bypass, github_flavored: true)

    out = +''
    out << "# #{title.strip}\n\n" if title && !title.strip.empty?
    out << body
    out
  end

  # If +html+ has a +<main>+ or +<article>+ element holding
  # substantially more text than Readability extracted, return that
  # container's HTML so the caller can render it instead. Returns
  # +nil+ when the fallback should not fire — when there is no
  # semantic container, when it's too small to be meaningful, or
  # when Readability's output is already comparable.
  #
  # @param html [String] full HTML document body, used to locate
  #   the +<main>+/+<article>+ container
  # @param readability_html [String] HTML produced by
  #   +Readability::Document#content+, used as the comparison
  #   baseline
  # @return [String, nil] container HTML when the fallback should
  #   fire, +nil+ otherwise
  def self.main_fallback_html(html, readability_html)
    doc = Nokogiri::HTML(html)
    container = doc.at_css('main') || doc.at_css('article')
    return nil unless container

    # Lengths are measured on whitespace-collapsed text so markup
    # indentation does not skew the comparison.
    container_text_len = container.text.gsub(/\s+/, ' ').strip.length
    return nil if container_text_len < MAIN_FALLBACK_MIN_CHARS

    readability_text_len = Nokogiri::HTML(readability_html).text.gsub(/\s+/, ' ').strip.length
    return nil if container_text_len < MAIN_FALLBACK_RATIO * readability_text_len

    container.to_html
  end
  private_class_method :main_fallback_html

  # JSON-LD fields can be a string, hash, or array of either.
  # Normalize to a single hash (the first one if it's a list) so
  # callers can index into the result safely.
  #
  # @param value [Object] raw JSON-LD field value
  # @return [Hash] empty hash when +value+ does not contain a hash
  def self.first_obj(value)
    value = value.first if value.is_a?(Array)
    value.is_a?(Hash) ? value : {}
  end
  private_class_method :first_obj

  # Same idea as {.first_obj} but preserves a bare string (e.g.
  # +brand: "Apple"+) instead of replacing it with +{}+.
  #
  # @param value [Object] raw JSON-LD field value
  # @return [String, Hash, nil]
  def self.first_obj_or_string(value)
    value = value.first if value.is_a?(Array)
    value
  end
  private_class_method :first_obj_or_string
end
|
|
283
|
+
end
|
|
284
|
+
end
|
|
285
|
+
end
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'pdf-reader'
|
|
4
|
+
require 'stringio'
|
|
5
|
+
|
|
6
|
+
module Pikuri
|
|
7
|
+
class Tool
|
|
8
|
+
module Scraper
|
|
9
|
+
# PDF → text extractor used by {Simple.visit} when the fetched
|
|
10
|
+
# response carries +application/pdf+. Wraps the +pdf-reader+ gem:
|
|
11
|
+
# walk every page, concatenate the extracted text, hand the result
|
|
12
|
+
# back as a single string the LLM can read.
|
|
13
|
+
#
|
|
14
|
+
# Best-effort by design. +pdf-reader+ produces clean text from PDFs
|
|
15
|
+
# generated from a digital source (LaTeX, Word export, ...) but
|
|
16
|
+
# returns nothing useful from scanned documents — there is no OCR
|
|
17
|
+
# in this path. When extraction yields no text we still return an
|
|
18
|
+
# empty string rather than raising, so the caller's cache stores a
|
|
19
|
+
# consistent result and the LLM sees an empty observation it can
|
|
20
|
+
# react to.
|
|
21
|
+
#
|
|
22
|
+
# Pure parser — no I/O. {.extract} takes PDF bytes and returns text,
|
|
23
|
+
# so tests can drive it against an in-memory fixture without
|
|
24
|
+
# touching the network.
|
|
25
|
+
module PDF
  # Extract the text of every page in +bytes+ and return it as one
  # string, one paragraph per page (pages separated by a blank line).
  # Pages whose text is empty after trimming are left out entirely.
  #
  # The three +pdf-reader+ errors rescued below —
  # {::PDF::Reader::MalformedPDFError}, {::PDF::Reader::InvalidPageError}
  # and {::PDF::Reader::UnsupportedFeatureError} — are translated into
  # {FetchError} so the caller can surface them to the LLM as a
  # "try a different URL" observation. Any other exception is left
  # to propagate unchanged.
  #
  # @param bytes [String] raw PDF document (binary string)
  # @return [String] concatenated page text; empty when no page
  #   yields any text
  # @raise [FetchError] when +pdf-reader+ refuses the document
  def self.extract(bytes)
    document = ::PDF::Reader.new(StringIO.new(bytes))

    page_texts = document.pages.each_with_object([]) do |page, kept|
      text = page.text.strip
      kept << text unless text.empty?
    end

    page_texts.join("\n\n")
  rescue ::PDF::Reader::MalformedPDFError,
         ::PDF::Reader::InvalidPageError,
         ::PDF::Reader::UnsupportedFeatureError => e
    # Report only the unqualified error class name, e.g. "MalformedPDFError".
    short_name = e.class.name.split('::').last
    raise FetchError, "PDF rendering failed: #{short_name}: #{e.message}"
  end
end
|
|
52
|
+
end
|
|
53
|
+
end
|
|
54
|
+
end
|