pikuri 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +62 -0
- data/GETTING_STARTED.md +223 -0
- data/LICENSE +21 -0
- data/README.md +193 -0
- data/lib/pikuri/agent/chat_transport.rb +41 -0
- data/lib/pikuri/agent/context_window_detector.rb +101 -0
- data/lib/pikuri/agent/listener/in_memory_message_list.rb +33 -0
- data/lib/pikuri/agent/listener/message_listener.rb +93 -0
- data/lib/pikuri/agent/listener/step_limit.rb +97 -0
- data/lib/pikuri/agent/listener/terminal.rb +137 -0
- data/lib/pikuri/agent/listener/token_log.rb +166 -0
- data/lib/pikuri/agent/listener_list.rb +113 -0
- data/lib/pikuri/agent/message.rb +61 -0
- data/lib/pikuri/agent/synthesizer.rb +120 -0
- data/lib/pikuri/agent/tokens.rb +56 -0
- data/lib/pikuri/agent.rb +286 -0
- data/lib/pikuri/subprocess.rb +166 -0
- data/lib/pikuri/tool/bash.rb +272 -0
- data/lib/pikuri/tool/calculator.rb +82 -0
- data/lib/pikuri/tool/confirmer.rb +96 -0
- data/lib/pikuri/tool/edit.rb +196 -0
- data/lib/pikuri/tool/fetch.rb +167 -0
- data/lib/pikuri/tool/glob.rb +310 -0
- data/lib/pikuri/tool/grep.rb +338 -0
- data/lib/pikuri/tool/parameters.rb +314 -0
- data/lib/pikuri/tool/read.rb +254 -0
- data/lib/pikuri/tool/scraper/fetch_error.rb +16 -0
- data/lib/pikuri/tool/scraper/html.rb +285 -0
- data/lib/pikuri/tool/scraper/pdf.rb +54 -0
- data/lib/pikuri/tool/scraper/simple.rb +177 -0
- data/lib/pikuri/tool/search/brave.rb +184 -0
- data/lib/pikuri/tool/search/duckduckgo.rb +196 -0
- data/lib/pikuri/tool/search/engines.rb +154 -0
- data/lib/pikuri/tool/search/exa.rb +217 -0
- data/lib/pikuri/tool/search/rate_limiter.rb +92 -0
- data/lib/pikuri/tool/search/result.rb +29 -0
- data/lib/pikuri/tool/skill.rb +80 -0
- data/lib/pikuri/tool/skill_catalog.rb +376 -0
- data/lib/pikuri/tool/sub_agent.rb +102 -0
- data/lib/pikuri/tool/web_scrape.rb +117 -0
- data/lib/pikuri/tool/web_search.rb +38 -0
- data/lib/pikuri/tool/workspace.rb +150 -0
- data/lib/pikuri/tool/write.rb +170 -0
- data/lib/pikuri/tool.rb +118 -0
- data/lib/pikuri/url_cache.rb +106 -0
- data/lib/pikuri/version.rb +10 -0
- data/lib/pikuri.rb +165 -0
- data/prompts/coding-system-prompt.txt +28 -0
- data/prompts/pikuri-chat.txt +15 -0
- metadata +259 -0
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pikuri
  class Tool
    # The +read+ tool, expressed as a {Tool} subclass: instantiating
    # +Tool::Read.new(workspace: ws)+ produces a tool whose
    # {Tool#to_ruby_llm_tool} wiring is identical to any bundled tool's,
    # so ruby_llm sees nothing special about it. Same shape as
    # {Tool::SubAgent} — workspace is captured by the +execute+ closure
    # at construction.
    #
    # == Output format
    #
    # cat-n: each line is rendered as +"%6d\t%s"+ (six-column right-
    # padded line number, tab, content). Chosen for breadth of training-
    # data exposure: +cat -n+ output shows up across virtually every Unix
    # tutorial and Stack Overflow answer, so even small local models
    # recognize the shape. opencode's shorter +"<n>: <content>"+ format
    # saves a few thousand tokens per 2K-line file but trades model
    # familiarity; pi omits line numbers entirely (cheapest tokens, but
    # the model loses the ability to cite ranges or pick {Edit}
    # boundaries precisely).
    #
    # == Truncation rules
    #
    # Two independent limits, whichever fires first wins:
    #
    # * *Line limit* — {DEFAULT_LIMIT} lines (overridable via +limit+).
    # * *Byte cap* — {MAX_BYTES} bytes of input content; not exposed as a
    #   parameter. Bypassable in practice by paging via +offset+.
    #
    # Additionally, individual lines longer than {MAX_LINE_LENGTH} chars
    # are truncated with {LINE_TRUNCATION_MARKER} appended; the model is
    # told to reach for +grep+ to find content inside such files.
    #
    # == Refusals
    #
    # * Path outside the workspace → caught from
    #   {Tool::Workspace::Error}, returned as +"Error: ..."+.
    # * File not found, EACCES → +"Error: ..."+.
    # * Path is a directory → +"Error: ... use the glob tool"+, keeping
    #   directory listing as the glob tool's responsibility (Step 9).
    # * Binary content → sniffed from the first {BINARY_SAMPLE_BYTES} of
    #   the file: any +NUL+ byte, or more than {BINARY_NONPRINTABLE_THRESHOLD}
    #   non-printable bytes (control chars outside +\t \n \v \f \r+),
    #   triggers refusal. Catches images, PDFs, archives, and compiled
    #   artifacts without an extension list to maintain.
    # * Offset past EOF → +"Error: offset N is beyond end of file (M lines total)"+.
    class Read < Tool
      # @return [Integer] default value of the +limit+ parameter (number
      #   of lines to read per call).
      DEFAULT_LIMIT = 2000

      # @return [Integer] per-line character cap; longer lines are
      #   truncated with {LINE_TRUNCATION_MARKER}. Measured in characters
      #   (String#length), not bytes.
      MAX_LINE_LENGTH = 2000

      # @return [String] suffix appended to lines truncated by
      #   {MAX_LINE_LENGTH}.
      LINE_TRUNCATION_MARKER = "... (line truncated to #{MAX_LINE_LENGTH} chars)"

      # @return [Integer] hard byte cap on input content collected per
      #   call. Counted on the line bytes (plus one for the joining
      #   newline); the rendered output is slightly larger due to the
      #   per-line +"%6d\t"+ prefix.
      MAX_BYTES = 50 * 1024

      # @return [String] human-readable form of {MAX_BYTES} for the
      #   continuation marker.
      MAX_BYTES_LABEL = "#{MAX_BYTES / 1024} KB"

      # @return [Integer] number of bytes sampled from the start of the
      #   file for binary-content detection.
      BINARY_SAMPLE_BYTES = 4096

      # @return [Float] fraction of the sample that may be non-printable
      #   before the file is classified as binary. Matches opencode's
      #   30%.
      BINARY_NONPRINTABLE_THRESHOLD = 0.30

      # Description shown to the LLM. Follows the opencode-shape (summary
      # + +Usage:+ bullets) prescribed by the project's tool-description
      # convention. Per-parameter constraints (defaults, format) live in
      # the parameter descriptions, not here.
      #
      # @return [String]
      DESCRIPTION = <<~DESC
        Read a file from the workspace and return its contents with line numbers.

        Usage:
        - Output is line-numbered in `cat -n` style so subsequent edits can reference exact line numbers.
        - Use `offset` and `limit` to page through large files; when the response ends in `Use offset=N to continue`, call again with that offset.
        - Lines longer than #{MAX_LINE_LENGTH} chars are truncated with a marker — use `grep` for content inside such files.
        - Binary files (images, PDFs, archives, compiled artifacts) are refused; this tool reads text only.
        - Directories are refused — use the `glob` tool to list files.
        - If unsure of the path, use `glob` first to look up filenames.
        - Avoid tiny repeated slices — if you need more context, read a larger window.
      DESC

      # @param workspace [Tool::Workspace] captured for path resolution;
      #   all reads route through +workspace.resolve_for_read+.
      # @return [Read]
      def initialize(workspace:)
        super(
          name: 'read',
          description: DESCRIPTION,
          parameters: Parameters.build { |p|
            p.required_string :path,
                              'Path to the file to read. Relative paths ' \
                              'resolve against the workspace root, e.g. ' \
                              '"lib/foo.rb" or "/abs/path/to/file.txt".'
            p.optional_integer :offset,
                               'Line number to start reading from (1-indexed). ' \
                               "Defaults to 1, e.g. 200."
            p.optional_integer :limit,
                               'Maximum number of lines to read. Defaults to ' \
                               "#{DEFAULT_LIMIT}, e.g. 500."
          },
          # The closure captures +workspace+, so the class-level entry
          # point stays a pure function of its keyword arguments.
          execute: ->(path:, offset: 1, limit: DEFAULT_LIMIT) {
            Read.read(workspace: workspace, path: path, offset: offset, limit: limit)
          }
        )
      end

      # Resolve +path+ against +workspace+, refuse directories / binaries /
      # missing files, and return either the cat-n-formatted slice or an
      # +"Error: ..."+ observation. Never raises for the failure modes the
      # LLM can react to — those all come back as observations.
      #
      # @param workspace [Tool::Workspace]
      # @param path [String] raw path as supplied by the LLM
      # @param offset [Integer] 1-indexed line number to start at
      # @param limit [Integer] maximum lines to return
      # @return [String] tool observation
      def self.read(workspace:, path:, offset:, limit:)
        return "Error: offset must be >= 1, got #{offset}" if offset < 1
        return "Error: limit must be >= 1, got #{limit}" if limit < 1

        resolved = workspace.resolve_for_read(path)
        return "Error: file not found: #{path}" unless resolved.exist?
        return "Error: #{path} is a directory; use the glob tool to list files." if resolved.directory?

        sample = read_sample(resolved)
        return "Error: cannot read binary file: #{path}" if binary?(sample)

        format_slice(path: path, resolved: resolved, offset: offset, limit: limit)
      rescue Tool::Workspace::Error => e
        "Error: #{e.message}"
      rescue Errno::EACCES => e
        "Error: cannot read #{path}: #{e.message}"
      end

      # Read up to {BINARY_SAMPLE_BYTES} of the file in binary mode for
      # the {.binary?} sniff. Returns an empty String for an empty file
      # (which {.binary?} treats as not-binary); +io.read+ returns +nil+
      # at EOF, hence the +|| +''+ fallback.
      #
      # @param resolved [Pathname]
      # @return [String] raw bytes (ASCII-8BIT encoding)
      def self.read_sample(resolved)
        resolved.open('rb') { |io| io.read(BINARY_SAMPLE_BYTES) || +'' }
      end
      private_class_method :read_sample

      # Heuristic binary classifier matching opencode's: any NUL byte
      # forces +true+; otherwise count bytes outside the printable +\t \n
      # \v \f \r+ + ASCII-32..126 range and ratio against the sample
      # size. UTF-8 continuation bytes (0x80-0xBF) are >127 so they sit
      # outside the non-printable ranges and pass through unflagged,
      # letting UTF-8 text read fine.
      #
      # Public because {Tool::Edit} reuses it to refuse binary targets —
      # if Edit accepted a binary file the model has no way to have read,
      # it could corrupt bytes the model never inspected. Same sniff, same
      # threshold, one definition.
      #
      # @param bytes [String] sample bytes
      # @return [Boolean]
      def self.binary?(bytes)
        return false if bytes.empty?

        non_printable = 0
        bytes.each_byte do |b|
          return true if b.zero?

          # Control chars below \t (9) or between \r (13) and space (32).
          non_printable += 1 if b < 9 || (b > 13 && b < 32)
        end
        non_printable.to_f / bytes.bytesize > BINARY_NONPRINTABLE_THRESHOLD
      end

      # Stream the file line-by-line, collect at most +limit+ lines
      # starting at +offset+, and stop early if {MAX_BYTES} is reached.
      # We keep counting lines past the collection window so the trailer
      # can report total line count when the line limit (not the byte
      # cap) was the stopping criterion — same trick opencode uses.
      #
      # @return [String]
      def self.format_slice(path:, resolved:, offset:, limit:)
        start_index = offset - 1
        collected = []
        total_lines = 0
        bytes = 0
        byte_cap_hit = false
        has_more = false

        resolved.each_line do |raw|
          total_lines += 1
          next if total_lines <= start_index

          # Past the line limit: stop collecting but keep counting so the
          # trailer can report the true total.
          if collected.length >= limit
            has_more = true
            next
          end

          line = raw.chomp
          if line.length > MAX_LINE_LENGTH
            line = line[0, MAX_LINE_LENGTH] + LINE_TRUNCATION_MARKER
          end

          size = line.bytesize + 1 # +1 for the joining newline
          if bytes + size > MAX_BYTES
            byte_cap_hit = true
            has_more = true
            break
          end

          collected << line
          bytes += size
        end

        return '(Empty file)' if total_lines.zero?

        if start_index >= total_lines
          return "Error: offset #{offset} is beyond end of file (#{total_lines} lines total)"
        end

        last_line = offset + collected.length - 1
        # i is the index within the collected window; i + offset is the
        # absolute 1-indexed line number shown in the gutter.
        body = collected.each_with_index.map { |line, i| format("%6d\t%s", i + offset, line) }.join("\n")

        trailer =
          if byte_cap_hit
            "(Output capped at #{MAX_BYTES_LABEL}. Showing lines #{offset}-#{last_line}. " \
            "Use offset=#{last_line + 1} to continue.)"
          elsif has_more
            "(Showing lines #{offset}-#{last_line} of #{total_lines}. " \
            "Use offset=#{last_line + 1} to continue.)"
          else
            "(End of file - total #{total_lines} lines)"
          end

        "#{body}\n\n#{trailer}"
      end
      private_class_method :format_slice
    end
  end
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pikuri
  class Tool
    module Scraper
      # Error type for the scraper stack: raised whenever a URL cannot
      # be rendered into Markdown text. Covers HTTP non-2xx responses,
      # network failures, redirect loops, a missing +Location+ header,
      # unsupported content-types, and parse failures whose practical
      # meaning to the LLM is "try a different URL".
      #
      # {Tool::WEB_SCRAPE} and {Tool::FETCH} rescue this class and
      # convert the failure into an +"Error: ..."+ observation. Any
      # other exception class is deliberately left to propagate, so
      # genuine bugs stay visible rather than being masked as fetch
      # failures.
      class FetchError < StandardError
      end
    end
  end
end
|
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
require 'nokogiri'
|
|
5
|
+
require 'readability'
|
|
6
|
+
require 'reverse_markdown'
|
|
7
|
+
|
|
8
|
+
module Pikuri
  class Tool
    module Scraper
      # HTML → Markdown extractor used by {Simple.visit} when the fetched
      # response carries an HTML content-type.
      #
      # Always renders both views of the page when available:
      #
      # 1. JSON-LD section. Any +<script type="application/ld+json">+ node
      #    whose +@type+ matches a substantive schema.org content type
      #    (Product, Article, Recipe, ...) is rendered as a header — title,
      #    metadata bullets (brand, SKU, price, rating, author, published),
      #    and the +articleBody+/+description+ copy when present.
      # 2. Readability section. The page is run through +Readability+ +
      #    +reverse_markdown+, with a +<main>+/+<article>+ fallback for
      #    pages whose content sits mostly outside +<p>+ tags.
      #
      # Concatenated with a horizontal rule, so the LLM gets both the
      # structured metadata and the rendered body and can pick whichever
      # is more useful for the task. Trades some duplication (when a
      # publisher embeds the article body in JSON-LD AND in HTML) for
      # fewer type-based heuristics on which branch should win — the
      # earlier "is this Article's +description+ a teaser or the real
      # body?" carve-out is no longer needed because both end up in
      # the output regardless.
      #
      # Pure parser — no I/O. {.extract} takes an HTML string and returns
      # Markdown, so tests can drive it against fixture HTML without a
      # network round-trip.
      module HTML
        # @return [Array<String>] schema.org +@type+ values that we treat
        #   as "the primary entity of this page" when picking a JSON-LD
        #   node to render. Order does not matter — the first matching
        #   node wins. Skips noise nodes (Organization, BreadcrumbList,
        #   WebSite, ...) that ship on most pages but carry no page
        #   content.
        INTERESTING_TYPES = %w[
          Product Article NewsArticle BlogPosting Recipe Event Book Movie
        ].freeze

        # @return [Array<String>] HTML tags preserved by the readability
        #   pass. Anything outside this list is stripped before Markdown
        #   conversion.
        READABILITY_TAGS = %w[
          h1 h2 h3 h4 h5 h6 p div span ul ol li blockquote pre code a img
          strong em b i br hr table thead tbody tr td th
        ].freeze

        # @return [Array<String>] HTML attributes preserved by the
        #   readability pass; everything else (class, id, style, data-*)
        #   is dropped before Markdown conversion
        READABILITY_ATTRS = %w[href src alt title].freeze

        # @return [Float] minimum +<main>+/+<article>+ to Readability
        #   text-length ratio that triggers the semantic-container
        #   fallback in {.readability_to_markdown}. Picked low enough to
        #   catch the failure mode (Readability collapsing a page that
        #   uses divs/lists instead of +<p>+ — e.g. +vaadin.com/company+,
        #   ~5x) but high enough that pages where both produce
        #   comparable output keep Readability's noise filtering.
        MAIN_FALLBACK_RATIO = 2.0

        # @return [Integer] minimum text length the
        #   +<main>+/+<article>+ container must hold before the fallback
        #   in {.readability_to_markdown} can fire. Below this, the
        #   ratio comparison is dominated by noise and we'd swap on
        #   tiny pages where Readability is doing the right thing.
        MAIN_FALLBACK_MIN_CHARS = 500

        # Render +html+ as Markdown by emitting both the JSON-LD section
        # (when an interesting node is present) and the readability /
        # +<main>+ section, joined by a horizontal rule. Either section
        # may be missing — pages with no JSON-LD return only the
        # readability output, and a malformed page with no extractable
        # body returns only the JSON-LD render.
        #
        # @param html [String] HTML document body
        # @return [String] Markdown representation
        def self.extract(html)
          sections = [jsonld_section(html), readability_to_markdown(html)]
          sections.reject! { |s| s.nil? || s.strip.empty? }
          sections.join("\n\n---\n\n")
        end

        # Pick the first JSON-LD node whose +@type+ matches one of
        # {INTERESTING_TYPES} and render it as Markdown. Returns +nil+
        # when no such node exists, in which case {.extract} emits only
        # the readability section.
        #
        # No content-field gating: a node carrying just +name+/+author+/
        # +datePublished+ still renders (as a metadata-only header),
        # because the readability pass independently produces the page
        # body.
        #
        # @param html [String] HTML document body
        # @return [String, nil] Markdown render of the picked JSON-LD
        #   node, or +nil+ when nothing matched
        def self.jsonld_section(html)
          node = parse_jsonld(html).find do |n|
            Array(n['@type']).any? { |t| INTERESTING_TYPES.include?(t) }
          end
          node ? jsonld_to_markdown(node) : nil
        end

        # Collect every JSON-LD payload embedded in +html+, flattening
        # +@graph+ wrappers so callers see one flat array of schema.org
        # nodes. Malformed JSON blocks are silently skipped — sites
        # frequently ship broken JSON-LD and we only need at least one
        # parseable block.
        #
        # @param html [String] HTML document body
        # @return [Array<Hash>] parsed JSON-LD nodes; possibly empty
        def self.parse_jsonld(html)
          doc = Nokogiri::HTML(html)
          blobs = doc.css('script[type="application/ld+json"]').map(&:text)

          blobs.flat_map do |raw|
            parsed = begin
              JSON.parse(raw)
            rescue JSON::ParserError
              nil
            end
            next [] unless parsed

            nodes = parsed.is_a?(Array) ? parsed : [parsed]
            nodes.flat_map { |n| n['@graph'].is_a?(Array) ? n['@graph'] : [n] }
          end
        end

        # Render a single JSON-LD +node+ as Markdown: a top-level title
        # from +name+/+headline+, a bullet list of common useful fields
        # (brand, SKU, price, rating, author, published date, ...), the
        # body copy, and the lead image.
        #
        # When the node carries +articleBody+ (the full publisher-supplied
        # article text), that wins over +description+ — the description
        # is typically a lede teaser and would just repeat the article's
        # opening lines.
        #
        # @param node [Hash] JSON-LD node, typically picked by
        #   {.jsonld_section}
        # @return [String] Markdown representation
        def self.jsonld_to_markdown(node)
          out = +''
          name = node['name'] || node['headline']
          out << "# #{name}\n\n" if name

          offer = first_obj(node['offers'])
          rating = first_obj(node['aggregateRating'])
          brand = first_obj_or_string(node['brand'])
          author = first_obj_or_string(node['author'])

          brand_name = brand.is_a?(Hash) ? brand['name'] : brand
          author_name = author.is_a?(Hash) ? author['name'] : author

          fields = {
            'Brand' => brand_name,
            'SKU' => node['sku'],
            'GTIN' => node['gtin13'] || node['gtin'],
            'Price' => [offer['price'], offer['priceCurrency']].compact.join(' '),
            'Availability' => offer['availability'],
            'Rating' => rating['ratingValue'],
            'Reviews' => rating['reviewCount'],
            'Author' => author_name,
            'Published' => node['datePublished']
          }.reject { |_, v| v.nil? || v.to_s.strip.empty? }

          unless fields.empty?
            fields.each { |k, v| out << "- **#{k}:** #{v}\n" }
            out << "\n"
          end

          if (body = node['articleBody'] || node['description'])
            out << "#{body}\n\n"
          end

          if (img = node['image'])
            img = img.first if img.is_a?(Array)
            img = img['url'] if img.is_a?(Hash)
            # FIX: the resolved image URL used to be discarded — only a
            # bare "\n\n" was appended, so the "lead image" promised by
            # the docs never appeared in the output. Render it as an
            # actual Markdown image, guarding against nodes whose
            # +image+ normalizes to something other than a URL string.
            out << "![](#{img})\n\n" if img.is_a?(String) && !img.strip.empty?
          end

          out
        end

        # Run +Readability+ over +html+ to isolate the main content node,
        # then convert that to Markdown via +reverse_markdown+. The page
        # +<title>+ is rendered as a top-level heading.
        #
        # When the page uses semantic HTML5 (+<main>+ or +<article>+) but
        # leaves most of its content outside +<p>+ tags — divs, lists,
        # spans — Readability's paragraph-density scoring collapses the
        # extraction to a sliver of the page. In that case we render the
        # +<main>+/+<article>+ container directly. The fallback only
        # fires when the container holds substantially more text than
        # Readability picked up (see {MAIN_FALLBACK_RATIO} /
        # {MAIN_FALLBACK_MIN_CHARS}); on pages where both agree we keep
        # Readability so its noise filtering still strips nav/ads/etc.
        #
        # @param html [String] HTML document body
        # @return [String] Markdown representation
        def self.readability_to_markdown(html)
          rdoc = Readability::Document.new(
            html,
            tags: READABILITY_TAGS,
            attributes: READABILITY_ATTRS,
            remove_empty_nodes: true
          )
          readability_html = rdoc.content
          title = rdoc.title

          body_html = main_fallback_html(html, readability_html) || readability_html
          body = ReverseMarkdown.convert(body_html, unknown_tags: :bypass, github_flavored: true)

          out = +''
          out << "# #{title.strip}\n\n" if title && !title.strip.empty?
          out << body
          out
        end

        # If +html+ has a +<main>+ or +<article>+ element holding
        # substantially more text than Readability extracted, return that
        # container's HTML so the caller can render it instead. Returns
        # +nil+ when the fallback should not fire — when there is no
        # semantic container, when it's too small to be meaningful, or
        # when Readability's output is already comparable.
        #
        # @param html [String] full HTML document body, used to locate
        #   the +<main>+/+<article>+ container
        # @param readability_html [String] HTML produced by
        #   +Readability::Document#content+, used as the comparison
        #   baseline
        # @return [String, nil] container HTML when the fallback should
        #   fire, +nil+ otherwise
        def self.main_fallback_html(html, readability_html)
          doc = Nokogiri::HTML(html)
          container = doc.at_css('main') || doc.at_css('article')
          return nil unless container

          container_text_len = container.text.gsub(/\s+/, ' ').strip.length
          return nil if container_text_len < MAIN_FALLBACK_MIN_CHARS

          readability_text_len = Nokogiri::HTML(readability_html).text.gsub(/\s+/, ' ').strip.length
          return nil if container_text_len < MAIN_FALLBACK_RATIO * readability_text_len

          container.to_html
        end
        private_class_method :main_fallback_html

        # JSON-LD fields can be a string, hash, or array of either.
        # Normalize to a single hash (the first one if it's a list) so
        # callers can +.dig+ safely.
        #
        # @param value [Object] raw JSON-LD field value
        # @return [Hash] empty hash when +value+ does not contain a hash
        def self.first_obj(value)
          value = value.first if value.is_a?(Array)
          value.is_a?(Hash) ? value : {}
        end
        private_class_method :first_obj

        # Same idea as {.first_obj} but preserves a bare string (e.g.
        # +brand: "Apple"+) instead of replacing it with +{}+.
        #
        # @param value [Object] raw JSON-LD field value
        # @return [String, Hash, nil]
        def self.first_obj_or_string(value)
          value = value.first if value.is_a?(Array)
          value
        end
        private_class_method :first_obj_or_string
      end
    end
  end
end
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'pdf-reader'
|
|
4
|
+
require 'stringio'
|
|
5
|
+
|
|
6
|
+
module Pikuri
  class Tool
    module Scraper
      # PDF → text extractor used by {Simple.visit} when the fetched
      # response carries +application/pdf+. Thin wrapper over the
      # +pdf-reader+ gem: walk every page, concatenate the extracted
      # text, and return it as a single string the LLM can read.
      #
      # Best-effort by design. +pdf-reader+ produces clean text from
      # digitally-generated PDFs (LaTeX, Word export, ...) but yields
      # nothing useful for scanned documents — this path performs no
      # OCR. When extraction finds no text we return an empty string
      # instead of raising, so the caller's cache stores a consistent
      # result and the LLM sees an empty observation it can react to.
      #
      # Pure parser — no I/O. {.extract} takes PDF bytes and returns
      # text, so tests can exercise it with an in-memory fixture and no
      # network access.
      module PDF
        # Render +bytes+ as plain text, one page per paragraph.
        #
        # +pdf-reader+ signals unparseable documents through a small set
        # of typed exceptions — broken xrefs
        # ({::PDF::Reader::MalformedPDFError}), invalid page references
        # ({::PDF::Reader::InvalidPageError}), and encrypted/XFA files
        # ({::PDF::Reader::UnsupportedFeatureError}). Each describes a
        # property of the PDF the LLM can act on ("try a different
        # URL"), so they are re-raised as {FetchError} — the same
        # convention the HTTP layer in {Simple.fetch} follows. Any
        # other exception class is a genuine bug in +pdf-reader+ and is
        # allowed to crash loudly.
        #
        # @param bytes [String] raw PDF document (binary string)
        # @return [String] concatenated page text; possibly empty when
        #   the PDF carries no extractable text (scanned image, empty
        #   document)
        # @raise [FetchError] when +pdf-reader+ refuses the document
        def self.extract(bytes)
          document = ::PDF::Reader.new(StringIO.new(bytes))
          page_texts = document.pages.map { |page| page.text.strip }
          page_texts.reject(&:empty?).join("\n\n")
        rescue ::PDF::Reader::MalformedPDFError,
               ::PDF::Reader::InvalidPageError,
               ::PDF::Reader::UnsupportedFeatureError => e
          raise FetchError, "PDF rendering failed: #{e.class.name.split('::').last}: #{e.message}"
        end
      end
    end
  end
end
|