pikuri 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +62 -0
  3. data/GETTING_STARTED.md +223 -0
  4. data/LICENSE +21 -0
  5. data/README.md +193 -0
  6. data/lib/pikuri/agent/chat_transport.rb +41 -0
  7. data/lib/pikuri/agent/context_window_detector.rb +101 -0
  8. data/lib/pikuri/agent/listener/in_memory_message_list.rb +33 -0
  9. data/lib/pikuri/agent/listener/message_listener.rb +93 -0
  10. data/lib/pikuri/agent/listener/step_limit.rb +97 -0
  11. data/lib/pikuri/agent/listener/terminal.rb +137 -0
  12. data/lib/pikuri/agent/listener/token_log.rb +166 -0
  13. data/lib/pikuri/agent/listener_list.rb +113 -0
  14. data/lib/pikuri/agent/message.rb +61 -0
  15. data/lib/pikuri/agent/synthesizer.rb +120 -0
  16. data/lib/pikuri/agent/tokens.rb +56 -0
  17. data/lib/pikuri/agent.rb +286 -0
  18. data/lib/pikuri/subprocess.rb +166 -0
  19. data/lib/pikuri/tool/bash.rb +272 -0
  20. data/lib/pikuri/tool/calculator.rb +82 -0
  21. data/lib/pikuri/tool/confirmer.rb +96 -0
  22. data/lib/pikuri/tool/edit.rb +196 -0
  23. data/lib/pikuri/tool/fetch.rb +167 -0
  24. data/lib/pikuri/tool/glob.rb +310 -0
  25. data/lib/pikuri/tool/grep.rb +338 -0
  26. data/lib/pikuri/tool/parameters.rb +314 -0
  27. data/lib/pikuri/tool/read.rb +254 -0
  28. data/lib/pikuri/tool/scraper/fetch_error.rb +16 -0
  29. data/lib/pikuri/tool/scraper/html.rb +285 -0
  30. data/lib/pikuri/tool/scraper/pdf.rb +54 -0
  31. data/lib/pikuri/tool/scraper/simple.rb +177 -0
  32. data/lib/pikuri/tool/search/brave.rb +184 -0
  33. data/lib/pikuri/tool/search/duckduckgo.rb +196 -0
  34. data/lib/pikuri/tool/search/engines.rb +154 -0
  35. data/lib/pikuri/tool/search/exa.rb +217 -0
  36. data/lib/pikuri/tool/search/rate_limiter.rb +92 -0
  37. data/lib/pikuri/tool/search/result.rb +29 -0
  38. data/lib/pikuri/tool/skill.rb +80 -0
  39. data/lib/pikuri/tool/skill_catalog.rb +376 -0
  40. data/lib/pikuri/tool/sub_agent.rb +102 -0
  41. data/lib/pikuri/tool/web_scrape.rb +117 -0
  42. data/lib/pikuri/tool/web_search.rb +38 -0
  43. data/lib/pikuri/tool/workspace.rb +150 -0
  44. data/lib/pikuri/tool/write.rb +170 -0
  45. data/lib/pikuri/tool.rb +118 -0
  46. data/lib/pikuri/url_cache.rb +106 -0
  47. data/lib/pikuri/version.rb +10 -0
  48. data/lib/pikuri.rb +165 -0
  49. data/prompts/coding-system-prompt.txt +28 -0
  50. data/prompts/pikuri-chat.txt +15 -0
  51. metadata +259 -0
data/lib/pikuri/tool/read.rb
@@ -0,0 +1,254 @@
+ # frozen_string_literal: true
+
+ module Pikuri
+   class Tool
+     # The +read+ tool, expressed as a {Tool} subclass: instantiating
+     # +Tool::Read.new(workspace: ws)+ produces a tool whose
+     # {Tool#to_ruby_llm_tool} wiring is identical to any bundled tool's,
+     # so ruby_llm sees nothing special about it. Same shape as
+     # {Tool::SubAgent} — workspace is captured by the +execute+ closure
+     # at construction.
+     #
+     # == Output format
+     #
+     # cat-n: each line is rendered as +"%6d\t%s"+ (six-column right-
+     # aligned line number, tab, content). Chosen for breadth of training-
+     # data exposure: +cat -n+ output shows up across virtually every Unix
+     # tutorial and Stack Overflow answer, so even small local models
+     # recognize the shape. opencode's shorter +"<n>: <content>"+ format
+     # saves a few thousand tokens per 2K-line file but trades away model
+     # familiarity; pi omits line numbers entirely (cheapest tokens, but
+     # the model loses the ability to cite ranges or pick {Edit}
+     # boundaries precisely).
+     #
+     # == Truncation rules
+     #
+     # Two independent limits; whichever fires first wins:
+     #
+     # * *Line limit* — {DEFAULT_LIMIT} lines (overridable via +limit+).
+     # * *Byte cap* — {MAX_BYTES} bytes of input content; not exposed as a
+     #   parameter. Bypassable in practice by paging via +offset+.
+     #
+     # Additionally, individual lines longer than {MAX_LINE_LENGTH} chars
+     # are truncated with {LINE_TRUNCATION_MARKER} appended; the model is
+     # told to reach for +grep+ to find content inside such files.
+     #
+     # == Refusals
+     #
+     # * Path outside the workspace → caught from
+     #   {Tool::Workspace::Error}, returned as +"Error: ..."+.
+     # * File not found, EACCES → +"Error: ..."+.
+     # * Path is a directory → +"Error: ... use the glob tool"+, keeping
+     #   directory listing as the glob tool's responsibility (Step 9).
+     # * Binary content → sniffed from the first {BINARY_SAMPLE_BYTES} of
+     #   the file: any +NUL+ byte, or a non-printable fraction above
+     #   {BINARY_NONPRINTABLE_THRESHOLD} (control chars outside +\t \n \v \f \r+),
+     #   triggers refusal. Catches images, PDFs, archives, and compiled
+     #   artifacts without an extension list to maintain.
+     # * Offset past EOF → +"Error: offset N is beyond end of file (M lines total)"+.
+     class Read < Tool
+       # @return [Integer] default value of the +limit+ parameter (number
+       #   of lines to read per call).
+       DEFAULT_LIMIT = 2000
+
+       # @return [Integer] per-line character cap; longer lines are
+       #   truncated with {LINE_TRUNCATION_MARKER}.
+       MAX_LINE_LENGTH = 2000
+
+       # @return [String] suffix appended to lines truncated by
+       #   {MAX_LINE_LENGTH}.
+       LINE_TRUNCATION_MARKER = "... (line truncated to #{MAX_LINE_LENGTH} chars)"
+
+       # @return [Integer] hard byte cap on input content collected per
+       #   call. Counted over the line bytes (plus one for the joining
+       #   newline); the rendered output is slightly larger due to the
+       #   per-line +"%6d\t"+ prefix.
+       MAX_BYTES = 50 * 1024
+
+       # @return [String] human-readable form of {MAX_BYTES} for the
+       #   continuation marker.
+       MAX_BYTES_LABEL = "#{MAX_BYTES / 1024} KB"
+
+       # @return [Integer] number of bytes sampled from the start of the
+       #   file for binary-content detection.
+       BINARY_SAMPLE_BYTES = 4096
+
+       # @return [Float] fraction of the sample that may be non-printable
+       #   before the file is classified as binary. Matches opencode's
+       #   30%.
+       BINARY_NONPRINTABLE_THRESHOLD = 0.30
+
+       # Description shown to the LLM. Follows the opencode shape (summary
+       # + +Usage:+ bullets) prescribed by the project's tool-description
+       # convention. Per-parameter constraints (defaults, format) live in
+       # the parameter descriptions, not here.
+       #
+       # @return [String]
+       DESCRIPTION = <<~DESC
+         Read a file from the workspace and return its contents with line numbers.
+
+         Usage:
+         - Output is line-numbered in `cat -n` style so subsequent edits can reference exact line numbers.
+         - Use `offset` and `limit` to page through large files; when the response ends in `Use offset=N to continue`, call again with that offset.
+         - Lines longer than #{MAX_LINE_LENGTH} chars are truncated with a marker — use `grep` for content inside such files.
+         - Binary files (images, PDFs, archives, compiled artifacts) are refused; this tool reads text only.
+         - Directories are refused — use the `glob` tool to list files.
+         - If unsure of the path, use `glob` first to look up filenames.
+         - Avoid tiny repeated slices — if you need more context, read a larger window.
+       DESC
+
+       # @param workspace [Tool::Workspace] captured for path resolution;
+       #   all reads route through +workspace.resolve_for_read+.
+       # @return [Read]
+       def initialize(workspace:)
+         super(
+           name: 'read',
+           description: DESCRIPTION,
+           parameters: Parameters.build { |p|
+             p.required_string :path,
+                               'Path to the file to read. Relative paths ' \
+                               'resolve against the workspace root, e.g. ' \
+                               '"lib/foo.rb" or "/abs/path/to/file.txt".'
+             p.optional_integer :offset,
+                                'Line number to start reading from (1-indexed). ' \
+                                'Defaults to 1, e.g. 200.'
+             p.optional_integer :limit,
+                                'Maximum number of lines to read. Defaults to ' \
+                                "#{DEFAULT_LIMIT}, e.g. 500."
+           },
+           execute: ->(path:, offset: 1, limit: DEFAULT_LIMIT) {
+             Read.read(workspace: workspace, path: path, offset: offset, limit: limit)
+           }
+         )
+       end
+
+       # Resolve +path+ against +workspace+, refuse directories / binaries /
+       # missing files, and return either the cat-n-formatted slice or an
+       # +"Error: ..."+ observation.
+       #
+       # @param workspace [Tool::Workspace]
+       # @param path [String] raw path as supplied by the LLM
+       # @param offset [Integer] 1-indexed line number to start at
+       # @param limit [Integer] maximum lines to return
+       # @return [String] tool observation
+       def self.read(workspace:, path:, offset:, limit:)
+         return "Error: offset must be >= 1, got #{offset}" if offset < 1
+         return "Error: limit must be >= 1, got #{limit}" if limit < 1
+
+         resolved = workspace.resolve_for_read(path)
+         return "Error: file not found: #{path}" unless resolved.exist?
+         return "Error: #{path} is a directory; use the glob tool to list files." if resolved.directory?
+
+         sample = read_sample(resolved)
+         return "Error: cannot read binary file: #{path}" if binary?(sample)
+
+         format_slice(path: path, resolved: resolved, offset: offset, limit: limit)
+       rescue Tool::Workspace::Error => e
+         "Error: #{e.message}"
+       rescue Errno::EACCES => e
+         "Error: cannot read #{path}: #{e.message}"
+       end
+
+       # Read up to {BINARY_SAMPLE_BYTES} of the file in binary mode for
+       # the {.binary?} sniff. Returns an empty String for an empty file
+       # (which {.binary?} treats as not-binary).
+       #
+       # @param resolved [Pathname]
+       # @return [String] raw bytes (ASCII-8BIT encoding)
+       def self.read_sample(resolved)
+         resolved.open('rb') { |io| io.read(BINARY_SAMPLE_BYTES) || +'' }
+       end
+       private_class_method :read_sample
+
+       # Heuristic binary classifier matching opencode's: any NUL byte
+       # forces +true+; otherwise count bytes outside the printable +\t \n
+       # \v \f \r+ + ASCII-32..126 range and take the ratio against the
+       # sample size. UTF-8 continuation bytes (0x80-0xBF) are >127 so
+       # they sit outside the non-printable ranges and pass through
+       # unflagged, letting UTF-8 text read fine.
+       #
+       # Public because {Tool::Edit} reuses it to refuse binary targets —
+       # if Edit accepted a binary file the model has no way to have read,
+       # it could corrupt bytes the model never inspected. Same sniff, same
+       # threshold, one definition.
+       #
+       # @param bytes [String] sample bytes
+       # @return [Boolean]
+       def self.binary?(bytes)
+         return false if bytes.empty?
+
+         non_printable = 0
+         bytes.each_byte do |b|
+           return true if b.zero?
+
+           non_printable += 1 if b < 9 || (b > 13 && b < 32)
+         end
+         non_printable.to_f / bytes.bytesize > BINARY_NONPRINTABLE_THRESHOLD
+       end
+
+       # Stream the file line-by-line, collect at most +limit+ lines
+       # starting at +offset+, and stop early if {MAX_BYTES} is reached.
+       # We keep counting lines past the collection window so the trailer
+       # can report the total line count when the line limit (not the byte
+       # cap) was the stopping criterion — same trick opencode uses.
+       #
+       # @return [String]
+       def self.format_slice(path:, resolved:, offset:, limit:)
+         start_index = offset - 1
+         collected = []
+         total_lines = 0
+         bytes = 0
+         byte_cap_hit = false
+         has_more = false
+
+         resolved.each_line do |raw|
+           total_lines += 1
+           next if total_lines <= start_index
+
+           if collected.length >= limit
+             has_more = true
+             next
+           end
+
+           line = raw.chomp
+           if line.length > MAX_LINE_LENGTH
+             line = line[0, MAX_LINE_LENGTH] + LINE_TRUNCATION_MARKER
+           end
+
+           size = line.bytesize + 1 # +1 for the joining newline
+           if bytes + size > MAX_BYTES
+             byte_cap_hit = true
+             has_more = true
+             break
+           end
+
+           collected << line
+           bytes += size
+         end
+
+         return '(Empty file)' if total_lines.zero?
+
+         if start_index >= total_lines
+           return "Error: offset #{offset} is beyond end of file (#{total_lines} lines total)"
+         end
+
+         last_line = offset + collected.length - 1
+         body = collected.each_with_index.map { |line, i| format("%6d\t%s", i + offset, line) }.join("\n")
+
+         trailer =
+           if byte_cap_hit
+             "(Output capped at #{MAX_BYTES_LABEL}. Showing lines #{offset}-#{last_line}. " \
+             "Use offset=#{last_line + 1} to continue.)"
+           elsif has_more
+             "(Showing lines #{offset}-#{last_line} of #{total_lines}. " \
+             "Use offset=#{last_line + 1} to continue.)"
+           else
+             "(End of file - total #{total_lines} lines)"
+           end
+
+         "#{body}\n\n#{trailer}"
+       end
+       private_class_method :format_slice
+     end
+   end
+ end
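
A minimal usage sketch, assuming Tool::Workspace.new(root:) is the workspace constructor (workspace.rb is not shown in this hunk, so that signature is an assumption). Read.read is the documented class-method entry point; the loop below just follows the continuation trailer the tool itself emits:

  require 'pikuri'

  # Assumed constructor; the real Workspace API lives in workspace.rb.
  ws = Pikuri::Tool::Workspace.new(root: Dir.pwd)

  # Page through a long file, following "Use offset=N to continue"
  # until the end-of-file trailer appears instead.
  offset = 1
  loop do
    slice = Pikuri::Tool::Read.read(workspace: ws, path: 'lib/pikuri.rb',
                                    offset: offset, limit: 500)
    puts slice
    break unless slice =~ /Use offset=(\d+) to continue/

    offset = Regexp.last_match(1).to_i
  end

Each slice arrives in the cat-n shape described above, e.g. "     1\t# frozen_string_literal: true", followed by a blank line and a trailer.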
data/lib/pikuri/tool/scraper/fetch_error.rb
@@ -0,0 +1,16 @@
+ # frozen_string_literal: true
+
+ module Pikuri
+   class Tool
+     module Scraper
+       # Raised by anything in the scraper stack when a URL cannot be
+       # rendered into Markdown text — HTTP non-2xx, network failure,
+       # redirect loop, missing +Location+, unsupported content-type, or a
+       # parse failure that reads as "try a different URL" to the LLM.
+       # Catching this in {Tool::WEB_SCRAPE} / {Tool::FETCH} turns the
+       # failure into an +"Error: ..."+ observation; anything else bubbles
+       # up so genuine bugs stay visible.
+       class FetchError < StandardError; end
+     end
+   end
+ end
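
A sketch of the catch-and-flatten convention this class documents, assuming Scraper::Simple.visit(url) is the scraper entry point (the wrapper name below is hypothetical; the real call sites live in web_scrape.rb and fetch.rb, outside this hunk):

  # Hypothetical wrapper: FetchError becomes an "Error: ..." observation
  # for the LLM; every other exception class keeps propagating.
  def scrape_observation(url)
    Pikuri::Tool::Scraper::Simple.visit(url)
  rescue Pikuri::Tool::Scraper::FetchError => e
    "Error: #{e.message}"
  end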
data/lib/pikuri/tool/scraper/html.rb
@@ -0,0 +1,285 @@
+ # frozen_string_literal: true
+
+ require 'json'
+ require 'nokogiri'
+ require 'readability'
+ require 'reverse_markdown'
+
+ module Pikuri
+   class Tool
+     module Scraper
+       # HTML → Markdown extractor used by {Simple.visit} when the fetched
+       # response carries an HTML content-type.
+       #
+       # Always renders both views of the page when available:
+       #
+       # 1. JSON-LD section. Any +<script type="application/ld+json">+ node
+       #    whose +@type+ matches a substantive schema.org content type
+       #    (Product, Article, Recipe, ...) is rendered as a header — title,
+       #    metadata bullets (brand, SKU, price, rating, author, published),
+       #    and the +articleBody+/+description+ copy when present.
+       # 2. Readability section. The page is run through +Readability+ +
+       #    +reverse_markdown+, with a +<main>+/+<article>+ fallback for
+       #    pages whose content sits mostly outside +<p>+ tags.
+       #
+       # Concatenated with a horizontal rule, so the LLM gets both the
+       # structured metadata and the rendered body and can pick whichever
+       # is more useful for the task. Trades some duplication (when a
+       # publisher embeds the article body in JSON-LD AND in HTML) for
+       # fewer type-based heuristics on which branch should win — the
+       # earlier "is this Article's +description+ a teaser or the real
+       # body?" carve-out is no longer needed because both end up in
+       # the output regardless.
+       #
+       # Pure parser — no I/O. {.extract} takes an HTML string and returns
+       # Markdown, so tests can drive it against fixture HTML without a
+       # network round-trip.
+       module HTML
+         # @return [Array<String>] schema.org +@type+ values that we treat
+         #   as "the primary entity of this page" when picking a JSON-LD
+         #   node to render. Order does not matter — the first matching
+         #   node wins. Skips noise nodes (Organization, BreadcrumbList,
+         #   WebSite, ...) that ship on most pages but carry no page
+         #   content.
+         INTERESTING_TYPES = %w[
+           Product Article NewsArticle BlogPosting Recipe Event Book Movie
+         ].freeze
+
+         # @return [Array<String>] HTML tags preserved by the readability
+         #   pass. Anything outside this list is stripped before Markdown
+         #   conversion.
+         READABILITY_TAGS = %w[
+           h1 h2 h3 h4 h5 h6 p div span ul ol li blockquote pre code a img
+           strong em b i br hr table thead tbody tr td th
+         ].freeze
+
+         # @return [Array<String>] HTML attributes preserved by the
+         #   readability pass; everything else (class, id, style, data-*)
+         #   is dropped before Markdown conversion.
+         READABILITY_ATTRS = %w[href src alt title].freeze
+
+         # @return [Float] minimum +<main>+/+<article>+ to Readability
+         #   text-length ratio that triggers the semantic-container
+         #   fallback in {.readability_to_markdown}. Picked low enough to
+         #   catch the failure mode (Readability collapsing a page that
+         #   uses divs/lists instead of +<p>+ — e.g. +vaadin.com/company+,
+         #   ~5x) but high enough that pages where both produce
+         #   comparable output keep Readability's noise filtering.
+         MAIN_FALLBACK_RATIO = 2.0
+
+         # @return [Integer] minimum text length the
+         #   +<main>+/+<article>+ container must hold before the fallback
+         #   in {.readability_to_markdown} can fire. Below this, the
+         #   ratio comparison is dominated by noise and we'd swap on
+         #   tiny pages where Readability is doing the right thing.
+         MAIN_FALLBACK_MIN_CHARS = 500
+
+         # Render +html+ as Markdown by emitting both the JSON-LD section
+         # (when an interesting node is present) and the readability /
+         # +<main>+ section, joined by a horizontal rule. Either section
+         # may be missing — pages with no JSON-LD return only the
+         # readability output, and a malformed page with no extractable
+         # body returns only the JSON-LD render.
+         #
+         # @param html [String] HTML document body
+         # @return [String] Markdown representation
+         def self.extract(html)
+           sections = [jsonld_section(html), readability_to_markdown(html)]
+           sections.reject! { |s| s.nil? || s.strip.empty? }
+           sections.join("\n\n---\n\n")
+         end
+
+         # Pick the first JSON-LD node whose +@type+ matches one of
+         # {INTERESTING_TYPES} and render it as Markdown. Returns +nil+
+         # when no such node exists, in which case {.extract} emits only
+         # the readability section.
+         #
+         # No content-field gating: a node carrying just +name+/+author+/
+         # +datePublished+ still renders (as a metadata-only header),
+         # because the readability pass independently produces the page
+         # body. That is the trade-off that lets us drop the type-based
+         # "is this teaser or article copy?" heuristics — duplication is
+         # acceptable when both views are available, and the LLM can
+         # pick whichever it needs.
+         #
+         # @param html [String] HTML document body
+         # @return [String, nil] Markdown render of the picked JSON-LD
+         #   node, or +nil+ when nothing matched
+         def self.jsonld_section(html)
+           node = parse_jsonld(html).find do |n|
+             Array(n['@type']).any? { |t| INTERESTING_TYPES.include?(t) }
+           end
+           node ? jsonld_to_markdown(node) : nil
+         end
+
+         # Collect every JSON-LD payload embedded in +html+, flattening
+         # +@graph+ wrappers so callers see one flat array of schema.org
+         # nodes. Malformed JSON blocks are silently skipped — sites
+         # frequently ship broken JSON-LD and we only need at least one
+         # parseable block.
+         #
+         # @param html [String] HTML document body
+         # @return [Array<Hash>] parsed JSON-LD nodes; possibly empty
+         def self.parse_jsonld(html)
+           doc = Nokogiri::HTML(html)
+           blobs = doc.css('script[type="application/ld+json"]').map(&:text)
+
+           blobs.flat_map do |raw|
+             parsed = begin
+               JSON.parse(raw)
+             rescue JSON::ParserError
+               nil
+             end
+             next [] unless parsed
+
+             nodes = parsed.is_a?(Array) ? parsed : [parsed]
+             nodes.flat_map { |n| n['@graph'].is_a?(Array) ? n['@graph'] : [n] }
+           end
+         end
+
+         # Render a single JSON-LD +node+ as Markdown: a top-level title
+         # from +name+/+headline+, a bullet list of common useful fields
+         # (brand, SKU, price, rating, author, published date, ...), the
+         # body copy, and the lead image.
+         #
+         # When the node carries +articleBody+ (the full publisher-supplied
+         # article text), that wins over +description+ — the description
+         # is typically a lede teaser and would just repeat the article's
+         # opening lines.
+         #
+         # @param node [Hash] JSON-LD node, typically picked by
+         #   {.jsonld_section}
+         # @return [String] Markdown representation
+         def self.jsonld_to_markdown(node)
+           out = +''
+           name = node['name'] || node['headline']
+           out << "# #{name}\n\n" if name
+
+           offer = first_obj(node['offers'])
+           rating = first_obj(node['aggregateRating'])
+           brand = first_obj_or_string(node['brand'])
+           author = first_obj_or_string(node['author'])
+
+           brand_name = brand.is_a?(Hash) ? brand['name'] : brand
+           author_name = author.is_a?(Hash) ? author['name'] : author
+
+           fields = {
+             'Brand' => brand_name,
+             'SKU' => node['sku'],
+             'GTIN' => node['gtin13'] || node['gtin'],
+             'Price' => [offer['price'], offer['priceCurrency']].compact.join(' '),
+             'Availability' => offer['availability'],
+             'Rating' => rating['ratingValue'],
+             'Reviews' => rating['reviewCount'],
+             'Author' => author_name,
+             'Published' => node['datePublished']
+           }.reject { |_, v| v.nil? || v.to_s.strip.empty? }
+
+           unless fields.empty?
+             fields.each { |k, v| out << "- **#{k}:** #{v}\n" }
+             out << "\n"
+           end
+
+           if (body = node['articleBody'] || node['description'])
+             out << "#{body}\n\n"
+           end
+
+           if (img = node['image'])
+             img = img.first if img.is_a?(Array)
+             img = img['url'] if img.is_a?(Hash)
+             out << "![image](#{img})\n\n" if img
+           end
+
+           out
+         end
+
+         # Run +Readability+ over +html+ to isolate the main content node,
+         # then convert that to Markdown via +reverse_markdown+. The page
+         # +<title>+ is rendered as a top-level heading.
+         #
+         # When the page uses semantic HTML5 (+<main>+ or +<article>+) but
+         # leaves most of its content outside +<p>+ tags — divs, lists,
+         # spans — Readability's paragraph-density scoring collapses the
+         # extraction to a sliver of the page. In that case we render the
+         # +<main>+/+<article>+ container directly. The fallback only
+         # fires when the container holds substantially more text than
+         # Readability picked up (see {MAIN_FALLBACK_RATIO} /
+         # {MAIN_FALLBACK_MIN_CHARS}); on pages where both agree we keep
+         # Readability so its noise filtering still strips nav/ads/etc.
+         #
+         # @param html [String] HTML document body
+         # @return [String] Markdown representation
+         def self.readability_to_markdown(html)
+           rdoc = Readability::Document.new(
+             html,
+             tags: READABILITY_TAGS,
+             attributes: READABILITY_ATTRS,
+             remove_empty_nodes: true
+           )
+           readability_html = rdoc.content
+           title = rdoc.title
+
+           body_html = main_fallback_html(html, readability_html) || readability_html
+           body = ReverseMarkdown.convert(body_html, unknown_tags: :bypass, github_flavored: true)
+
+           out = +''
+           out << "# #{title.strip}\n\n" if title && !title.strip.empty?
+           out << body
+           out
+         end
+
+         # If +html+ has a +<main>+ or +<article>+ element holding
+         # substantially more text than Readability extracted, return that
+         # container's HTML so the caller can render it instead. Returns
+         # +nil+ when the fallback should not fire — when there is no
+         # semantic container, when it's too small to be meaningful, or
+         # when Readability's output is already comparable.
+         #
+         # @param html [String] full HTML document body, used to locate
+         #   the +<main>+/+<article>+ container
+         # @param readability_html [String] HTML produced by
+         #   +Readability::Document#content+, used as the comparison
+         #   baseline
+         # @return [String, nil] container HTML when the fallback should
+         #   fire, +nil+ otherwise
+         def self.main_fallback_html(html, readability_html)
+           doc = Nokogiri::HTML(html)
+           container = doc.at_css('main') || doc.at_css('article')
+           return nil unless container
+
+           container_text_len = container.text.gsub(/\s+/, ' ').strip.length
+           return nil if container_text_len < MAIN_FALLBACK_MIN_CHARS
+
+           readability_text_len = Nokogiri::HTML(readability_html).text.gsub(/\s+/, ' ').strip.length
+           return nil if container_text_len < MAIN_FALLBACK_RATIO * readability_text_len
+
+           container.to_html
+         end
+         private_class_method :main_fallback_html
+
+         # JSON-LD fields can be a string, hash, or array of either.
+         # Normalize to a single hash (the first one if it's a list) so
+         # callers can +.dig+ safely.
+         #
+         # @param value [Object] raw JSON-LD field value
+         # @return [Hash] empty hash when +value+ does not contain a hash
+         def self.first_obj(value)
+           value = value.first if value.is_a?(Array)
+           value.is_a?(Hash) ? value : {}
+         end
+         private_class_method :first_obj
+
+         # Same idea as {.first_obj} but preserves a bare string (e.g.
+         # +brand: "Apple"+) instead of replacing it with +{}+.
+         #
+         # @param value [Object] raw JSON-LD field value
+         # @return [String, Hash, nil]
+         def self.first_obj_or_string(value)
+           value = value.first if value.is_a?(Array)
+           value
+         end
+         private_class_method :first_obj_or_string
+       end
+     end
+   end
+ end
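
{.extract} is a pure parser, so a fixture-driven check needs no network. A small sketch against invented fixture HTML:

  html = <<~HTML
    <html>
      <head>
        <title>Acme Anvil</title>
        <script type="application/ld+json">
          {"@type": "Product", "name": "Acme Anvil", "sku": "AV-100",
           "offers": {"price": "99.00", "priceCurrency": "USD"}}
        </script>
      </head>
      <body><main><p>Drop-forged steel, 100 lb.</p></main></body>
    </html>
  HTML

  puts Pikuri::Tool::Scraper::HTML.extract(html)

Expected shape: a "# Acme Anvil" header with SKU and Price bullets from the JSON-LD node, then the "---" rule, then the readability render of the page body.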
data/lib/pikuri/tool/scraper/pdf.rb
@@ -0,0 +1,54 @@
+ # frozen_string_literal: true
+
+ require 'pdf-reader'
+ require 'stringio'
+
+ module Pikuri
+   class Tool
+     module Scraper
+       # PDF → text extractor used by {Simple.visit} when the fetched
+       # response carries +application/pdf+. Wraps the +pdf-reader+ gem:
+       # walk every page, concatenate the extracted text, hand the result
+       # back as a single string the LLM can read.
+       #
+       # Best-effort by design. +pdf-reader+ produces clean text from PDFs
+       # generated from a digital source (LaTeX, Word export, ...) but
+       # returns nothing useful from scanned documents — there is no OCR
+       # in this path. When extraction yields no text we still return an
+       # empty string rather than raising, so the caller's cache stores a
+       # consistent result and the LLM sees an empty observation it can
+       # react to.
+       #
+       # Pure parser — no I/O. {.extract} takes PDF bytes and returns text,
+       # so tests can drive it against an in-memory fixture without
+       # touching the network.
+       module PDF
+         # Render +bytes+ as plain text, one page per paragraph.
+         #
+         # +pdf-reader+ raises a handful of typed exceptions for documents
+         # it cannot parse — broken xrefs ({::PDF::Reader::MalformedPDFError}),
+         # invalid page references ({::PDF::Reader::InvalidPageError}),
+         # encrypted/XFA files ({::PDF::Reader::UnsupportedFeatureError}).
+         # All three describe a property of the PDF the LLM can react to
+         # ("try a different URL"), so we re-raise them as {FetchError} —
+         # same convention as the HTTP layer in {Simple.fetch}. Genuine
+         # bugs in +pdf-reader+ itself surface as their own classes and
+         # crash loud.
+         #
+         # @param bytes [String] raw PDF document (binary string)
+         # @return [String] concatenated page text; possibly empty when
+         #   the PDF carries no extractable text (scanned image, empty
+         #   document)
+         # @raise [FetchError] when +pdf-reader+ refuses the document
+         def self.extract(bytes)
+           reader = ::PDF::Reader.new(StringIO.new(bytes))
+           reader.pages.map { |p| p.text.strip }.reject(&:empty?).join("\n\n")
+         rescue ::PDF::Reader::MalformedPDFError,
+                ::PDF::Reader::InvalidPageError,
+                ::PDF::Reader::UnsupportedFeatureError => e
+           raise FetchError, "PDF rendering failed: #{e.class.name.split('::').last}: #{e.message}"
+         end
+       end
+     end
+   end
+ end
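
A sketch of driving {.extract} directly with on-disk bytes; the fixture path is illustrative:

  bytes = File.binread('spec/fixtures/sample.pdf') # illustrative path

  begin
    text = Pikuri::Tool::Scraper::PDF.extract(bytes)
    puts text.empty? ? '(no extractable text - likely a scanned PDF)' : text
  rescue Pikuri::Tool::Scraper::FetchError => e
    # Encrypted, malformed, or otherwise unparseable documents land here,
    # matching the re-raise convention documented above.
    warn e.message
  end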