pikuri-workspace 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +50 -0
- data/lib/pikuri/tool/confirmer.rb +96 -0
- data/lib/pikuri/tool/edit.rb +196 -0
- data/lib/pikuri/tool/glob.rb +310 -0
- data/lib/pikuri/tool/grep.rb +338 -0
- data/lib/pikuri/tool/read.rb +254 -0
- data/lib/pikuri/tool/workspace.rb +150 -0
- data/lib/pikuri/tool/write.rb +170 -0
- data/lib/pikuri-workspace.rb +27 -0
- metadata +80 -0
|
@@ -0,0 +1,338 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pikuri
|
|
4
|
+
class Tool
|
|
5
|
+
# The +grep+ tool — content search across the workspace via
|
|
6
|
+
# +ripgrep+. Instantiating +Tool::Grep.new(workspace: ws)+ produces a
|
|
7
|
+
# tool whose {Tool#to_ruby_llm_tool} wiring is identical to any
|
|
8
|
+
# bundled tool's. Same shape as {Tool::Read} (workspace captured by
|
|
9
|
+
# the +execute+ closure, no confirmer — search is read-only).
|
|
10
|
+
#
|
|
11
|
+
# == ripgrep dependency
|
|
12
|
+
#
|
|
13
|
+
# Hard dependency: {.check_binaries!} runs in +initialize+ and raises
|
|
14
|
+
# if +rg+ isn't on +PATH+. Mirrors {Tool::Bash}'s posture for
|
|
15
|
+
# +bash+/+timeout+. We don't ship a Ruby fallback — replicating
|
|
16
|
+
# rg's Rust-regex dialect, glob handling, and +.gitignore+ parsing
|
|
17
|
+
# is a research-loop dead end. Failure message includes the install
|
|
18
|
+
# hint.
|
|
19
|
+
#
|
|
20
|
+
# == Argv
|
|
21
|
+
#
|
|
22
|
+
# rg --line-number --color=never --no-heading --with-filename \
|
|
23
|
+
# --hidden --max-columns=2000 --max-columns-preview \
|
|
24
|
+
# --sort=path \
|
|
25
|
+
# [-i] [--glob <g>] [--files-with-matches|--count-matches] \
|
|
26
|
+
# -- <pattern> <relative-path-or-dot>
|
|
27
|
+
#
|
|
28
|
+
# * +--no-heading+ + +--with-filename+ → flat +path:line:content+ rows
|
|
29
|
+
# regardless of whether the search target is a directory or a single
|
|
30
|
+
# file (rg defaults to suppressing the filename for single-file
|
|
31
|
+
# searches — we force it on for output consistency).
|
|
32
|
+
# * +--hidden+ → search dotfiles (still respects +.gitignore+).
|
|
33
|
+
# * +--max-columns=2000 --max-columns-preview+ → rg truncates lines
|
|
34
|
+
# longer than {MAX_LINE_LENGTH} bytes itself and appends a
|
|
35
|
+
# preview marker, sparing us per-line truncation.
|
|
36
|
+
# * +--sort=path+ → deterministic output (single-threaded; fine for
|
|
37
|
+
# typical repos under ~10k files). Makes specs assertable and gives
|
|
38
|
+
# the model a stable order to scan.
|
|
39
|
+
# * Subprocess runs with +chdir: workspace.cwd+ and is *always* given
|
|
40
|
+
# an explicit path argument. {Pikuri::Subprocess.spawn} uses
|
|
41
|
+
# +popen2e+ which gives the child a piped (non-tty) stdin; rg's
|
|
42
|
+
# default heuristic on no-path-arg-with-piped-stdin is to search
|
|
43
|
+
# stdin (which we then close — yielding zero matches). Passing the
|
|
44
|
+
# path argument explicitly bypasses the heuristic. Output paths
|
|
45
|
+
# come back as +./...+ when the path is +.+; the leading +./+ is
|
|
46
|
+
# stripped post-rg so the model sees clean workspace-relative paths.
|
|
47
|
+
#
|
|
48
|
+
# == Output modes
|
|
49
|
+
#
|
|
50
|
+
# * +content+ (default) — +path:line:content+ rows.
|
|
51
|
+
# * +files_with_matches+ — just file paths, one per line.
|
|
52
|
+
# * +count+ — +path:count+ per file.
|
|
53
|
+
#
|
|
54
|
+
# Use +files_with_matches+ to scope a broad search cheaply before
|
|
55
|
+
# paying tokens for +content+.
|
|
56
|
+
#
|
|
57
|
+
# == Truncation
|
|
58
|
+
#
|
|
59
|
+
# Total output is head-truncated to {MAX_BYTES} (head-only — grep
|
|
60
|
+
# tails usually carry less signal than the first matches; opposite
|
|
61
|
+
# bias from {Tool::Bash}). Cut at the last line boundary, with a
|
|
62
|
+
# marker reporting omitted bytes and the original total so the model
|
|
63
|
+
# knows how much it missed.
|
|
64
|
+
#
|
|
65
|
+
# == Exit codes
|
|
66
|
+
#
|
|
67
|
+
# * +0+ → matches; format with footer.
|
|
68
|
+
# * +1+ → no matches; return +"No matches for pattern '...'"+.
|
|
69
|
+
# * +2+ → rg error (bad regex, missing path); return +"Error: ripgrep: ..."+.
|
|
70
|
+
#
|
|
71
|
+
# == Refusals
|
|
72
|
+
#
|
|
73
|
+
# All returned as +"Error: ..."+ observations:
|
|
74
|
+
#
|
|
75
|
+
# * Empty +pattern+ → fast reject.
|
|
76
|
+
# * Unknown +output_mode+ → enum error listing valid values.
|
|
77
|
+
# * Path outside the workspace → caught from {Tool::Workspace::Error}.
|
|
78
|
+
# * Nonexistent path → +"Error: path not found: <path>"+.
|
|
79
|
+
class Grep < Tool
|
|
80
|
+
# @return [Integer] hard byte cap on combined rg output. Same value
|
|
81
|
+
# as {Tool::Read::MAX_BYTES} so the two file-touching tools share
|
|
82
|
+
# a budget shape.
|
|
83
|
+
MAX_BYTES = 50 * 1024
|
|
84
|
+
|
|
85
|
+
# @return [String] human-readable form of {MAX_BYTES} for the
|
|
86
|
+
# truncation marker.
|
|
87
|
+
MAX_BYTES_LABEL = "#{MAX_BYTES / 1024} KB"
|
|
88
|
+
|
|
89
|
+
# @return [Integer] per-line cap passed to rg's +--max-columns+.
|
|
90
|
+
# Long lines are truncated server-side with a preview marker.
|
|
91
|
+
MAX_LINE_LENGTH = 2000
|
|
92
|
+
|
|
93
|
+
# @return [Array<String>] valid +output_mode+ values.
|
|
94
|
+
OUTPUT_MODES = %w[content files_with_matches count].freeze
|
|
95
|
+
|
|
96
|
+
# @return [String] default +output_mode+.
|
|
97
|
+
DEFAULT_OUTPUT_MODE = 'content'
|
|
98
|
+
|
|
99
|
+
# Description shown to the LLM. opencode-shape (summary + +Usage:+
# bullets). Per-parameter constraints live in parameter descriptions.
#
# The regex bullet names both features missing from rg's default engine
# (look-around and backreferences) so the model does not spend a call on
# a look-ahead or backreference pattern that rg will reject.
#
# @return [String]
DESCRIPTION = <<~DESC
  Search file contents for a regex pattern across the workspace.

  Usage:
  - Wraps `ripgrep` — regex syntax is rg's Rust-regex dialect (mostly PCRE-compatible; no look-around or backreferences).
  - Default search root is the workspace root; pass `path` to narrow to a file or subdirectory.
  - Respects `.gitignore` — for unfiltered search use bash `rg --no-ignore <pattern>`.
  - Use `glob` to filter by filename, e.g. `"*.rb"` or `"src/**/*.{ts,tsx}"`.
  - `output_mode` controls verbosity: `content` (default, file:line:text), `files_with_matches` (paths only), `count` (matches per file).
  - Use `files_with_matches` first to scope a broad search, then `content` (or `read`) to investigate — saves tokens.
  - Output is truncated to #{MAX_BYTES_LABEL}; refine the pattern or narrow `path` if the response ends in a truncation marker.
  - Long lines are truncated to #{MAX_LINE_LENGTH} chars with a preview marker; use `read` to see full lines.
DESC
|
|
117
|
+
|
|
118
|
+
# Build the +grep+ tool bound to one workspace.
#
# The rg availability probe runs first so a missing binary fails at
# construction time instead of on the first tool call. The parameter
# schema and the execute callback are assembled as locals and then
# handed to the superclass constructor.
#
# @param workspace [Tool::Workspace] captured by the execute lambda;
#   passed through to {.search} on every call.
# @raise [RuntimeError] if +rg+ isn't on +PATH+
# @return [Grep]
def initialize(workspace:)
  Grep.send(:check_binaries!)

  schema = Parameters.build do |p|
    p.required_string :pattern,
                      'Regex pattern to search for (rg Rust-regex ' \
                      'dialect), e.g. "def\s+\w+" or "TODO".'
    p.optional_string :path,
                      'File or directory to search. Relative paths ' \
                      'resolve against the workspace root. Defaults ' \
                      'to the workspace root, e.g. "lib/" or "lib/foo.rb".'
    p.optional_string :glob,
                      'Filename glob to filter files, e.g. "*.rb" ' \
                      'or "src/**/*.{ts,tsx}".'
    p.optional_boolean :case_insensitive,
                       'Match case-insensitively. Defaults to false, e.g. true.'
    p.optional_string :output_mode,
                      "One of #{OUTPUT_MODES.join(', ')}. Defaults to " \
                      "#{DEFAULT_OUTPUT_MODE}, e.g. \"files_with_matches\"."
  end

  # Keyword defaults mirror the documented parameter defaults.
  runner = lambda do |pattern:, path: nil, glob: nil,
                      case_insensitive: false, output_mode: DEFAULT_OUTPUT_MODE|
    Grep.search(workspace: workspace, pattern: pattern, path: path,
                glob: glob, case_insensitive: case_insensitive,
                output_mode: output_mode)
  end

  super(name: 'grep', description: DESCRIPTION, parameters: schema, execute: runner)
end
|
|
154
|
+
|
|
155
|
+
# Validate inputs, resolve the path against the workspace, spawn rg,
# and render the observation.
#
# Returns the formatted results (exit 0), a "no matches" string
# (exit 1), or an +"Error: ..."+ string for every refusal and for any
# other rg exit status.
#
# @param workspace [Tool::Workspace] resolves +path+ and supplies the
#   +chdir+ for the rg subprocess
# @param pattern [String] regex handed to rg; empty is rejected early
# @param path [String, nil] file or directory to search; +nil+ searches +.+
# @param glob [String, nil] optional +--glob+ filter
# @param case_insensitive [Boolean] adds +-i+ when truthy
# @param output_mode [String] one of {OUTPUT_MODES}
# @return [String] tool observation
def self.search(workspace:, pattern:, path:, glob:, case_insensitive:, output_mode:)
  return 'Error: empty pattern.' if pattern.empty?
  unless OUTPUT_MODES.include?(output_mode)
    return "Error: output_mode must be one of #{OUTPUT_MODES.join(', ')}, " \
           "got #{output_mode.inspect}."
  end

  search_target = '.'
  if path
    resolved = workspace.resolve_for_read(path)
    return "Error: path not found: #{path}" unless resolved.exist?

    # rg runs with chdir at the workspace root, so hand it a relative path.
    search_target = resolved.relative_path_from(workspace.cwd).to_s
  end

  argv = build_argv(pattern: pattern, glob: glob,
                    case_insensitive: case_insensitive,
                    output_mode: output_mode, path: search_target)

  result = Pikuri::Subprocess.spawn(*argv, chdir: workspace.cwd.to_s).wait
  exit_code = result.status.exitstatus

  case exit_code
  when 0
    format_output(result.output, output_mode: output_mode,
                                 pattern: pattern, path: path)
  when 1
    no_match_message(pattern: pattern, path: path)
  else
    # rg exits 2 for *any* error, including soft ones (e.g. one unreadable
    # file) where matches were still printed. The output here is the
    # combined stream, so it can be as large as a successful search;
    # apply the same byte cap instead of returning it unbounded.
    detail, marker = head_truncate(result.output.strip)
    if detail.empty?
      # exitstatus is nil when the process did not exit normally.
      detail = exit_code ? "exited #{exit_code}" : 'terminated abnormally (no exit status)'
    end
    "Error: ripgrep: #{detail}#{marker}"
  end
rescue Tool::Workspace::Error => e
  "Error: #{e.message}"
end
|
|
203
|
+
|
|
204
|
+
# Assemble the +rg+ command line.
#
# Layout: fixed flags, then the optional flags (+-i+, +--glob+, the
# output-mode switch), then +--+ followed by the pattern and the search
# path. The path is always present; callers pass +.+ for the root.
#
# @param pattern [String] regex, placed after +--+
# @param glob [String, nil] value for +--glob+, omitted when +nil+
# @param case_insensitive [Boolean] adds +-i+ when truthy
# @param output_mode [String] +files_with_matches+ and +count+ add a
#   flag; any other value adds none
# @param path [String] search target
# @return [Array<String>]
def self.build_argv(pattern:, glob:, case_insensitive:, output_mode:, path:)
  mode_switch = {
    'files_with_matches' => '--files-with-matches',
    'count' => '--count-matches'
  }[output_mode]

  fixed = %W[
    rg --line-number --color=never --no-heading --with-filename --hidden
    --max-columns=#{MAX_LINE_LENGTH} --max-columns-preview --sort=path
  ]

  extras = []
  extras << '-i' if case_insensitive
  extras.push('--glob', glob) if glob
  extras << mode_switch if mode_switch

  fixed + extras + ['--', pattern, path]
end
|
|
229
|
+
private_class_method :build_argv
|
|
230
|
+
|
|
231
|
+
# Turn raw rg output into the observation shown to the model.
#
# Pipeline: drop leading +./+ from each row, head-truncate, trim the
# trailing newline, then append a blank line plus the summary footer
# (with the truncation marker, if any, glued onto the footer). If
# nothing is left after trimming, the "no matches" message is returned
# instead.
#
# @param raw [String] rg output
# @param output_mode [String] forwarded to the footer builder
# @param pattern [String] used only for the "no matches" message
# @param path [String, nil] used only for the "no matches" message
# @return [String]
def self.format_output(raw, output_mode:, pattern:, path:)
  body, marker = head_truncate(strip_dot_slash(raw))
  body = body.chomp

  if body.empty?
    no_match_message(pattern: pattern, path: path)
  else
    tail = build_footer(body, output_mode) + marker
    "#{body}\n\n#{tail}"
  end
end
|
|
246
|
+
private_class_method :format_output
|
|
247
|
+
|
|
248
|
+
# Remove a +./+ prefix from the start of every line in +raw+.
#
# Only a +./+ sitting at the very beginning of a line is removed;
# occurrences later in a line are left alone.
#
# @param raw [String] multi-line rg output
# @return [String] a new String with the prefixes removed
def self.strip_dot_slash(raw)
  leading_dot_slash = %r{^\./}
  raw.gsub(leading_dot_slash, '')
end
|
|
257
|
+
private_class_method :strip_dot_slash
|
|
258
|
+
|
|
259
|
+
# Head-truncate +raw+ to {MAX_BYTES}, cutting at the last newline
# inside the byte window so the final row is never partial.
#
# @param raw [String] text to cap
# @return [Array(String, String)] the kept head and a marker String
#   reporting omitted and total bytes (empty when nothing was cut)
def self.head_truncate(raw)
  total = raw.bytesize
  return [raw, ''] if total <= MAX_BYTES

  head = raw.byteslice(0, MAX_BYTES)
  # String#rindex answers in *characters*, but byteslice wants *bytes*.
  # Searching a binary view makes the two agree; otherwise multibyte
  # output would be cut well before the last newline, possibly mid-row.
  last_nl = head.b.rindex("\n")
  head = head.byteslice(0, last_nl) if last_nl
  omitted = total - head.bytesize
  marker = "\n\n... [#{omitted} bytes omitted; total was #{total} bytes; " \
           'refine pattern or path] ...'
  [head, marker]
end
|
|
276
|
+
private_class_method :head_truncate
|
|
277
|
+
|
|
278
|
+
# Compose the one-line summary footer from rg's text output.
#
# * +content+ — counts rows as matches and distinct paths as files.
# * +files_with_matches+ — counts rows as files.
# * +count+ — sums the trailing +:N+ of each row and counts rows as files.
#
# Any other +output_mode+ yields +nil+.
#
# @param content [String] newline-separated rg rows
# @param output_mode [String]
# @return [String, nil]
def self.build_footer(content, output_mode)
  lines = content.split("\n").reject(&:empty?)
  case output_mode
  when 'content'
    # Rows look like +path:line:text+. Take the path as everything before
    # the first +:<digits>:+ so a path that itself contains a colon is not
    # cut short; fall back to the first-colon split if no such run exists.
    files = lines.map { |row| row[/\A(.+?):\d+:/, 1] || row.split(':', 2).first }.uniq
    "Found #{pluralize(lines.size, 'match', 'matches')} in " \
      "#{pluralize(files.size, 'file', 'files')}."
  when 'files_with_matches'
    "Found #{pluralize(lines.size, 'file', 'files')}."
  when 'count'
    # The count is the last colon-separated field, so colons in the path
    # do not matter here.
    total = lines.sum { |row| Integer(row.split(':').last) }
    "Found #{pluralize(total, 'match', 'matches')} in " \
      "#{pluralize(lines.size, 'file', 'files')}."
  end
end
|
|
297
|
+
private_class_method :build_footer
|
|
298
|
+
|
|
299
|
+
# Render a count with the matching noun form.
#
# @param n [Integer] the count
# @param sing [String] noun used when +n+ equals 1
# @param plural [String] noun used for every other count
# @return [String] e.g. +"1 match"+ / +"2 matches"+
def self.pluralize(n, sing, plural)
  noun = n == 1 ? sing : plural
  "#{n} #{noun}"
end
|
|
303
|
+
private_class_method :pluralize
|
|
304
|
+
|
|
305
|
+
# Build the "no matches" observation, naming the searched path when
# one was given.
#
# @param pattern [String] echoed inside single quotes
# @param path [String, nil] appended as +" in <path>"+ when present
# @return [String]
def self.no_match_message(pattern:, path:)
  scope = path ? " in #{path}" : ''
  "No matches for pattern '#{pattern}'#{scope}."
end
|
|
311
|
+
private_class_method :no_match_message
|
|
312
|
+
|
|
313
|
+
# Probe for +rg+ by running +rg --version+ through
# {Pikuri::Subprocess.spawn}.
#
# Two failure shapes map to the same error: the spawn raising
# +Errno::ENOENT+, and the probe finishing with a non-success status.
#
# @return [void]
# @raise [RuntimeError] carrying the install hint when the probe fails
def self.check_binaries!
  probe = Pikuri::Subprocess.spawn('rg', '--version', chdir: '/').wait
  raise install_hint unless probe.status.success?
rescue Errno::ENOENT
  raise install_hint
end
|
|
328
|
+
private_class_method :check_binaries!
|
|
329
|
+
|
|
330
|
+
# Message raised when +rg+ cannot be run: states the requirement and
# suggests how to install it.
#
# @return [String]
def self.install_hint
  requirement = "Tool::Grep requires 'rg' (ripgrep) on PATH"
  remedy = "install via your distro's package manager (e.g. 'apt install ripgrep')."
  "#{requirement}; #{remedy}"
end
|
|
335
|
+
private_class_method :install_hint
|
|
336
|
+
end
|
|
337
|
+
end
|
|
338
|
+
end
|
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pikuri
|
|
4
|
+
class Tool
|
|
5
|
+
# The +read+ tool, expressed as a {Tool} subclass: instantiating
|
|
6
|
+
# +Tool::Read.new(workspace: ws)+ produces a tool whose
|
|
7
|
+
# {Tool#to_ruby_llm_tool} wiring is identical to any bundled tool's,
|
|
8
|
+
# so ruby_llm sees nothing special about it. Same shape as
|
|
9
|
+
# {Tool::SubAgent} — workspace is captured by the +execute+ closure
|
|
10
|
+
# at construction.
|
|
11
|
+
#
|
|
12
|
+
# == Output format
|
|
13
|
+
#
|
|
14
|
+
# cat-n: each line is rendered as +"%6d\t%s"+ (six-column left-
|
|
15
|
+
# padded line number, tab, content). Chosen for breadth of training-
|
|
16
|
+
# data exposure: +cat -n+ output shows up across virtually every Unix
|
|
17
|
+
# tutorial and Stack Overflow answer, so even small local models
|
|
18
|
+
# recognize the shape. opencode's shorter +"<n>: <content>"+ format
|
|
19
|
+
# saves a few thousand tokens per 2K-line file but trades model
|
|
20
|
+
# familiarity; pi omits line numbers entirely (cheapest tokens, but
|
|
21
|
+
# the model loses the ability to cite ranges or pick {Edit}
|
|
22
|
+
# boundaries precisely).
|
|
23
|
+
#
|
|
24
|
+
# == Truncation rules
|
|
25
|
+
#
|
|
26
|
+
# Two independent limits, whichever fires first wins:
|
|
27
|
+
#
|
|
28
|
+
# * *Line limit* — {DEFAULT_LIMIT} lines (overridable via +limit+).
|
|
29
|
+
# * *Byte cap* — {MAX_BYTES} bytes of input content; not exposed as a
|
|
30
|
+
# parameter. Bypassable in practice by paging via +offset+.
|
|
31
|
+
#
|
|
32
|
+
# Additionally, individual lines longer than {MAX_LINE_LENGTH} chars
|
|
33
|
+
# are truncated with {LINE_TRUNCATION_MARKER} appended; the model is
|
|
34
|
+
# told to reach for +grep+ to find content inside such files.
|
|
35
|
+
#
|
|
36
|
+
# == Refusals
|
|
37
|
+
#
|
|
38
|
+
# * Path outside the workspace → caught from
|
|
39
|
+
# {Tool::Workspace::Error}, returned as +"Error: ..."+.
|
|
40
|
+
# * File not found, EACCES → +"Error: ..."+.
|
|
41
|
+
# * Path is a directory → +"Error: ... use the glob tool"+, keeping
|
|
42
|
+
# directory listing as the glob tool's responsibility (Step 9).
|
|
43
|
+
# * Binary content → sniffed from the first {BINARY_SAMPLE_BYTES} of
|
|
44
|
+
# the file: any +NUL+ byte, or more than {BINARY_NONPRINTABLE_THRESHOLD}
|
|
45
|
+
# non-printable bytes (control chars outside +\t \n \v \f \r+),
|
|
46
|
+
# triggers refusal. Catches images, PDFs, archives, and compiled
|
|
47
|
+
# artifacts without an extension list to maintain.
|
|
48
|
+
# * Offset past EOF → +"Error: offset N is beyond end of file (M lines total)"+.
|
|
49
|
+
class Read < Tool
|
|
50
|
+
# @return [Integer] default value of the +limit+ parameter (number
|
|
51
|
+
# of lines to read per call).
|
|
52
|
+
DEFAULT_LIMIT = 2000
|
|
53
|
+
|
|
54
|
+
# @return [Integer] per-line character cap; longer lines are
|
|
55
|
+
# truncated with {LINE_TRUNCATION_MARKER}.
|
|
56
|
+
MAX_LINE_LENGTH = 2000
|
|
57
|
+
|
|
58
|
+
# @return [String] suffix appended to lines truncated by
|
|
59
|
+
# {MAX_LINE_LENGTH}.
|
|
60
|
+
LINE_TRUNCATION_MARKER = "... (line truncated to #{MAX_LINE_LENGTH} chars)"
|
|
61
|
+
|
|
62
|
+
# @return [Integer] hard byte cap on input content collected per
|
|
63
|
+
# call. Counted on the line bytes (plus one for the joining
|
|
64
|
+
# newline); the rendered output is slightly larger due to the
|
|
65
|
+
# per-line +"%6d\t"+ prefix.
|
|
66
|
+
MAX_BYTES = 50 * 1024
|
|
67
|
+
|
|
68
|
+
# @return [String] human-readable form of {MAX_BYTES} for the
|
|
69
|
+
# continuation marker.
|
|
70
|
+
MAX_BYTES_LABEL = "#{MAX_BYTES / 1024} KB"
|
|
71
|
+
|
|
72
|
+
# @return [Integer] number of bytes sampled from the start of the
|
|
73
|
+
# file for binary-content detection.
|
|
74
|
+
BINARY_SAMPLE_BYTES = 4096
|
|
75
|
+
|
|
76
|
+
# @return [Float] fraction of the sample that may be non-printable
|
|
77
|
+
# before the file is classified as binary. Matches opencode's
|
|
78
|
+
# 30%.
|
|
79
|
+
BINARY_NONPRINTABLE_THRESHOLD = 0.30
|
|
80
|
+
|
|
81
|
+
# Description shown to the LLM. Follows the opencode-shape (summary
|
|
82
|
+
# + +Usage:+ bullets) prescribed by the project's tool-description
|
|
83
|
+
# convention. Per-parameter constraints (defaults, format) live in
|
|
84
|
+
# the parameter descriptions, not here.
|
|
85
|
+
#
|
|
86
|
+
# @return [String]
|
|
87
|
+
DESCRIPTION = <<~DESC
|
|
88
|
+
Read a file from the workspace and return its contents with line numbers.
|
|
89
|
+
|
|
90
|
+
Usage:
|
|
91
|
+
- Output is line-numbered in `cat -n` style so subsequent edits can reference exact line numbers.
|
|
92
|
+
- Use `offset` and `limit` to page through large files; when the response ends in `Use offset=N to continue`, call again with that offset.
|
|
93
|
+
- Lines longer than #{MAX_LINE_LENGTH} chars are truncated with a marker — use `grep` for content inside such files.
|
|
94
|
+
- Binary files (images, PDFs, archives, compiled artifacts) are refused; this tool reads text only.
|
|
95
|
+
- Directories are refused — use the `glob` tool to list files.
|
|
96
|
+
- If unsure of the path, use `glob` first to look up filenames.
|
|
97
|
+
- Avoid tiny repeated slices — if you need more context, read a larger window.
|
|
98
|
+
DESC
|
|
99
|
+
|
|
100
|
+
# Build the +read+ tool bound to one workspace.
#
# The parameter schema and the execute callback are assembled as locals
# and then handed to the superclass constructor.
#
# @param workspace [Tool::Workspace] captured by the execute lambda and
#   passed through to {.read} on every call.
# @return [Read]
def initialize(workspace:)
  schema = Parameters.build do |p|
    p.required_string :path,
                      'Path to the file to read. Relative paths ' \
                      'resolve against the workspace root, e.g. ' \
                      '"lib/foo.rb" or "/abs/path/to/file.txt".'
    p.optional_integer :offset,
                       'Line number to start reading from (1-indexed). ' \
                       'Defaults to 1, e.g. 200.'
    p.optional_integer :limit,
                       'Maximum number of lines to read. Defaults to ' \
                       "#{DEFAULT_LIMIT}, e.g. 500."
  end

  # Keyword defaults mirror the documented parameter defaults.
  reader = lambda do |path:, offset: 1, limit: DEFAULT_LIMIT|
    Read.read(workspace: workspace, path: path, offset: offset, limit: limit)
  end

  super(name: 'read', description: DESCRIPTION, parameters: schema, execute: reader)
end
|
|
124
|
+
|
|
125
|
+
# Entry point for the +read+ tool: validate the window, resolve the
# path, refuse what cannot be shown, and otherwise render the slice.
#
# Checks run in this order: +offset+/+limit+ bounds, path resolution,
# existence, directory, binary sniff. Workspace refusals and +EACCES+
# are converted into +"Error: ..."+ strings rather than raised.
#
# @param workspace [Tool::Workspace]
# @param path [String] raw path as supplied by the LLM
# @param offset [Integer] 1-indexed line number to start at
# @param limit [Integer] maximum lines to return
# @return [String] tool observation
def self.read(workspace:, path:, offset:, limit:)
  return "Error: offset must be >= 1, got #{offset}" if offset < 1
  return "Error: limit must be >= 1, got #{limit}" if limit < 1

  target = workspace.resolve_for_read(path)
  return "Error: file not found: #{path}" unless target.exist?

  if target.directory?
    "Error: #{path} is a directory; use the glob tool to list files."
  elsif binary?(read_sample(target))
    "Error: cannot read binary file: #{path}"
  else
    format_slice(path: path, resolved: target, offset: offset, limit: limit)
  end
rescue Tool::Workspace::Error => e
  "Error: #{e.message}"
rescue Errno::EACCES => e
  "Error: cannot read #{path}: #{e.message}"
end
|
|
151
|
+
|
|
152
|
+
# Fetch the leading bytes of a file for the binary sniff.
#
# Opens the file in binary mode and reads at most {BINARY_SAMPLE_BYTES}.
# A read that yields +nil+ (nothing to read) is replaced by an empty
# String.
#
# @param resolved [Pathname]
# @return [String] raw bytes
def self.read_sample(resolved)
  resolved.open('rb') do |io|
    chunk = io.read(BINARY_SAMPLE_BYTES)
    chunk.nil? ? +'' : chunk
  end
end
|
|
161
|
+
private_class_method :read_sample
|
|
162
|
+
|
|
163
|
+
# Decide whether a byte sample looks like binary content.
#
# Rules, in order:
# * an empty sample is not binary;
# * any NUL byte makes it binary;
# * otherwise count bytes below 9 or in 14..31 and compare their share
#   of the sample against {BINARY_NONPRINTABLE_THRESHOLD} (strictly
#   greater means binary).
#
# Bytes 9..13 (+\t \n \v \f \r+) and everything from 32 upward are never
# counted, so UTF-8 multibyte sequences pass through unflagged.
#
# Public because {Tool::Edit} reuses it to refuse binary targets.
#
# @param bytes [String] sample bytes
# @return [Boolean]
def self.binary?(bytes)
  return false if bytes.empty?
  return true if bytes.each_byte.any?(&:zero?)

  suspicious = bytes.each_byte.count { |b| b < 9 || b.between?(14, 31) }
  Float(suspicious) / bytes.bytesize > BINARY_NONPRINTABLE_THRESHOLD
end
|
|
188
|
+
|
|
189
|
+
# Render a window of the file in cat-n form with a trailer line.
#
# Walks the file one line at a time. Lines before +offset+ are only
# counted. Up to +limit+ lines are then collected, each chomped and, if
# longer than {MAX_LINE_LENGTH} characters, cut and suffixed with
# {LINE_TRUNCATION_MARKER}. Collection stops early when adding a line
# (its bytes plus one for the joining newline) would exceed {MAX_BYTES}.
# After the line limit is reached the walk continues without collecting
# so the trailer can report the total line count; after the byte cap it
# stops, and the trailer omits the total.
#
# @param path [String] accepted for interface parity; not used here
# @param resolved [Pathname] file to stream
# @param offset [Integer] 1-indexed first line
# @param limit [Integer] maximum lines to collect
# @return [String] the numbered body plus trailer, +"(Empty file)"+, or
#   an +"Error: ..."+ string when +offset+ is past the last line
def self.format_slice(path:, resolved:, offset:, limit:)
  skip = offset - 1
  window = []
  seen = 0
  used_bytes = 0
  capped = false
  more = false

  resolved.each_line do |raw|
    seen += 1
    next unless seen > skip

    unless window.length < limit
      # Past the window: keep counting lines, collect nothing.
      more = true
      next
    end

    text = raw.chomp
    text = text[0, MAX_LINE_LENGTH] + LINE_TRUNCATION_MARKER if text.length > MAX_LINE_LENGTH

    cost = text.bytesize + 1 # +1 for the joining newline
    if used_bytes + cost > MAX_BYTES
      capped = true
      more = true
      break
    end

    window << text
    used_bytes += cost
  end

  return '(Empty file)' if seen.zero?
  return "Error: offset #{offset} is beyond end of file (#{seen} lines total)" if skip >= seen

  last_line = offset + window.length - 1
  next_offset = last_line + 1
  body = window.map.with_index(offset) { |text, number| format("%6d\t%s", number, text) }.join("\n")

  trailer =
    if capped
      "(Output capped at #{MAX_BYTES_LABEL}. Showing lines #{offset}-#{last_line}. " \
        "Use offset=#{next_offset} to continue.)"
    elsif more
      "(Showing lines #{offset}-#{last_line} of #{seen}. " \
        "Use offset=#{next_offset} to continue.)"
    else
      "(End of file - total #{seen} lines)"
    end

  "#{body}\n\n#{trailer}"
end
|
|
251
|
+
private_class_method :format_slice
|
|
252
|
+
end
|
|
253
|
+
end
|
|
254
|
+
end
|