pikuri-workspace 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,338 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pikuri
4
+ class Tool
5
+ # The +grep+ tool — content search across the workspace via
6
+ # +ripgrep+. Instantiating +Tool::Grep.new(workspace: ws)+ produces a
7
+ # tool whose {Tool#to_ruby_llm_tool} wiring is identical to any
8
+ # bundled tool's. Same shape as {Tool::Read} (workspace captured by
9
+ # the +execute+ closure, no confirmer — search is read-only).
10
+ #
11
+ # == ripgrep dependency
12
+ #
13
+ # Hard dependency: {.check_binaries!} runs in +initialize+ and raises
14
+ # if +rg+ isn't on +PATH+. Mirrors {Tool::Bash}'s posture for
15
+ # +bash+/+timeout+. We don't ship a Ruby fallback — replicating
16
+ # rg's Rust-regex dialect, glob handling, and +.gitignore+ parsing
17
+ # is a research-loop dead end. Failure message includes the install
18
+ # hint.
19
+ #
20
+ # == Argv
21
+ #
22
+ # rg --line-number --color=never --no-heading --with-filename \
23
+ # --hidden --max-columns=2000 --max-columns-preview \
24
+ # --sort=path \
25
+ # [-i] [--glob <g>] [--files-with-matches|--count-matches] \
26
+ # -- <pattern> <relative-path-or-dot>
27
+ #
28
+ # * +--no-heading+ + +--with-filename+ → flat +path:line:content+ rows
29
+ # regardless of whether the search target is a directory or a single
30
+ # file (rg defaults to suppressing the filename for single-file
31
+ # searches — we force it on for output consistency).
32
+ # * +--hidden+ → search dotfiles (still respects +.gitignore+).
33
+ # * +--max-columns=2000 --max-columns-preview+ → rg truncates lines
34
+ # longer than {MAX_LINE_LENGTH} bytes server-side and appends a
35
+ # preview marker, sparing us per-line truncation.
36
+ # * +--sort=path+ → deterministic output (single-threaded; fine for
37
+ # typical repos under ~10k files). Makes specs assertable and gives
38
+ # the model a stable order to scan.
39
+ # * Subprocess runs with +chdir: workspace.cwd+ and is *always* given
40
+ # an explicit path argument. {Pikuri::Subprocess.spawn} uses
41
+ # +popen2e+ which gives the child a piped (non-tty) stdin; rg's
42
+ # default heuristic on no-path-arg-with-piped-stdin is to search
43
+ # stdin (which we then close — yielding zero matches). Passing the
44
+ # path argument explicitly bypasses the heuristic. Output paths
45
+ # come back as +./...+ when the path is +.+; the leading +./+ is
46
+ # stripped post-rg so the model sees clean workspace-relative paths.
47
+ #
48
+ # == Output modes
49
+ #
50
+ # * +content+ (default) — +path:line:content+ rows.
51
+ # * +files_with_matches+ — just file paths, one per line.
52
+ # * +count+ — +path:count+ per file.
53
+ #
54
+ # Use +files_with_matches+ to scope a broad search cheaply before
55
+ # paying tokens for +content+.
56
+ #
57
+ # == Truncation
58
+ #
59
+ # Total output is head-truncated to {MAX_BYTES} (head-only — grep
60
+ # tails usually carry less signal than the first matches; opposite
61
+ # bias from {Tool::Bash}). Cut at the last line boundary, with a
62
+ # marker reporting omitted bytes and the original total so the model
63
+ # knows how much it missed.
64
+ #
65
+ # == Exit codes
66
+ #
67
+ # * +0+ → matches; format with footer.
68
+ # * +1+ → no matches; return +"No matches for pattern '...'"+.
69
+ # * +2+ → rg error (bad regex, missing path); return +"Error: ripgrep: ..."+.
70
+ #
71
+ # == Refusals
72
+ #
73
+ # All returned as +"Error: ..."+ observations:
74
+ #
75
+ # * Empty +pattern+ → fast reject.
76
+ # * Unknown +output_mode+ → enum error listing valid values.
77
+ # * Path outside the workspace → caught from {Tool::Workspace::Error}.
78
+ # * Nonexistent path → +"Error: path not found: <path>"+.
79
+ class Grep < Tool
80
+ # @return [Integer] hard byte cap on combined rg output. Same value
81
+ # as {Tool::Read::MAX_BYTES} so the two file-touching tools share
82
+ # a budget shape.
83
+ MAX_BYTES = 50 * 1024
84
+
85
+ # @return [String] human-readable form of {MAX_BYTES} for the
86
+ # truncation marker.
87
+ MAX_BYTES_LABEL = "#{MAX_BYTES / 1024} KB"
88
+
89
+ # @return [Integer] per-line cap passed to rg's +--max-columns+.
90
+ # Long lines are truncated server-side with a preview marker.
91
+ MAX_LINE_LENGTH = 2000
92
+
93
+ # @return [Array<String>] valid +output_mode+ values.
94
+ OUTPUT_MODES = %w[content files_with_matches count].freeze
95
+
96
+ # @return [String] default +output_mode+.
97
+ DEFAULT_OUTPUT_MODE = 'content'
98
+
99
+ # Description shown to the LLM. opencode-shape (summary + +Usage:+
100
+ # bullets). Per-parameter constraints live in parameter
101
+ # descriptions.
102
+ #
103
+ # @return [String]
104
+ DESCRIPTION = <<~DESC
105
+ Search file contents for a regex pattern across the workspace.
106
+
107
+ Usage:
108
+ - Wraps `ripgrep` — regex syntax is rg's Rust-regex dialect (mostly PCRE-compatible; no lookbehind).
109
+ - Default search root is the workspace root; pass `path` to narrow to a file or subdirectory.
110
+ - Respects `.gitignore` — for unfiltered search use bash `rg --no-ignore <pattern>`.
111
+ - Use `glob` to filter by filename, e.g. `"*.rb"` or `"src/**/*.{ts,tsx}"`.
112
+ - `output_mode` controls verbosity: `content` (default, file:line:text), `files_with_matches` (paths only), `count` (matches per file).
113
+ - Use `files_with_matches` first to scope a broad search, then `content` (or `read`) to investigate — saves tokens.
114
+ - Output is truncated to #{MAX_BYTES_LABEL}; refine the pattern or narrow `path` if the response ends in a truncation marker.
115
+ - Long lines are truncated to #{MAX_LINE_LENGTH} chars with a preview marker; use `read` to see full lines.
116
+ DESC
117
+
118
# Build the +grep+ tool bound to +workspace+.
#
# Confirms +rg+ is usable first (via {.check_binaries!}), then passes
# the tool name, description, parameter schema, and +execute+
# callable to the superclass constructor.
#
# @param workspace [Tool::Workspace] captured by the +execute+
#   closure and forwarded to {.search} on every call.
# @raise [RuntimeError] if the +rg+ check fails; raised here, at
#   construction, rather than on the first tool call.
# @return [Grep]
def initialize(workspace:)
  Grep.send(:check_binaries!)

  schema = Parameters.build do |p|
    p.required_string :pattern,
                      'Regex pattern to search for (rg Rust-regex ' \
                      'dialect), e.g. "def\s+\w+" or "TODO".'
    p.optional_string :path,
                      'File or directory to search. Relative paths ' \
                      'resolve against the workspace root. Defaults ' \
                      'to the workspace root, e.g. "lib/" or "lib/foo.rb".'
    p.optional_string :glob,
                      'Filename glob to filter files, e.g. "*.rb" ' \
                      'or "src/**/*.{ts,tsx}".'
    p.optional_boolean :case_insensitive,
                       'Match case-insensitively. Defaults to false, e.g. true.'
    p.optional_string :output_mode,
                      "One of #{OUTPUT_MODES.join(', ')}. Defaults to " \
                      "#{DEFAULT_OUTPUT_MODE}, e.g. \"files_with_matches\"."
  end

  # Optional arguments the caller omits fall back to the defaults
  # declared here before being handed to .search.
  runner = lambda do |pattern:, path: nil, glob: nil,
                      case_insensitive: false, output_mode: DEFAULT_OUTPUT_MODE|
    Grep.search(workspace: workspace, pattern: pattern, path: path,
                glob: glob, case_insensitive: case_insensitive,
                output_mode: output_mode)
  end

  super(name: 'grep', description: DESCRIPTION, parameters: schema, execute: runner)
end
154
+
155
# Run one search and return the observation text.
#
# Steps: reject an empty pattern or unknown +output_mode+, resolve
# +path+ through the workspace (when given), build the rg argv,
# spawn rg with the workspace directory as +chdir+, and translate
# the exit status:
#
# * +0+ → formatted results,
# * +1+ → the "no matches" message,
# * anything else → +"Error: ripgrep: ..."+ carrying rg's output
#   (or +"exited N"+ when rg printed nothing).
#
# @param workspace [Tool::Workspace]
# @param pattern [String]
# @param path [String, nil] when nil the search target is +.+
# @param glob [String, nil]
# @param case_insensitive [Boolean]
# @param output_mode [String] one of {OUTPUT_MODES}
# @return [String] results, a "no matches" line, or +"Error: ..."+
def self.search(workspace:, pattern:, path:, glob:, case_insensitive:, output_mode:)
  return 'Error: empty pattern.' if pattern.empty?

  unless OUTPUT_MODES.include?(output_mode)
    allowed = OUTPUT_MODES.join(', ')
    return "Error: output_mode must be one of #{allowed}, got #{output_mode.inspect}."
  end

  target = '.'
  if path
    resolved = workspace.resolve_for_read(path)
    return "Error: path not found: #{path}" unless resolved.exist?

    # rg runs with chdir set to the workspace cwd, so hand it a path
    # relative to that directory.
    target = resolved.relative_path_from(workspace.cwd).to_s
  end

  command = build_argv(pattern: pattern, glob: glob,
                       case_insensitive: case_insensitive,
                       output_mode: output_mode, path: target)
  finished = Pikuri::Subprocess.spawn(*command, chdir: workspace.cwd.to_s).wait
  code = finished.status.exitstatus

  if code == 0
    format_output(finished.output, output_mode: output_mode,
                                   pattern: pattern, path: path)
  elsif code == 1
    no_match_message(pattern: pattern, path: path)
  else
    detail = finished.output.strip
    detail = "exited #{code}" if detail.empty?
    "Error: ripgrep: #{detail}"
  end
rescue Tool::Workspace::Error => e
  "Error: #{e.message}"
end
203
+
204
# Assemble the argument vector for one +rg+ invocation.
#
# Layout: the fixed flags, then +-i+ (if requested), then
# +--glob <glob>+ (if given), then the mode flag for
# +files_with_matches+ / +count+, then +--+, the pattern, and the
# search path. The path is always included.
#
# @param pattern [String]
# @param glob [String, nil]
# @param case_insensitive [Boolean]
# @param output_mode [String]
# @param path [String] search target passed as rg's path argument
# @return [Array<String>]
def self.build_argv(pattern:, glob:, case_insensitive:, output_mode:, path:)
  fixed = [
    'rg',
    '--line-number',
    '--color=never',
    '--no-heading',
    '--with-filename',
    '--hidden',
    "--max-columns=#{MAX_LINE_LENGTH}",
    '--max-columns-preview',
    '--sort=path'
  ]

  # 'content' (and anything unrecognized) adds no mode flag.
  mode_flag = {
    'files_with_matches' => '--files-with-matches',
    'count' => '--count-matches'
  }[output_mode]

  optional = []
  optional << '-i' if case_insensitive
  optional.push('--glob', glob) if glob
  optional << mode_flag if mode_flag

  # '--' ends option parsing so the pattern is never read as a flag.
  [*fixed, *optional, '--', pattern, path]
end
229
+ private_class_method :build_argv
230
+
231
# Turn raw rg output into the final observation.
#
# Pipeline: drop leading +./+ from each row, head-truncate, remove
# the trailing newline, then append a blank line and the summary
# footer (followed by the truncation marker, if any). If nothing is
# left after cleaning, the "no matches" message is returned instead.
#
# @param raw [String] rg's output
# @param output_mode [String]
# @param pattern [String] used only for the "no matches" message
# @param path [String, nil] used only for the "no matches" message
# @return [String]
def self.format_output(raw, output_mode:, pattern:, path:)
  body, marker = head_truncate(strip_dot_slash(raw))
  body = body.chomp

  return no_match_message(pattern: pattern, path: path) if body.empty?

  summary = build_footer(body, output_mode) + marker
  "#{body}\n\n#{summary}"
end
246
+ private_class_method :format_output
247
+
248
# Remove a +./+ prefix from the start of every line in +raw+.
#
# Only a +./+ sitting at the beginning of a line is removed;
# occurrences further into a line are left alone.
#
# @param raw [String] rg output
# @return [String] a new String with the prefixes removed
def self.strip_dot_slash(raw)
  line_leading_dot_slash = /^\.\//
  raw.gsub(line_leading_dot_slash, '')
end
257
+ private_class_method :strip_dot_slash
258
+
259
# Head-truncate +raw+ to at most {MAX_BYTES} bytes, cutting at the
# last newline inside that window so the final row is never partial.
#
# The newline is located on a binary view of the window so that its
# position is a *byte* offset. (+String#rindex+ on a UTF-8 string
# returns a character index; feeding that to +byteslice+ cuts too
# early — possibly mid-row or mid-character — whenever the output
# contains multibyte characters.)
#
# If the window contains no newline at all, the raw byte window is
# returned unchanged.
#
# @param raw [String] cleaned rg output
# @return [Array(String, String)] the kept content and a marker
#   String reporting omitted and total byte counts (the marker is
#   empty when nothing was truncated)
def self.head_truncate(raw)
  total = raw.bytesize
  return [raw, ''] if total <= MAX_BYTES

  head = raw.byteslice(0, MAX_BYTES)
  # Binary view: indices are byte offsets, and a window that ends in
  # the middle of a multibyte character is still searchable.
  cut = head.b.rindex("\n")
  head = raw.byteslice(0, cut) if cut
  omitted = total - head.bytesize
  marker = "\n\n... [#{omitted} bytes omitted; total was #{total} bytes; " \
           'refine pattern or path] ...'
  [head, marker]
end
276
+ private_class_method :head_truncate
277
+
278
# Produce the one-line summary appended under the results.
#
# All numbers are derived from the text in +content+ (blank lines are
# ignored):
#
# * +content+ — one match per row; files are the distinct text before
#   the first +:+ of each row.
# * +files_with_matches+ — one file per row.
# * +count+ — one file per row; the match total is the sum of the
#   integer after the last +:+ of each row.
#
# @param content [String] formatted result rows
# @param output_mode [String]
# @return [String, nil] the footer; nil for an unrecognized mode
def self.build_footer(content, output_mode)
  rows = content.split("\n").reject(&:empty?)

  case output_mode
  when 'content'
    file_total = rows.map { |row| row.split(':', 2).first }.uniq.size
    matches = pluralize(rows.size, 'match', 'matches')
    "Found #{matches} in #{pluralize(file_total, 'file', 'files')}."
  when 'files_with_matches'
    "Found #{pluralize(rows.size, 'file', 'files')}."
  when 'count'
    match_total = rows.map { |row| Integer(row.split(':').last) }.sum
    matches = pluralize(match_total, 'match', 'matches')
    "Found #{matches} in #{pluralize(rows.size, 'file', 'files')}."
  end
end
297
+ private_class_method :build_footer
298
+
299
# Render a count followed by the matching noun form.
#
# @param n [Integer] the count
# @param sing [String] noun used when +n+ equals 1
# @param plural [String] noun used for every other count
# @return [String] e.g. +"1 match"+ / +"2 matches"+
def self.pluralize(n, sing, plural)
  noun = n == 1 ? sing : plural
  "#{n} #{noun}"
end
303
+ private_class_method :pluralize
304
+
305
# Build the observation returned when the search found nothing.
#
# @param pattern [String] the pattern that was searched for
# @param path [String, nil] the path as supplied by the caller; when
#   present it is appended as +" in <path>"+
# @return [String]
def self.no_match_message(pattern:, path:)
  scope = path ? " in #{path}" : ''
  "No matches for pattern '#{pattern}'#{scope}."
end
311
+ private_class_method :no_match_message
312
+
313
# Probe for a working +rg+ by running +rg --version+ through
# {Pikuri::Subprocess.spawn} (with +chdir: '/'+).
#
# Passes silently when the probe exits successfully. Raises with the
# install hint when the probe exits unsuccessfully or when spawning
# raises +Errno::ENOENT+ (presumably how a missing binary surfaces).
#
# @return [void]
# @raise [RuntimeError] if the probe fails
def self.check_binaries!
  probe = Pikuri::Subprocess.spawn('rg', '--version', chdir: '/').wait
  raise install_hint unless probe.status.success?
rescue Errno::ENOENT
  raise install_hint
end
328
+ private_class_method :check_binaries!
329
+
330
# Message used when the +rg+ probe fails: states the requirement and
# suggests how to install ripgrep.
#
# @return [String]
def self.install_hint
  requirement = "Tool::Grep requires 'rg' (ripgrep) on PATH"
  remedy = "install via your distro's package manager (e.g. 'apt install ripgrep')"
  "#{requirement}; #{remedy}."
end
335
+ private_class_method :install_hint
336
+ end
337
+ end
338
+ end
@@ -0,0 +1,254 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pikuri
4
+ class Tool
5
+ # The +read+ tool, expressed as a {Tool} subclass: instantiating
6
+ # +Tool::Read.new(workspace: ws)+ produces a tool whose
7
+ # {Tool#to_ruby_llm_tool} wiring is identical to any bundled tool's,
8
+ # so ruby_llm sees nothing special about it. Same shape as
9
+ # {Tool::SubAgent} — workspace is captured by the +execute+ closure
10
+ # at construction.
11
+ #
12
+ # == Output format
13
+ #
14
+ # cat-n: each line is rendered as +"%6d\t%s"+ (line number right-
15
+ # aligned in six columns, tab, content). Chosen for breadth of training-
16
+ # data exposure: +cat -n+ output shows up across virtually every Unix
17
+ # tutorial and Stack Overflow answer, so even small local models
18
+ # recognize the shape. opencode's shorter +"<n>: <content>"+ format
19
+ # saves a few thousand tokens per 2K-line file but trades model
20
+ # familiarity; pi omits line numbers entirely (cheapest tokens, but
21
+ # the model loses the ability to cite ranges or pick {Edit}
22
+ # boundaries precisely).
23
+ #
24
+ # == Truncation rules
25
+ #
26
+ # Two independent limits, whichever fires first wins:
27
+ #
28
+ # * *Line limit* — {DEFAULT_LIMIT} lines (overridable via +limit+).
29
+ # * *Byte cap* — {MAX_BYTES} bytes of input content; not exposed as a
30
+ # parameter. Bypassable in practice by paging via +offset+.
31
+ #
32
+ # Additionally, individual lines longer than {MAX_LINE_LENGTH} chars
33
+ # are truncated with {LINE_TRUNCATION_MARKER} appended; the model is
34
+ # told to reach for +grep+ to find content inside such files.
35
+ #
36
+ # == Refusals
37
+ #
38
+ # * Path outside the workspace → caught from
39
+ # {Tool::Workspace::Error}, returned as +"Error: ..."+.
40
+ # * File not found, EACCES → +"Error: ..."+.
41
+ # * Path is a directory → +"Error: ... use the glob tool"+, keeping
42
+ # directory listing as the glob tool's responsibility (Step 9).
43
+ # * Binary content → sniffed from the first {BINARY_SAMPLE_BYTES} of
44
+ # the file: any +NUL+ byte, or more than {BINARY_NONPRINTABLE_THRESHOLD}
45
+ # non-printable bytes (control chars outside +\t \n \v \f \r+),
46
+ # triggers refusal. Catches images, PDFs, archives, and compiled
47
+ # artifacts without an extension list to maintain.
48
+ # * Offset past EOF → +"Error: offset N is beyond end of file (M lines total)"+.
49
+ class Read < Tool
50
+ # @return [Integer] default value of the +limit+ parameter (number
51
+ # of lines to read per call).
52
+ DEFAULT_LIMIT = 2000
53
+
54
+ # @return [Integer] per-line character cap; longer lines are
55
+ # truncated with {LINE_TRUNCATION_MARKER}.
56
+ MAX_LINE_LENGTH = 2000
57
+
58
+ # @return [String] suffix appended to lines truncated by
59
+ # {MAX_LINE_LENGTH}.
60
+ LINE_TRUNCATION_MARKER = "... (line truncated to #{MAX_LINE_LENGTH} chars)"
61
+
62
+ # @return [Integer] hard byte cap on input content collected per
63
+ # call. Counted on the line bytes (plus one for the joining
64
+ # newline); the rendered output is slightly larger due to the
65
+ # per-line +"%6d\t"+ prefix.
66
+ MAX_BYTES = 50 * 1024
67
+
68
+ # @return [String] human-readable form of {MAX_BYTES} for the
69
+ # continuation marker.
70
+ MAX_BYTES_LABEL = "#{MAX_BYTES / 1024} KB"
71
+
72
+ # @return [Integer] number of bytes sampled from the start of the
73
+ # file for binary-content detection.
74
+ BINARY_SAMPLE_BYTES = 4096
75
+
76
+ # @return [Float] fraction of the sample that may be non-printable
77
+ # before the file is classified as binary. Matches opencode's
78
+ # 30%.
79
+ BINARY_NONPRINTABLE_THRESHOLD = 0.30
80
+
81
+ # Description shown to the LLM. Follows the opencode-shape (summary
82
+ # + +Usage:+ bullets) prescribed by the project's tool-description
83
+ # convention. Per-parameter constraints (defaults, format) live in
84
+ # the parameter descriptions, not here.
85
+ #
86
+ # @return [String]
87
+ DESCRIPTION = <<~DESC
88
+ Read a file from the workspace and return its contents with line numbers.
89
+
90
+ Usage:
91
+ - Output is line-numbered in `cat -n` style so subsequent edits can reference exact line numbers.
92
+ - Use `offset` and `limit` to page through large files; when the response ends in `Use offset=N to continue`, call again with that offset.
93
+ - Lines longer than #{MAX_LINE_LENGTH} chars are truncated with a marker — use `grep` for content inside such files.
94
+ - Binary files (images, PDFs, archives, compiled artifacts) are refused; this tool reads text only.
95
+ - Directories are refused — use the `glob` tool to list files.
96
+ - If unsure of the path, use `glob` first to look up filenames.
97
+ - Avoid tiny repeated slices — if you need more context, read a larger window.
98
+ DESC
99
+
100
# Build the +read+ tool bound to +workspace+.
#
# Passes the tool name, description, parameter schema, and +execute+
# callable to the superclass constructor.
#
# @param workspace [Tool::Workspace] captured by the +execute+
#   closure and forwarded to {.read} on every call.
# @return [Read]
def initialize(workspace:)
  schema = Parameters.build do |p|
    p.required_string :path,
                      'Path to the file to read. Relative paths ' \
                      'resolve against the workspace root, e.g. ' \
                      '"lib/foo.rb" or "/abs/path/to/file.txt".'
    p.optional_integer :offset,
                       'Line number to start reading from (1-indexed). ' \
                       "Defaults to 1, e.g. 200."
    p.optional_integer :limit,
                       'Maximum number of lines to read. Defaults to ' \
                       "#{DEFAULT_LIMIT}, e.g. 500."
  end

  # Omitted +offset+ / +limit+ fall back to 1 and DEFAULT_LIMIT.
  reader = lambda do |path:, offset: 1, limit: DEFAULT_LIMIT|
    Read.read(workspace: workspace, path: path, offset: offset, limit: limit)
  end

  super(name: 'read', description: DESCRIPTION, parameters: schema, execute: reader)
end
124
+
125
# Read a slice of a workspace file and return it as an observation.
#
# Checks run in this order, each producing an +"Error: ..."+ String
# on failure: +offset+ and +limit+ must be at least 1; the path must
# resolve inside the workspace; the target must exist; it must not
# be a directory; its leading bytes must not look binary. Only then
# is the numbered slice rendered.
#
# @param workspace [Tool::Workspace]
# @param path [String] raw path as supplied by the LLM
# @param offset [Integer] 1-indexed line number to start at
# @param limit [Integer] maximum lines to return
# @return [String] the formatted slice or an +"Error: ..."+ message
def self.read(workspace:, path:, offset:, limit:)
  return "Error: offset must be >= 1, got #{offset}" if offset < 1
  return "Error: limit must be >= 1, got #{limit}" if limit < 1

  target = workspace.resolve_for_read(path)

  if !target.exist?
    "Error: file not found: #{path}"
  elsif target.directory?
    "Error: #{path} is a directory; use the glob tool to list files."
  elsif binary?(read_sample(target))
    "Error: cannot read binary file: #{path}"
  else
    format_slice(path: path, resolved: target, offset: offset, limit: limit)
  end
rescue Tool::Workspace::Error => e
  "Error: #{e.message}"
rescue Errno::EACCES => e
  "Error: cannot read #{path}: #{e.message}"
end
151
+
152
# Fetch the leading bytes of a file for the {.binary?} sniff.
#
# Opens the file in binary mode and reads at most
# {BINARY_SAMPLE_BYTES} bytes. When the read yields nothing (an
# empty file), an empty String is returned instead of nil.
#
# @param resolved [Pathname]
# @return [String] the sampled bytes
def self.read_sample(resolved)
  resolved.open('rb') do |file|
    head = file.read(BINARY_SAMPLE_BYTES)
    head.nil? ? +'' : head
  end
end
161
+ private_class_method :read_sample
162
+
163
# Heuristic: does this byte sample look like binary content?
#
# * An empty sample is never binary.
# * Any NUL byte makes the sample binary outright.
# * Otherwise, control bytes are counted — values below 32 other than
#   9..13 (+\t \n \v \f \r+) — and the sample is binary when their
#   share of the sample exceeds {BINARY_NONPRINTABLE_THRESHOLD}.
#
# Bytes of 32 and above (including 127 and everything >= 128, which
# covers UTF-8 multibyte sequences) are never counted.
#
# Kept public: per the file's existing notes, {Tool::Edit} reuses this
# sniff to refuse binary targets.
#
# @param bytes [String] sample bytes
# @return [Boolean]
def self.binary?(bytes)
  return false if bytes.empty?
  return true if bytes.each_byte.any?(&:zero?)

  control_bytes = bytes.each_byte.count { |byte| byte < 32 && !byte.between?(9, 13) }
  control_bytes.fdiv(bytes.bytesize) > BINARY_NONPRINTABLE_THRESHOLD
end
188
+
189
# Render a window of the file in +cat -n+ style with a trailer line.
#
# Walks the file once, line by line. Lines before +offset+ are only
# counted. Up to +limit+ lines are then kept, each chomped and, if
# longer than {MAX_LINE_LENGTH} characters, cut and suffixed with
# {LINE_TRUNCATION_MARKER}. Once +limit+ lines are kept the walk
# continues purely to count lines, so the trailer can state the
# file's total. If adding a line would push the kept bytes (line
# bytes plus one per line) past {MAX_BYTES}, the walk stops
# immediately and the trailer reports the cap instead of a total.
#
# @param path [String] path as supplied by the caller (not used in
#   the rendering)
# @param resolved [Pathname] file to read
# @param offset [Integer] 1-indexed first line to show
# @param limit [Integer] maximum number of lines to show
# @return [String] numbered lines plus trailer, +"(Empty file)"+, or
#   an +"Error: ..."+ message when +offset+ is past the last line
def self.format_slice(path:, resolved:, offset:, limit:)
  skip = offset - 1
  kept = []
  seen = 0
  used_bytes = 0
  capped = false
  more = false

  resolved.each_line do |raw_line|
    seen += 1
    next unless seen > skip

    # Window already full: keep iterating only to count lines.
    unless kept.size < limit
      more = true
      next
    end

    text = raw_line.chomp
    text = text[0, MAX_LINE_LENGTH] + LINE_TRUNCATION_MARKER if text.length > MAX_LINE_LENGTH

    cost = text.bytesize + 1 # the extra byte is the joining newline
    if used_bytes + cost > MAX_BYTES
      capped = true
      more = true
      break
    end

    kept << text
    used_bytes += cost
  end

  return '(Empty file)' if seen.zero?
  return "Error: offset #{offset} is beyond end of file (#{seen} lines total)" if skip >= seen

  final = offset + kept.size - 1
  numbered = kept.map.with_index(offset) { |text, number| format("%6d\t%s", number, text) }
  listing = numbered.join("\n")

  trailer =
    if capped
      "(Output capped at #{MAX_BYTES_LABEL}. Showing lines #{offset}-#{final}. " \
        "Use offset=#{final + 1} to continue.)"
    elsif more
      "(Showing lines #{offset}-#{final} of #{seen}. " \
        "Use offset=#{final + 1} to continue.)"
    else
      "(End of file - total #{seen} lines)"
    end

  "#{listing}\n\n#{trailer}"
end
251
+ private_class_method :format_slice
252
+ end
253
+ end
254
+ end