pikuri-core 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/pikuri/agent.rb CHANGED
@@ -48,8 +48,8 @@ module Pikuri
48
48
  # and gets a fresh +step_limit+ at +max: 1+ (defensive — the
49
49
  # synth has no tools and shouldn't trip it). The synth's
50
50
  # answer becomes the value reported by
51
- # {#last_assistant_content}, so callers (notably
52
- # {Tool::SubAgent}) still get a usable reply.
51
+ # {#last_assistant_content}, so callers (notably the +agent+ tool
52
+ # from +pikuri-subagents+) still get a usable reply.
53
53
  #
54
54
  # == Cancellation rescue
55
55
  #
@@ -332,14 +332,16 @@ module Pikuri
332
332
  # set. Typically derived by +bin/pikuri-chat+ from its
333
333
  # configured +openai_api_base+; leave +nil+ when the
334
334
  # configured server is anything other than llama.cpp.
335
- # @param name [String] identifier for this agent. Empty for
336
- # the main agent; sub-agents get monotonic hierarchical
337
- # names like +"sub_agent 0"+, +"sub_agent 1"+,
338
- # +"sub_agent 0_0"+, ... generated by {Tool::SubAgent} from
339
- # the parent's name + a per-parent counter. Forwarded to
340
- # listeners through {ListenerList#for_sub_agent} so name-
341
- # aware ones (notably {Listener::TokenLog}) can tag their
342
- # output.
335
+ # @param id [String] unique identifier for this agent. Empty
336
+ # for the main agent; sub-agents get persona-rooted ids
337
+ # like +"researcher 0"+, +"researcher 1"+, +"file_miner 0"+, ...
338
+ # generated by the +agent+ tool from +pikuri-subagents+ from
339
+ # the persona name + a per-persona counter. Forwarded to
340
+ # listeners through {ListenerList#for_sub_agent} so id-aware
341
+ # ones (notably {Listener::TokenLog}) can tag their output.
342
+ # The word "id" is deliberate — "name" is reserved throughout
343
+ # the codebase for the persona-name load (the value the LLM
344
+ # picks in the +agent+ tool's +name:+ argument).
343
345
  # @param streaming [Boolean] opt into chunk-level streaming.
344
346
  # When +true+, {#run_loop} passes the block returned by
345
347
  # {.streaming_block} to +Chat#ask+, and ruby_llm requests
@@ -348,25 +350,24 @@ module Pikuri
348
350
  # the listener stream as they arrive. When +false+ (the
349
351
  # default), +Chat#ask+ runs in single-shot mode and only
350
352
  # the message-level {Event::Thinking} / {Event::Assistant}
351
- # bookends fire from +after_message+. Read by
352
- # {Tool::SubAgent} so spawned sub-agents inherit the same
353
- # mode without an extra kwarg.
353
+ # bookends fire from +after_message+. Read by the +agent+
354
+ # tool from +pikuri-subagents+ so spawned sub-agents inherit
355
+ # the same mode without an extra kwarg.
354
356
  # @yield [Configurator] yields a {Configurator} that collects
355
357
  # tools (via {Configurator#add_tool} / {Configurator#add_tools}),
356
358
  # listeners (via {Configurator#add_listener} /
357
359
  # {Configurator#add_listeners}), system-prompt snippets (via
358
360
  # {Configurator#append_system_prompt}), extension instances
359
361
  # (via {Configurator#add_extension} — which fires +configure+
360
- # immediately), close handlers (via {Configurator#on_close}),
361
- # and an optional +sub_agent+ tool (via
362
- # {Configurator#allow_sub_agent}). The Configurator is the
363
- # *only* path for adding any of these — there are no parallel
364
- # ctor kwargs. The block is optional; an agent constructed
365
- # without one has no tools, no listeners, no extensions.
362
+ # immediately), and close handlers (via
363
+ # {Configurator#on_close}). The Configurator is the *only*
364
+ # path for adding any of these — there are no parallel ctor
365
+ # kwargs. The block is optional; an agent constructed without
366
+ # one has no tools, no listeners, no extensions.
366
367
  # @return [Agent]
367
368
  def initialize(transport:, system_prompt:,
368
369
  step_limit: nil, cancellable: nil, interloper: nil,
369
- context_window: nil, llama_probe_url: nil, name: '',
370
+ context_window: nil, llama_probe_url: nil, id: '',
370
371
  streaming: false,
371
372
  &block)
372
373
  @transport = transport.model ? transport : transport.with(model: RubyLLM.config.default_model)
@@ -376,21 +377,19 @@ module Pikuri
376
377
  @system_prompt = system_prompt
377
378
  @step_limit = step_limit
378
379
  @interloper = interloper
379
- @name = name
380
+ @id = id
380
381
  @streaming = streaming
381
382
  @synth_answer = nil
382
383
  @on_close_handlers = []
383
384
 
384
385
  # Single Configurator funnel for everything the block adds —
385
- # tools, listeners, system-prompt snippets, extensions
386
- # (both newly-configured via #add_extension and inherited
387
- # via #inherit_extensions for sub-agents), on_close handlers,
388
- # and the sub-agent request. See IDEAS.md §"Extension protocol
389
- # design".
386
+ # tools, listeners, system-prompt snippets, extensions, and
387
+ # on_close handlers. See {Configurator} for the per-method
388
+ # contract.
390
389
  configurator = Configurator.new(
391
390
  transport: @transport,
392
391
  system_prompt_base: system_prompt,
393
- name: @name,
392
+ id: @id,
394
393
  streaming: @streaming,
395
394
  step_limit: @step_limit,
396
395
  cancellable: @cancellable,
@@ -400,6 +399,7 @@ module Pikuri
400
399
  block&.call(configurator)
401
400
 
402
401
  @tools = configurator.tools.dup
402
+ @sub_agent_tools = configurator.sub_agent_tools.dup
403
403
  @listeners = ListenerList.new(configurator.listeners)
404
404
  configurator.system_prompt_additions.each do |snippet|
405
405
  @system_prompt = "#{@system_prompt}\n\n#{snippet}"
@@ -430,25 +430,6 @@ module Pikuri
430
430
  # before any Tokens event arrives.
431
431
  @listeners.emit(Event::ContextCap.new(cap: @context_window_cap))
432
432
 
433
- # Sub-agent tool: constructed *after* @tools is final and
434
- # @context_window_cap is set, so its snapshot of the parent's
435
- # tool list doesn't include itself (recursion guard) and the
436
- # cap can be threaded through to spawned sub-agents. The new
437
- # +Tool::SubAgent+ instance is appended to both +@tools+ and
438
- # +@chat+, so sub-agents inheriting via the snapshot still
439
- # get the surrounding tool set but never the +sub_agent+ tool
440
- # itself. See {Configurator#allow_sub_agent}.
441
- if configurator.sub_agent_request
442
- if @tools.any?(Tool::SubAgent)
443
- raise 'Tool::SubAgent must not be added via c.add_tool when c.allow_sub_agent ' \
444
- 'is used; Agent auto-registers it from the Configurator request.'
445
- end
446
-
447
- sub_tool = Tool::SubAgent.new(self, max_steps: configurator.sub_agent_request.max_steps)
448
- @tools << sub_tool
449
- @chat.with_tool(sub_tool.to_ruby_llm_tool)
450
- end
451
-
452
433
  # Bind sweep — each extension gets its chance to install
453
434
  # per-agent state (dynamic tools via #internal_add_tool,
454
435
  # per-agent close hooks via #on_close, etc.) now that the
@@ -474,19 +455,30 @@ module Pikuri
474
455
  # agent was constructed with — same model id / provider /
475
456
  # assume-model-exists flag passed to every +RubyLLM.chat+
476
457
  # call originating from this agent (the main chat, the
477
- # synthesizer rescue, the sub-agent tool). Read by
478
- # {Tool::SubAgent} so spawned sub-agents reuse the same
479
- # transport.
458
+ # synthesizer rescue, the +agent+ tool from
459
+ # +pikuri-subagents+). Read by extensions that need to spawn
460
+ # their own ruby_llm calls (e.g. MCP description synthesis,
461
+ # sub-agent delegation).
480
462
  attr_reader :transport
481
463
 
482
464
  # @return [Array<Tool>] this agent's tool list in declaration
483
- # order. Snapshotted by {Tool::SubAgent} so spawned
484
- # sub-agents inherit the parent's tools (minus the
485
- # sub-agent tool itself, which {#allow_sub_agent} appends
486
- # to +@tools+ only after the snapshot has been taken —
487
- # recursion guard).
465
+ # order. Read by extensions that filter against it (notably
466
+ # the +agent+ tool from +pikuri-subagents+, which picks the
467
+ # sub-agent's toolset from the parent's instances so any
468
+ # already-bound workspace/confirmer wiring travels along).
469
+ # Tools listed here are also the ones registered with
470
+ # ruby_llm — the parent LLM can call any of them. Compare
471
+ # with {#sub_agent_tools}.
488
472
  attr_reader :tools
489
473
 
474
+ # @return [Array<Tool>] tools registered via
475
+ # {Configurator#add_sub_agent_tool}, in declaration order.
476
+ # Invisible to the parent LLM (never sent to ruby_llm);
477
+ # available only to sub-agents whose persona +tool_names+
478
+ # match. See {Configurator}'s "Two tool pools" header for
479
+ # the trifecta-defense rationale.
480
+ attr_reader :sub_agent_tools
481
+
490
482
  # @return [String] resolved model id from {#transport}.
491
483
  # Convenience delegator for callers that don't need the
492
484
  # full transport bundle.
@@ -496,12 +488,10 @@ module Pikuri
496
488
 
497
489
  # @return [String] system prompt actually sent to the chat —
498
490
  # equal to the constructor's +system_prompt:+ argument plus
499
- # any snippets appended by extensions during
500
- # {Configurator#append_system_prompt} (Skills'
501
- # +<available_skills>+, MCP's +<available_mcps>+, ...).
502
- # {Tool::SubAgent} forwards this already-augmented value to
503
- # spawned sub-agents so they see the same advertisements
504
- # without re-running extension configure.
491
+ # any snippets appended via {Configurator#append_system_prompt}
492
+ # (extensions' +<available_skills>+ / +<available_mcps>+ /
493
+ # +<available_agents>+, ...). Not inherited by sub-agents —
494
+ # each persona owns its own system prompt verbatim.
505
495
  attr_reader :system_prompt
506
496
 
507
497
  # @return [ListenerList] the listener list attached to this
@@ -510,54 +500,55 @@ module Pikuri
510
500
 
511
501
  # @return [Control::StepLimit, nil] the step-budget control
512
502
  # this agent was constructed with, or +nil+ when none.
513
- # Read by {Tool::SubAgent} so spawned sub-agents derive
514
- # their own.
515
503
  attr_reader :step_limit
516
504
 
517
505
  # @return [Control::Cancellable, nil] the cancellation
518
506
  # control this agent was constructed with, or +nil+ when
519
- # none. Read by {Tool::SubAgent} so spawned sub-agents
520
- # share the same instance.
507
+ # none. Read by extensions that propagate cancellation to
508
+ # their own LLM calls (e.g. the +agent+ tool from
509
+ # +pikuri-subagents+ shares it with spawned sub-agents so
510
+ # one Ctrl+C stops the tree).
521
511
  attr_reader :cancellable
522
512
 
523
513
  # @return [Control::Interloper, nil] the mid-loop user-input
524
514
  # control this agent was constructed with, or +nil+ when
525
- # none. Not propagated to sub-agents — see
526
- # {Control::Interloper#for_sub_agent}.
515
+ # none.
527
516
  attr_reader :interloper
528
517
 
529
- # @return [String] this agent's identifier — empty for the
530
- # main agent; for sub-agents, the hierarchical id assigned
531
- # by {Tool::SubAgent} (e.g. +"sub_agent 0"+,
532
- # +"sub_agent 1"+, +"sub_agent 0_0"+). Read by the
533
- # sub-agent tool so spawned sub-agents prefix their own
534
- # names with this one, and propagated to listeners via
535
- # {ListenerList#for_sub_agent} so name-aware ones can tag
536
- # output.
537
- attr_reader :name
518
+ # @return [String] this agent's unique identifier — empty for
519
+ # the main agent; for sub-agents, the persona-rooted id
520
+ # assigned by the +agent+ tool from +pikuri-subagents+ (e.g.
521
+ # +"researcher 0"+, +"researcher 1"+, +"file_miner 0"+).
522
+ # Propagated to listeners via {ListenerList#for_sub_agent(id:)}
523
+ # so id-aware ones can tag output. Distinct from the persona's
524
+ # +name+ (the value the LLM picks in the +agent+ tool's
525
+ # +name:+ argument).
526
+ attr_reader :id
538
527
 
539
528
  # @return [Boolean] +true+ when this agent opted into
540
529
  # chunk-level streaming (see the +streaming:+ kwarg on
541
- # {#initialize}); +false+ otherwise. Read by
542
- # {Tool::SubAgent} so spawned sub-agents inherit the same
543
- # mode.
530
+ # {#initialize}); +false+ otherwise. Read by extensions that
531
+ # spawn their own ruby_llm calls (notably the +agent+ tool
532
+ # from +pikuri-subagents+, so spawned sub-agents inherit the
533
+ # same mode).
544
534
  attr_reader :streaming
545
535
 
546
536
  # @return [Array<Extension>] extension instances bound to this
547
- # agent — added via {Configurator#add_extension} (new — runs
548
- # +configure+ now and binds later) or {Configurator#inherit_extensions}
549
- # (sub-agent inheritance skips +configure+, just binds), both
550
- # inside the +Agent.new+ block. Read by {Tool::SubAgent} so
551
- # spawned sub-agents inherit the parent's extension list and
552
- # re-bind them via the bind sweep.
537
+ # agent — added via {Configurator#add_extension} inside the
538
+ # +Agent.new+ block. Each instance's +configure+ runs during
539
+ # the block and its +bind+ runs at the end of
540
+ # {#initialize}, once per registration (so once per parent
541
+ # agent in the typical setup; sub-agents do not inherit
542
+ # extensions).
553
543
  attr_reader :extensions
554
544
 
555
545
  # @return [Integer, nil] context-window cap resolved by
556
546
  # {ContextWindowDetector} at construction time. +nil+ when
557
547
  # no source produced a value (custom local model with no
558
548
  # override and no reachable llama.cpp +/props+). Read by
559
- # {Tool::SubAgent} so spawned sub-agents inherit the same
560
- # cap without re-probing.
549
+ # extensions that spawn their own ruby_llm calls (notably
550
+ # the +agent+ tool from +pikuri-subagents+, so spawned
551
+ # sub-agents inherit the same cap without re-probing).
561
552
  attr_reader :context_window_cap
562
553
 
563
554
  # Final assistant message content for the most recent
@@ -629,14 +620,14 @@ module Pikuri
629
620
 
630
621
  # Synth runs under this agent's identity but on a fresh
631
622
  # chat with a different system prompt, so it gets a
632
- # distinct +_synthesizer+ suffix on the name — same +_+
623
+ # distinct +_synthesizer+ suffix on the id — same +_+
633
624
  # separator the sub-agent generator uses, so main becomes
634
- # +"synthesizer"+ and a sub-agent +"sub_agent 0"+ becomes
635
- # +"sub_agent 0_synthesizer"+. Any +TokenLog+ in the list
625
+ # +"synthesizer"+ and a sub-agent +"researcher 0"+ becomes
626
+ # +"researcher 0_synthesizer"+. Any +TokenLog+ in the list
636
627
  # tags the synth's prompt under that bracket so it's
637
628
  # obvious from the log which turns were the rescue rather
638
629
  # than the original loop.
639
- synth_name = @name.empty? ? 'synthesizer' : "#{@name}_synthesizer"
630
+ synth_id = @id.empty? ? 'synthesizer' : "#{@id}_synthesizer"
640
631
  synth_chat = RubyLLM.chat(**@transport.to_h)
641
632
  # Defensive step limit on the synth: the synth has no
642
633
  # tools so it should never trip +before_tool_call+, but
@@ -647,7 +638,7 @@ module Pikuri
647
638
  chat: synth_chat,
648
639
  parent_messages: @chat.messages,
649
640
  user_message: user_message,
650
- listeners: @listeners.for_sub_agent(name: synth_name),
641
+ listeners: @listeners.for_sub_agent(id: synth_id),
651
642
  step_limit: synth_step_limit,
652
643
  cancellable: @cancellable,
653
644
  streaming: @streaming
@@ -706,9 +697,10 @@ module Pikuri
706
697
  # +Pikuri::Tool+ entirely."
707
698
  #
708
699
  # The added tool does NOT enter +@tools+, only +@chat+'s tool
709
- # list. {Tool::SubAgent} therefore cannot snapshot it (which is
710
- # the whole point — activation is strictly per-agent, see
711
- # IDEAS.md §"Per-agent activation, no propagation").
700
+ # list. Sub-agents (the +agent+ tool from +pikuri-subagents+)
701
+ # therefore cannot snapshot it — which is the whole point:
702
+ # activation is strictly per-agent, see IDEAS.md §"Per-agent
703
+ # activation, no propagation".
712
704
  #
713
705
  # @param ruby_llm_tool [Class] subclass of +RubyLLM::Tool+
714
706
  # @return [void]
@@ -721,11 +713,11 @@ module Pikuri
721
713
  #
722
714
  # @example
723
715
  # agent.to_s
724
- # # => "Agent(model=qwen3-35b, tools=4, listeners=[Terminal])"
716
+ # # => "Agent(id=, model=qwen3-35b, tools=4, listeners=[Terminal])"
725
717
  #
726
718
  # @return [String]
727
719
  def to_s
728
- "Agent(model=#{model}, tools=#{@tools.size}, listeners=#{@listeners})"
720
+ "Agent(id=#{@id}, model=#{model}, tools=#{@tools.size}, listeners=#{@listeners})"
729
721
  end
730
722
  end
731
723
  end
@@ -0,0 +1,237 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pdf-reader'
4
+
5
module Pikuri
  # Magic-byte content sniffing + text extraction, centralised. Three
  # responsibilities:
  #
  # * {.detect_mime} — recognise a file from its leading bytes. Returns
  #   a MIME String for formats pikuri knows how to handle specially
  #   (+application/pdf+, the four image formats), or +nil+ for
  #   "unrecognised — could be text, could be opaque binary; caller
  #   decides".
  # * {.binary?} — heuristic text-vs-binary classifier. Independent of
  #   {.detect_mime}: a file can be both recognised (e.g. PDF) *and*
  #   binary. {.detect_mime} tells you what the bytes are;
  #   {.binary?} tells you whether they're safe to render as text.
  # * {.read_as_text} — read a file and return its content as plain
  #   UTF-8 text. PDFs go through +pdf-reader+ page-by-page; plain
  #   text passes through; images / binaries / missing files raise.
  #   The pure-extraction shape consumers like +Pikuri::VectorDb+'s
  #   indexer want (no LLM-tool concerns — no paging, no line
  #   numbering, no byte caps; just bytes-in-text-out).
  #
  # {.detect_mime} and {.binary?} accept either a +String+ of bytes
  # (sample taken by the caller) or a +Pathname+ — when given a path,
  # the module opens the file in binary mode and reads {SAMPLE_BYTES}
  # for the sniff itself. The Pathname form is the convenience path;
  # the bytes form is for callers that already have the sample or are
  # calling both methods on the same file and want to avoid a second
  # open. A +String+ sample may carry any encoding tag: the sniffs
  # compare raw bytes, so a sample read in text mode (tagged UTF-8)
  # is handled the same as one read in binary mode. {.read_as_text}
  # takes a +Pathname+ only — there's no bytes-in shortcut because the
  # PDF case needs to seek the file.
  #
  # == Why a separate module
  #
  # Without this module, magic-byte tables and the binary heuristic
  # ended up scattered through whichever tool needed them — first PDF
  # in {Workspace::Read}, then images alongside it, then a copy of
  # {.binary?} reached for by {Workspace::Edit}. Collecting the
  # detection logic here lets {Read} focus on routing
  # (mime-to-formatter), {Edit} drop its cross-tool reach, and new
  # tools (a future +Workspace::Diff+, an attachment-aware web fetcher,
  # ...) share one set of magic-byte truths.
  #
  # == Deliberate non-goals
  #
  # * *Not a full MIME database.* The set grows when a pikuri tool
  #   needs a new format, not speculatively. Keeps the "audit in an
  #   evening" ceiling honest.
  # * *No path / extension fallback.* Extensions lie (a renamed
  #   +.png+ → opaque garbage); magic-byte detection on the actual
  #   content is the source of truth. Callers that need
  #   extension-based behaviour can layer it themselves.
  # * *No convenience predicates* like +image?+ / +pdf?+. Callers do
  #   +mime == 'application/pdf'+ or +mime&.start_with?('image/')+ —
  #   one extra character, zero added API surface.
  module FileType
    module_function

    # @return [Integer] recommended number of bytes to sample for
    #   {.detect_mime} and {.binary?}. Big enough to catch every
    #   prefix pikuri sniffs today (the largest is WebP's 12-byte
    #   container header) with comfortable slack; small enough that
    #   reading it off any reasonable filesystem is effectively free.
    SAMPLE_BYTES = 4096

    # @return [Float] fraction of the sample that may be non-printable
    #   before {.binary?} flags the bytes as binary. Matches opencode's
    #   threshold.
    BINARY_NONPRINTABLE_THRESHOLD = 0.30

    # @return [Hash{String => String}] magic-byte prefixes → MIME types
    #   for the image formats with flat (offset-zero, fixed-length)
    #   signatures. WebP isn't here — its signature is split across the
    #   RIFF container header — and is handled directly in
    #   {.detect_mime}. Keys are binary (ASCII-8BIT) Strings.
    IMAGE_MAGIC_BYTES = {
      "\x89PNG\r\n\x1a\n".b => 'image/png',
      "\xff\xd8\xff".b => 'image/jpeg',
      "GIF87a".b => 'image/gif',
      "GIF89a".b => 'image/gif'
    }.freeze

    # @return [String] PDF magic prefix. Every conformant PDF starts
    #   with this five-byte ASCII sequence per ISO 32000-1 §7.5.2.
    PDF_MAGIC = '%PDF-'

    # Recognise a file from its leading bytes. Returns the MIME type
    # as a String for formats pikuri handles specially, or +nil+ for
    # "unrecognised" — callers interpret +nil+ themselves (text,
    # opaque binary, ...).
    #
    # The comparison is byte-wise: the sample is normalised to a
    # binary String first (see {.sample_of}), so a sample tagged UTF-8
    # that contains bytes >= 0x80 is matched against the binary magic
    # prefixes instead of raising +Encoding::CompatibilityError+.
    #
    # @param input [String, Pathname] the bytes to inspect, or a
    #   +Pathname+ that this method opens in binary mode and reads up
    #   to {SAMPLE_BYTES} from. Caller is responsible for verifying the
    #   path exists; missing-file errors propagate as +Errno::ENOENT+.
    # @return [String, nil]
    # @raise [ArgumentError] if +input+ is neither a +String+ nor a
    #   +Pathname+.
    def detect_mime(input)
      bytes = sample_of(input)
      return 'application/pdf' if bytes.start_with?(PDF_MAGIC)

      _prefix, image_mime = IMAGE_MAGIC_BYTES.find { |prefix, _mime| bytes.start_with?(prefix) }
      return image_mime if image_mime

      # WebP: "RIFF" <4-byte little-endian size> "WEBP" — the size
      # field varies per file, so only bytes 0-3 and 8-11 are checked.
      return 'image/webp' if bytes.bytesize >= 12 &&
                             bytes.byteslice(0, 4) == 'RIFF'.b &&
                             bytes.byteslice(8, 4) == 'WEBP'.b

      nil
    end

    # Heuristic text-vs-binary classifier matching opencode's: any
    # +NUL+ byte forces +true+; otherwise count bytes below 9 or in
    # 14..31 (i.e. control bytes other than +\t \n \v \f \r+) and
    # ratio against the sample size. Bytes >= 0x80 — which covers
    # every byte of a multi-byte UTF-8 sequence — are never counted,
    # letting UTF-8 text read fine. An empty sample is treated as
    # not-binary (callers reading an empty file take the empty-text
    # path).
    #
    # @param input [String, Pathname] the bytes to inspect, or a
    #   +Pathname+ that this method opens in binary mode and reads up
    #   to {SAMPLE_BYTES} from. Caller is responsible for verifying
    #   the path exists.
    # @return [Boolean]
    # @raise [ArgumentError] if +input+ is neither a +String+ nor a
    #   +Pathname+.
    def binary?(input)
      bytes = sample_of(input)
      return false if bytes.empty?

      non_printable = 0
      bytes.each_byte do |b|
        return true if b.zero?

        non_printable += 1 if b < 9 || (b > 13 && b < 32)
      end
      non_printable.to_f / bytes.bytesize > BINARY_NONPRINTABLE_THRESHOLD
    end

    # Read +path+ and return its content as plain UTF-8 text. Two
    # extraction paths, picked by {.detect_mime}:
    #
    # * **PDF** — walked page-by-page via +pdf-reader+; each page's
    #   extracted text is stripped and pages are joined with a blank
    #   line. A scanned-image PDF (no extractable text) comes back as
    #   the empty String — a deliberate silent skip, callers detect by
    #   length if they care.
    # * **Plain text** — anything that {.detect_mime} doesn't
    #   recognise and that {.binary?} accepts. Read with UTF-8
    #   encoding; behaviour on non-UTF-8 bytes is whatever +File.read+
    #   does with +encoding: Encoding::UTF_8+ (which is "leave invalid
    #   bytes in, let downstream decide").
    #
    # Refusal cases — all raise rather than returning a sentinel
    # because the callers are internal pikuri code, not an LLM
    # tool. The LLM-facing +Workspace::Read+ does its own routing and
    # returns "Error: ..." observations; that's a separate concern.
    #
    # * Path doesn't exist → +Errno::ENOENT+.
    # * Path is a directory → +ArgumentError+.
    # * Image (PNG / JPEG / GIF / WebP per {.detect_mime}) →
    #   +ArgumentError+; images aren't text.
    # * Binary content (per {.binary?}) and not a recognised MIME →
    #   +ArgumentError+.
    # * Malformed PDF — +pdf-reader+'s
    #   +MalformedPDFError+ / +UnsupportedFeatureError+ /
    #   +InvalidPageError+ are re-raised as a +RuntimeError+ with the
    #   path included so callers don't need to know pdf-reader's
    #   exception hierarchy.
    #
    # @param path [Pathname] file to read.
    # @return [String] UTF-8 text. May be empty (empty text file, or
    #   scanned-image PDF).
    # @raise [ArgumentError] if +path+ isn't a +Pathname+, points at
    #   a directory, is an image, or is binary.
    # @raise [Errno::ENOENT] if +path+ doesn't exist.
    # @raise [RuntimeError] on a malformed / unsupported PDF.
    def read_as_text(path)
      raise ArgumentError, "expected Pathname, got #{path.class}" unless path.is_a?(Pathname)
      raise Errno::ENOENT, path.to_s unless path.exist?
      raise ArgumentError, "#{path} is a directory" if path.directory?

      # Sample once and feed the same bytes to both sniffs: one open
      # instead of two, and both verdicts describe the same bytes even
      # if the file changes underneath us.
      sample = sample_of(path)
      mime = detect_mime(sample)
      return read_pdf_text(path) if mime == 'application/pdf'
      raise ArgumentError, "#{path} is an image (#{mime}); cannot extract as text" if mime&.start_with?('image/')
      raise ArgumentError, "#{path} appears to be binary; cannot extract as text" if binary?(sample)

      path.read(encoding: Encoding::UTF_8)
    end

    # Walk a PDF page-by-page via +pdf-reader+, returning a single
    # String with non-empty page texts joined by blank lines. Catches
    # the three +PDF::Reader+ exceptions Workspace::Read also handles
    # and re-raises them as +RuntimeError+ with the path included.
    #
    # @param path [Pathname]
    # @return [String]
    # @raise [RuntimeError] on malformed / unsupported PDF.
    def read_pdf_text(path)
      pages = path.open('rb') do |io|
        ::PDF::Reader.new(io).pages.map { |p| p.text.strip }
      end
      pages.reject(&:empty?).join("\n\n")
    rescue ::PDF::Reader::MalformedPDFError,
           ::PDF::Reader::UnsupportedFeatureError,
           ::PDF::Reader::InvalidPageError => e
      raise "Cannot extract PDF text from #{path}: " \
            "#{e.class.name.split('::').last}: #{e.message}"
    end
    private_class_method :read_pdf_text

    # Coerce an +input+ argument into a binary bytes String for the
    # sniffs. +String+ inputs are returned as a binary-tagged copy
    # (the caller's String is neither mutated nor re-tagged);
    # +Pathname+ inputs are opened in binary mode and up to
    # {SAMPLE_BYTES} are read off the front. Empty files come back
    # as an empty String — {.binary?} treats that as not-binary and
    # {.detect_mime} returns +nil+ for it, which is what the
    # empty-text path wants.
    #
    # Normalising to binary matters because the image magic prefixes
    # contain bytes >= 0x80: +String#start_with?+ raises
    # +Encoding::CompatibilityError+ when both operands hold
    # non-ASCII bytes under different encodings.
    #
    # @param input [String, Pathname]
    # @return [String] raw bytes, always ASCII-8BIT encoded
    # @raise [ArgumentError] if +input+ is neither a +String+ nor a
    #   +Pathname+ — refuses to guess, since a bare String could be
    #   either a path or actual bytes.
    def sample_of(input)
      case input
      when String
        input.b
      when Pathname
        # IO#read(n) returns nil at EOF, i.e. for an empty file.
        input.open('rb') { |io| io.read(SAMPLE_BYTES) } || ''.b
      else
        raise ArgumentError, "expected String bytes or Pathname, got #{input.class}"
      end
    end
    private_class_method :sample_of
  end
end
@@ -72,10 +72,17 @@ module Pikuri
72
72
  # interpretation is wanted; +argv+ is passed to +exec+
73
73
  # directly, so no implicit shell expansion happens here.
74
74
  # @param chdir [String, Pathname] working directory
75
+ # @param env [Hash{String=>String}] extra environment variables to
76
+ # set in the child process. The child otherwise inherits the
77
+ # parent's full environment; entries in +env+ override or add to
78
+ # it. Default +{}+ (pure inheritance). Used by {Code::Bash} to
79
+ # thread {Pikuri::Workspace::Filesystem#env} (host git identity,
80
+ # etc.) into a bash subprocess whose sandbox would otherwise
81
+ # strip the host's config files.
75
82
  # @return [Subprocess] handle — call {#wait} to block for the
76
83
  # direct child to exit and read the captured output
77
- def self.spawn(*argv, chdir:)
78
- stdin, io, wait_thr = Open3.popen2e(*argv, chdir: chdir.to_s, pgroup: true)
84
+ def self.spawn(*argv, chdir:, env: {})
85
+ stdin, io, wait_thr = Open3.popen2e(env, *argv, chdir: chdir.to_s, pgroup: true)
79
86
  stdin.close
80
87
  register(wait_thr.pid)
81
88
  new(io: io, wait_thr: wait_thr)