pikuri-core 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +10 -0
- data/lib/pikuri/agent/chat_transport.rb +6 -5
- data/lib/pikuri/agent/configurator.rb +50 -75
- data/lib/pikuri/agent/control/cancellable.rb +7 -17
- data/lib/pikuri/agent/control/interloper.rb +10 -21
- data/lib/pikuri/agent/control/step_limit.rb +0 -14
- data/lib/pikuri/agent/extension.rb +12 -14
- data/lib/pikuri/agent/listener/token_log.rb +20 -21
- data/lib/pikuri/agent/listener_list.rb +7 -5
- data/lib/pikuri/agent/synthesizer.rb +2 -2
- data/lib/pikuri/agent.rb +88 -96
- data/lib/pikuri/file_type.rb +237 -0
- data/lib/pikuri/subprocess.rb +9 -2
- data/lib/pikuri/tool/parameters.rb +64 -3
- data/lib/pikuri/tool.rb +15 -7
- data/lib/pikuri/version.rb +1 -1
- metadata +3 -3
- data/lib/pikuri/tool/sub_agent.rb +0 -150
data/lib/pikuri/agent.rb
CHANGED
|
@@ -48,8 +48,8 @@ module Pikuri
|
|
|
48
48
|
# and gets a fresh +step_limit+ at +max: 1+ (defensive — the
|
|
49
49
|
# synth has no tools and shouldn't trip it). The synth's
|
|
50
50
|
# answer becomes the value reported by
|
|
51
|
-
# {#last_assistant_content}, so callers (notably
|
|
52
|
-
#
|
|
51
|
+
# {#last_assistant_content}, so callers (notably the +agent+ tool
|
|
52
|
+
# from +pikuri-subagents+) still get a usable reply.
|
|
53
53
|
#
|
|
54
54
|
# == Cancellation rescue
|
|
55
55
|
#
|
|
@@ -332,14 +332,16 @@ module Pikuri
|
|
|
332
332
|
# set. Typically derived by +bin/pikuri-chat+ from its
|
|
333
333
|
# configured +openai_api_base+; leave +nil+ when the
|
|
334
334
|
# configured server is anything other than llama.cpp.
|
|
335
|
-
# @param
|
|
336
|
-
# the main agent; sub-agents get
|
|
337
|
-
#
|
|
338
|
-
#
|
|
339
|
-
# the
|
|
340
|
-
# listeners through {ListenerList#for_sub_agent} so
|
|
341
|
-
#
|
|
342
|
-
#
|
|
335
|
+
# @param id [String] unique identifier for this agent. Empty
|
|
336
|
+
# for the main agent; sub-agents get persona-rooted ids
|
|
337
|
+
# like +"researcher 0"+, +"researcher 1"+, +"file_miner 0"+, ...
|
|
338
|
+
# generated by the +agent+ tool from +pikuri-subagents+ from
|
|
339
|
+
# the persona name + a per-persona counter. Forwarded to
|
|
340
|
+
# listeners through {ListenerList#for_sub_agent} so id-aware
|
|
341
|
+
# ones (notably {Listener::TokenLog}) can tag their output.
|
|
342
|
+
# The word "id" is deliberate — "name" is reserved throughout
|
|
343
|
+
# the codebase for the persona-name load (the value the LLM
|
|
344
|
+
# picks in the +agent+ tool's +name:+ argument).
|
|
343
345
|
# @param streaming [Boolean] opt into chunk-level streaming.
|
|
344
346
|
# When +true+, {#run_loop} passes the block returned by
|
|
345
347
|
# {.streaming_block} to +Chat#ask+, and ruby_llm requests
|
|
@@ -348,25 +350,24 @@ module Pikuri
|
|
|
348
350
|
# the listener stream as they arrive. When +false+ (the
|
|
349
351
|
# default), +Chat#ask+ runs in single-shot mode and only
|
|
350
352
|
# the message-level {Event::Thinking} / {Event::Assistant}
|
|
351
|
-
# bookends fire from +after_message+. Read by
|
|
352
|
-
#
|
|
353
|
-
# mode without an extra kwarg.
|
|
353
|
+
# bookends fire from +after_message+. Read by the +agent+
|
|
354
|
+
# tool from +pikuri-subagents+ so spawned sub-agents inherit
|
|
355
|
+
# the same mode without an extra kwarg.
|
|
354
356
|
# @yield [Configurator] yields a {Configurator} that collects
|
|
355
357
|
# tools (via {Configurator#add_tool} / {Configurator#add_tools}),
|
|
356
358
|
# listeners (via {Configurator#add_listener} /
|
|
357
359
|
# {Configurator#add_listeners}), system-prompt snippets (via
|
|
358
360
|
# {Configurator#append_system_prompt}), extension instances
|
|
359
361
|
# (via {Configurator#add_extension} — which fires +configure+
|
|
360
|
-
# immediately), close handlers (via
|
|
361
|
-
#
|
|
362
|
-
#
|
|
363
|
-
#
|
|
364
|
-
#
|
|
365
|
-
# without one has no tools, no listeners, no extensions.
|
|
362
|
+
# immediately), and close handlers (via
|
|
363
|
+
# {Configurator#on_close}). The Configurator is the *only*
|
|
364
|
+
# path for adding any of these — there are no parallel ctor
|
|
365
|
+
# kwargs. The block is optional; an agent constructed without
|
|
366
|
+
# one has no tools, no listeners, no extensions.
|
|
366
367
|
# @return [Agent]
|
|
367
368
|
def initialize(transport:, system_prompt:,
|
|
368
369
|
step_limit: nil, cancellable: nil, interloper: nil,
|
|
369
|
-
context_window: nil, llama_probe_url: nil,
|
|
370
|
+
context_window: nil, llama_probe_url: nil, id: '',
|
|
370
371
|
streaming: false,
|
|
371
372
|
&block)
|
|
372
373
|
@transport = transport.model ? transport : transport.with(model: RubyLLM.config.default_model)
|
|
@@ -376,21 +377,19 @@ module Pikuri
|
|
|
376
377
|
@system_prompt = system_prompt
|
|
377
378
|
@step_limit = step_limit
|
|
378
379
|
@interloper = interloper
|
|
379
|
-
@
|
|
380
|
+
@id = id
|
|
380
381
|
@streaming = streaming
|
|
381
382
|
@synth_answer = nil
|
|
382
383
|
@on_close_handlers = []
|
|
383
384
|
|
|
384
385
|
# Single Configurator funnel for everything the block adds —
|
|
385
|
-
# tools, listeners, system-prompt snippets, extensions
|
|
386
|
-
#
|
|
387
|
-
#
|
|
388
|
-
# and the sub-agent request. See IDEAS.md §"Extension protocol
|
|
389
|
-
# design".
|
|
386
|
+
# tools, listeners, system-prompt snippets, extensions, and
|
|
387
|
+
# on_close handlers. See {Configurator} for the per-method
|
|
388
|
+
# contract.
|
|
390
389
|
configurator = Configurator.new(
|
|
391
390
|
transport: @transport,
|
|
392
391
|
system_prompt_base: system_prompt,
|
|
393
|
-
|
|
392
|
+
id: @id,
|
|
394
393
|
streaming: @streaming,
|
|
395
394
|
step_limit: @step_limit,
|
|
396
395
|
cancellable: @cancellable,
|
|
@@ -400,6 +399,7 @@ module Pikuri
|
|
|
400
399
|
block&.call(configurator)
|
|
401
400
|
|
|
402
401
|
@tools = configurator.tools.dup
|
|
402
|
+
@sub_agent_tools = configurator.sub_agent_tools.dup
|
|
403
403
|
@listeners = ListenerList.new(configurator.listeners)
|
|
404
404
|
configurator.system_prompt_additions.each do |snippet|
|
|
405
405
|
@system_prompt = "#{@system_prompt}\n\n#{snippet}"
|
|
@@ -430,25 +430,6 @@ module Pikuri
|
|
|
430
430
|
# before any Tokens event arrives.
|
|
431
431
|
@listeners.emit(Event::ContextCap.new(cap: @context_window_cap))
|
|
432
432
|
|
|
433
|
-
# Sub-agent tool: constructed *after* @tools is final and
|
|
434
|
-
# @context_window_cap is set, so its snapshot of the parent's
|
|
435
|
-
# tool list doesn't include itself (recursion guard) and the
|
|
436
|
-
# cap can be threaded through to spawned sub-agents. The new
|
|
437
|
-
# +Tool::SubAgent+ instance is appended to both +@tools+ and
|
|
438
|
-
# +@chat+, so sub-agents inheriting via the snapshot still
|
|
439
|
-
# get the surrounding tool set but never the +sub_agent+ tool
|
|
440
|
-
# itself. See {Configurator#allow_sub_agent}.
|
|
441
|
-
if configurator.sub_agent_request
|
|
442
|
-
if @tools.any?(Tool::SubAgent)
|
|
443
|
-
raise 'Tool::SubAgent must not be added via c.add_tool when c.allow_sub_agent ' \
|
|
444
|
-
'is used; Agent auto-registers it from the Configurator request.'
|
|
445
|
-
end
|
|
446
|
-
|
|
447
|
-
sub_tool = Tool::SubAgent.new(self, max_steps: configurator.sub_agent_request.max_steps)
|
|
448
|
-
@tools << sub_tool
|
|
449
|
-
@chat.with_tool(sub_tool.to_ruby_llm_tool)
|
|
450
|
-
end
|
|
451
|
-
|
|
452
433
|
# Bind sweep — each extension gets its chance to install
|
|
453
434
|
# per-agent state (dynamic tools via #internal_add_tool,
|
|
454
435
|
# per-agent close hooks via #on_close, etc.) now that the
|
|
@@ -474,19 +455,30 @@ module Pikuri
|
|
|
474
455
|
# agent was constructed with — same model id / provider /
|
|
475
456
|
# assume-model-exists flag passed to every +RubyLLM.chat+
|
|
476
457
|
# call originating from this agent (the main chat, the
|
|
477
|
-
# synthesizer rescue, the
|
|
478
|
-
#
|
|
479
|
-
#
|
|
458
|
+
# synthesizer rescue, the +agent+ tool from
|
|
459
|
+
# +pikuri-subagents+). Read by extensions that need to spawn
|
|
460
|
+
# their own ruby_llm calls (e.g. MCP description synthesis,
|
|
461
|
+
# sub-agent delegation).
|
|
480
462
|
attr_reader :transport
|
|
481
463
|
|
|
482
464
|
# @return [Array<Tool>] this agent's tool list in declaration
|
|
483
|
-
# order.
|
|
484
|
-
#
|
|
485
|
-
# sub-agent
|
|
486
|
-
#
|
|
487
|
-
#
|
|
465
|
+
# order. Read by extensions that filter against it (notably
|
|
466
|
+
# the +agent+ tool from +pikuri-subagents+, which picks the
|
|
467
|
+
# sub-agent's toolset from the parent's instances so any
|
|
468
|
+
# already-bound workspace/confirmer wiring travels along).
|
|
469
|
+
# Tools listed here are also the ones registered with
|
|
470
|
+
# ruby_llm — the parent LLM can call any of them. Compare
|
|
471
|
+
# with {#sub_agent_tools}.
|
|
488
472
|
attr_reader :tools
|
|
489
473
|
|
|
474
|
+
# @return [Array<Tool>] tools registered via
|
|
475
|
+
# {Configurator#add_sub_agent_tool}, in declaration order.
|
|
476
|
+
# Invisible to the parent LLM (never sent to ruby_llm);
|
|
477
|
+
# available only to sub-agents whose persona +tool_names+
|
|
478
|
+
# match. See {Configurator}'s "Two tool pools" header for
|
|
479
|
+
# the trifecta-defense rationale.
|
|
480
|
+
attr_reader :sub_agent_tools
|
|
481
|
+
|
|
490
482
|
# @return [String] resolved model id from {#transport}.
|
|
491
483
|
# Convenience delegator for callers that don't need the
|
|
492
484
|
# full transport bundle.
|
|
@@ -496,12 +488,10 @@ module Pikuri
|
|
|
496
488
|
|
|
497
489
|
# @return [String] system prompt actually sent to the chat —
|
|
498
490
|
# equal to the constructor's +system_prompt:+ argument plus
|
|
499
|
-
# any snippets appended
|
|
500
|
-
#
|
|
501
|
-
# +<
|
|
502
|
-
#
|
|
503
|
-
# spawned sub-agents so they see the same advertisements
|
|
504
|
-
# without re-running extension configure.
|
|
491
|
+
# any snippets appended via {Configurator#append_system_prompt}
|
|
492
|
+
# (extensions' +<available_skills>+ / +<available_mcps>+ /
|
|
493
|
+
# +<available_agents>+, ...). Not inherited by sub-agents —
|
|
494
|
+
# each persona owns its own system prompt verbatim.
|
|
505
495
|
attr_reader :system_prompt
|
|
506
496
|
|
|
507
497
|
# @return [ListenerList] the listener list attached to this
|
|
@@ -510,54 +500,55 @@ module Pikuri
|
|
|
510
500
|
|
|
511
501
|
# @return [Control::StepLimit, nil] the step-budget control
|
|
512
502
|
# this agent was constructed with, or +nil+ when none.
|
|
513
|
-
# Read by {Tool::SubAgent} so spawned sub-agents derive
|
|
514
|
-
# their own.
|
|
515
503
|
attr_reader :step_limit
|
|
516
504
|
|
|
517
505
|
# @return [Control::Cancellable, nil] the cancellation
|
|
518
506
|
# control this agent was constructed with, or +nil+ when
|
|
519
|
-
# none. Read by
|
|
520
|
-
#
|
|
507
|
+
# none. Read by extensions that propagate cancellation to
|
|
508
|
+
# their own LLM calls (e.g. the +agent+ tool from
|
|
509
|
+
# +pikuri-subagents+ shares it with spawned sub-agents so
|
|
510
|
+
# one Ctrl+C stops the tree).
|
|
521
511
|
attr_reader :cancellable
|
|
522
512
|
|
|
523
513
|
# @return [Control::Interloper, nil] the mid-loop user-input
|
|
524
514
|
# control this agent was constructed with, or +nil+ when
|
|
525
|
-
# none.
|
|
526
|
-
# {Control::Interloper#for_sub_agent}.
|
|
515
|
+
# none.
|
|
527
516
|
attr_reader :interloper
|
|
528
517
|
|
|
529
|
-
# @return [String] this agent's identifier — empty for
|
|
530
|
-
# main agent; for sub-agents, the
|
|
531
|
-
# by
|
|
532
|
-
# +"
|
|
533
|
-
#
|
|
534
|
-
#
|
|
535
|
-
#
|
|
536
|
-
#
|
|
537
|
-
attr_reader :
|
|
518
|
+
# @return [String] this agent's unique identifier — empty for
|
|
519
|
+
# the main agent; for sub-agents, the persona-rooted id
|
|
520
|
+
# assigned by the +agent+ tool from +pikuri-subagents+ (e.g.
|
|
521
|
+
# +"researcher 0"+, +"researcher 1"+, +"file_miner 0"+).
|
|
522
|
+
# Propagated to listeners via {ListenerList#for_sub_agent} (+id:+ kwarg)
|
|
523
|
+
# so id-aware ones can tag output. Distinct from the persona's
|
|
524
|
+
# +name+ (the value the LLM picks in the +agent+ tool's
|
|
525
|
+
# +name:+ argument).
|
|
526
|
+
attr_reader :id
|
|
538
527
|
|
|
539
528
|
# @return [Boolean] +true+ when this agent opted into
|
|
540
529
|
# chunk-level streaming (see the +streaming:+ kwarg on
|
|
541
|
-
# {#initialize}); +false+ otherwise. Read by
|
|
542
|
-
#
|
|
543
|
-
#
|
|
530
|
+
# {#initialize}); +false+ otherwise. Read by extensions that
|
|
531
|
+
# spawn their own ruby_llm calls (notably the +agent+ tool
|
|
532
|
+
# from +pikuri-subagents+, so spawned sub-agents inherit the
|
|
533
|
+
# same mode).
|
|
544
534
|
attr_reader :streaming
|
|
545
535
|
|
|
546
536
|
# @return [Array<Extension>] extension instances bound to this
|
|
547
|
-
# agent — added via {Configurator#add_extension}
|
|
548
|
-
# +
|
|
549
|
-
#
|
|
550
|
-
#
|
|
551
|
-
#
|
|
552
|
-
#
|
|
537
|
+
# agent — added via {Configurator#add_extension} inside the
|
|
538
|
+
# +Agent.new+ block. Each instance's +configure+ runs during
|
|
539
|
+
# the block and its +bind+ runs at the end of
|
|
540
|
+
# {#initialize}, once per registration (so once per parent
|
|
541
|
+
# agent in the typical setup; sub-agents do not inherit
|
|
542
|
+
# extensions).
|
|
553
543
|
attr_reader :extensions
|
|
554
544
|
|
|
555
545
|
# @return [Integer, nil] context-window cap resolved by
|
|
556
546
|
# {ContextWindowDetector} at construction time. +nil+ when
|
|
557
547
|
# no source produced a value (custom local model with no
|
|
558
548
|
# override and no reachable llama.cpp +/props+). Read by
|
|
559
|
-
#
|
|
560
|
-
#
|
|
549
|
+
# extensions that spawn their own ruby_llm calls (notably
|
|
550
|
+
# the +agent+ tool from +pikuri-subagents+, so spawned
|
|
551
|
+
# sub-agents inherit the same cap without re-probing).
|
|
561
552
|
attr_reader :context_window_cap
|
|
562
553
|
|
|
563
554
|
# Final assistant message content for the most recent
|
|
@@ -629,14 +620,14 @@ module Pikuri
|
|
|
629
620
|
|
|
630
621
|
# Synth runs under this agent's identity but on a fresh
|
|
631
622
|
# chat with a different system prompt, so it gets a
|
|
632
|
-
# distinct +_synthesizer+ suffix on the
|
|
623
|
+
# distinct +_synthesizer+ suffix on the id — joined with a +_+
|
|
633
624
|
# separator when the id is non-empty, so main becomes
|
|
634
|
-
# +"synthesizer"+ and a sub-agent +"
|
|
635
|
-
# +"
|
|
625
|
+
# +"synthesizer"+ and a sub-agent +"researcher 0"+ becomes
|
|
626
|
+
# +"researcher 0_synthesizer"+. Any +TokenLog+ in the list
|
|
636
627
|
# tags the synth's prompt under that bracket so it's
|
|
637
628
|
# obvious from the log which turns were the rescue rather
|
|
638
629
|
# than the original loop.
|
|
639
|
-
|
|
630
|
+
synth_id = @id.empty? ? 'synthesizer' : "#{@id}_synthesizer"
|
|
640
631
|
synth_chat = RubyLLM.chat(**@transport.to_h)
|
|
641
632
|
# Defensive step limit on the synth: the synth has no
|
|
642
633
|
# tools so it should never trip +before_tool_call+, but
|
|
@@ -647,7 +638,7 @@ module Pikuri
|
|
|
647
638
|
chat: synth_chat,
|
|
648
639
|
parent_messages: @chat.messages,
|
|
649
640
|
user_message: user_message,
|
|
650
|
-
listeners: @listeners.for_sub_agent(
|
|
641
|
+
listeners: @listeners.for_sub_agent(id: synth_id),
|
|
651
642
|
step_limit: synth_step_limit,
|
|
652
643
|
cancellable: @cancellable,
|
|
653
644
|
streaming: @streaming
|
|
@@ -706,9 +697,10 @@ module Pikuri
|
|
|
706
697
|
# +Pikuri::Tool+ entirely."
|
|
707
698
|
#
|
|
708
699
|
# The added tool does NOT enter +@tools+, only +@chat+'s tool
|
|
709
|
-
# list.
|
|
710
|
-
#
|
|
711
|
-
# IDEAS.md §"Per-agent
|
|
700
|
+
# list. Sub-agents (the +agent+ tool from +pikuri-subagents+)
|
|
701
|
+
# therefore cannot snapshot it — which is the whole point:
|
|
702
|
+
# activation is strictly per-agent, see IDEAS.md §"Per-agent
|
|
703
|
+
# activation, no propagation".
|
|
712
704
|
#
|
|
713
705
|
# @param ruby_llm_tool [Class] subclass of +RubyLLM::Tool+
|
|
714
706
|
# @return [void]
|
|
@@ -721,11 +713,11 @@ module Pikuri
|
|
|
721
713
|
#
|
|
722
714
|
# @example
|
|
723
715
|
# agent.to_s
|
|
724
|
-
# # => "Agent(model=qwen3-35b, tools=4, listeners=[Terminal])"
|
|
716
|
+
# # => "Agent(id=, model=qwen3-35b, tools=4, listeners=[Terminal])"
|
|
725
717
|
#
|
|
726
718
|
# @return [String]
|
|
727
719
|
def to_s
|
|
728
|
-
"Agent(model=#{model}, tools=#{@tools.size}, listeners=#{@listeners})"
|
|
720
|
+
"Agent(id=#{@id}, model=#{model}, tools=#{@tools.size}, listeners=#{@listeners})"
|
|
729
721
|
end
|
|
730
722
|
end
|
|
731
723
|
end
|
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'pdf-reader'
|
|
4
|
+
|
|
5
|
+
module Pikuri
|
|
6
|
+
# Magic-byte content sniffing + text extraction, centralised. Three
|
|
7
|
+
# responsibilities:
|
|
8
|
+
#
|
|
9
|
+
# * {.detect_mime} — recognise a file from its leading bytes. Returns
|
|
10
|
+
# a MIME String for formats pikuri knows how to handle specially
|
|
11
|
+
# (+application/pdf+, the four image formats), or +nil+ for
|
|
12
|
+
# "unrecognised — could be text, could be opaque binary; caller
|
|
13
|
+
# decides".
|
|
14
|
+
# * {.binary?} — heuristic text-vs-binary classifier. Independent of
|
|
15
|
+
# {.detect_mime}: a file can be both recognised (e.g. PDF) *and*
|
|
16
|
+
# binary. {.detect_mime} tells you what the bytes are;
|
|
17
|
+
# {.binary?} tells you whether they're safe to render as text.
|
|
18
|
+
# * {.read_as_text} — read a file and return its content as plain
|
|
19
|
+
# UTF-8 text. PDFs go through +pdf-reader+ page-by-page; plain
|
|
20
|
+
# text passes through; images / binaries / missing files raise.
|
|
21
|
+
# The pure-extraction shape consumers like +Pikuri::VectorDb+'s
|
|
22
|
+
# indexer want (no LLM-tool concerns — no paging, no line
|
|
23
|
+
# numbering, no byte caps; just bytes-in-text-out).
|
|
24
|
+
#
|
|
25
|
+
# {.detect_mime} and {.binary?} accept either a +String+ of bytes
|
|
26
|
+
# (sample taken by the caller) or a +Pathname+ — when given a path,
|
|
27
|
+
# the module opens the file in binary mode and reads {SAMPLE_BYTES}
|
|
28
|
+
# for the sniff itself. The Pathname form is the convenience path;
|
|
29
|
+
# the bytes form is for callers that already have the sample or are
|
|
30
|
+
# calling both methods on the same file and want to avoid a second
|
|
31
|
+
# open. {.read_as_text} takes a +Pathname+ only — there's no
|
|
32
|
+
# bytes-in shortcut because the PDF case needs to seek the file.
|
|
33
|
+
#
|
|
34
|
+
# == Why a separate module
|
|
35
|
+
#
|
|
36
|
+
# Without this module, magic-byte tables and the binary heuristic
|
|
37
|
+
# ended up scattered through whichever tool needed them — first PDF
|
|
38
|
+
# in {Workspace::Read}, then images alongside it, then a copy of
|
|
39
|
+
# {.binary?} reached for by {Workspace::Edit}. Collecting the
|
|
40
|
+
# detection logic here lets {Read} focus on routing
|
|
41
|
+
# (mime-to-formatter), {Edit} drop its cross-tool reach, and new
|
|
42
|
+
# tools (a future +Workspace::Diff+, an attachment-aware web fetcher,
|
|
43
|
+
# ...) share one set of magic-byte truths.
|
|
44
|
+
#
|
|
45
|
+
# == Deliberate non-goals
|
|
46
|
+
#
|
|
47
|
+
# * *Not a full MIME database.* The set grows when a pikuri tool
|
|
48
|
+
# needs a new format, not speculatively. Keeps the "audit in an
|
|
49
|
+
# evening" ceiling honest.
|
|
50
|
+
# * *No path / extension fallback.* Extensions lie (a renamed
|
|
51
|
+
# +.png+ → opaque garbage); magic-byte detection on the actual
|
|
52
|
+
# content is the source of truth. Callers that need
|
|
53
|
+
# extension-based behaviour can layer it themselves.
|
|
54
|
+
# * *No convenience predicates* like +image?+ / +pdf?+. Callers do
|
|
55
|
+
# +mime == 'application/pdf'+ or +mime&.start_with?('image/')+ —
|
|
56
|
+
# one extra character, zero added API surface.
|
|
57
|
+
module FileType
|
|
58
|
+
module_function
|
|
59
|
+
|
|
60
|
+
# @return [Integer] recommended number of bytes to sample for
|
|
61
|
+
# {.detect_mime} and {.binary?}. Big enough to catch every
|
|
62
|
+
# prefix pikuri sniffs today (the largest is WebP's 12-byte
|
|
63
|
+
# container header) with comfortable slack; small enough that
|
|
64
|
+
# reading it off any reasonable filesystem is effectively free.
|
|
65
|
+
SAMPLE_BYTES = 4096
|
|
66
|
+
|
|
67
|
+
# @return [Float] fraction of the sample that may be non-printable
|
|
68
|
+
# before {.binary?} flags the bytes as binary. Matches opencode's
|
|
69
|
+
# threshold.
|
|
70
|
+
BINARY_NONPRINTABLE_THRESHOLD = 0.30
|
|
71
|
+
|
|
72
|
+
# @return [Hash{String => String}] magic-byte prefixes → MIME types
|
|
73
|
+
# for the image formats with flat (offset-zero, fixed-length)
|
|
74
|
+
# signatures. WebP isn't here — its signature is split across the
|
|
75
|
+
# RIFF container header — and is handled directly in
|
|
76
|
+
# {.detect_mime}.
|
|
77
|
+
IMAGE_MAGIC_BYTES = {
|
|
78
|
+
"\x89PNG\r\n\x1a\n".b => 'image/png',
|
|
79
|
+
"\xff\xd8\xff".b => 'image/jpeg',
|
|
80
|
+
"GIF87a".b => 'image/gif',
|
|
81
|
+
"GIF89a".b => 'image/gif'
|
|
82
|
+
}.freeze
|
|
83
|
+
|
|
84
|
+
# @return [String] PDF magic prefix. Every conformant PDF starts
|
|
85
|
+
# with this five-byte ASCII sequence per ISO 32000-1 §7.5.2.
|
|
86
|
+
PDF_MAGIC = '%PDF-'
|
|
87
|
+
|
|
88
|
+
# Recognise a file from its leading bytes. Returns the MIME type
|
|
89
|
+
# as a String for formats pikuri handles specially, or +nil+ for
|
|
90
|
+
# "unrecognised" — callers interpret +nil+ themselves (text,
|
|
91
|
+
# opaque binary, ...).
|
|
92
|
+
#
|
|
93
|
+
# @param input [String, Pathname] the bytes to inspect, or a
|
|
94
|
+
# +Pathname+ that this method opens in binary mode and reads up
|
|
95
|
+
# to {SAMPLE_BYTES} from. Caller is responsible for verifying the
|
|
96
|
+
# path exists; missing-file errors propagate as +Errno::ENOENT+.
|
|
97
|
+
# @return [String, nil]
|
|
98
|
+
def detect_mime(input)
|
|
99
|
+
bytes = sample_of(input)
|
|
100
|
+
return 'application/pdf' if bytes.start_with?(PDF_MAGIC)
|
|
101
|
+
|
|
102
|
+
IMAGE_MAGIC_BYTES.each do |prefix, mime|
|
|
103
|
+
return mime if bytes.start_with?(prefix)
|
|
104
|
+
end
|
|
105
|
+
return 'image/webp' if bytes.bytesize >= 12 &&
|
|
106
|
+
bytes.byteslice(0, 4) == 'RIFF'.b &&
|
|
107
|
+
bytes.byteslice(8, 4) == 'WEBP'.b
|
|
108
|
+
|
|
109
|
+
nil
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
# Heuristic text-vs-binary classifier matching opencode's: any
|
|
113
|
+
# +NUL+ byte forces +true+; otherwise count control bytes below
|
|
114
|
+
# 32 other than +\t \n \v \f \r+ (127 and up never count) and ratio
|
|
115
|
+
# against the sample size. UTF-8 continuation bytes (0x80-0xBF)
|
|
116
|
+
# are >127 so they sit outside the non-printable ranges and pass
|
|
117
|
+
# through unflagged, letting UTF-8 text read fine. An empty
|
|
118
|
+
# sample is treated as not-binary (callers reading an empty file
|
|
119
|
+
# take the empty-text path).
|
|
120
|
+
#
|
|
121
|
+
# @param input [String, Pathname] the bytes to inspect, or a
|
|
122
|
+
# +Pathname+ that this method opens in binary mode and reads up
|
|
123
|
+
# to {SAMPLE_BYTES} from. Caller is responsible for verifying
|
|
124
|
+
# the path exists.
|
|
125
|
+
# @return [Boolean]
|
|
126
|
+
def binary?(input)
|
|
127
|
+
bytes = sample_of(input)
|
|
128
|
+
return false if bytes.empty?
|
|
129
|
+
|
|
130
|
+
non_printable = 0
|
|
131
|
+
bytes.each_byte do |b|
|
|
132
|
+
return true if b.zero?
|
|
133
|
+
|
|
134
|
+
non_printable += 1 if b < 9 || (b > 13 && b < 32)
|
|
135
|
+
end
|
|
136
|
+
non_printable.to_f / bytes.bytesize > BINARY_NONPRINTABLE_THRESHOLD
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
# Read +path+ and return its content as plain UTF-8 text. Two
|
|
140
|
+
# extraction paths, picked by {.detect_mime}:
|
|
141
|
+
#
|
|
142
|
+
# * **PDF** — walked page-by-page via +pdf-reader+; each page's
|
|
143
|
+
# extracted text is stripped and pages are joined with a blank
|
|
144
|
+
# line. A scanned-image PDF (no extractable text) comes back as
|
|
145
|
+
# the empty String — a deliberate silent skip, callers detect by
|
|
146
|
+
# length if they care.
|
|
147
|
+
# * **Plain text** — anything that {.detect_mime} doesn't
|
|
148
|
+
# recognise and that {.binary?} accepts. Read with UTF-8
|
|
149
|
+
# encoding; behaviour on non-UTF-8 bytes is whatever +File.read+
|
|
150
|
+
# does with +encoding: Encoding::UTF_8+ (which is "leave invalid
|
|
151
|
+
# bytes in, let downstream decide").
|
|
152
|
+
#
|
|
153
|
+
# Refusal cases — all raise rather than returning a sentinel
|
|
154
|
+
# because the callers are internal pikuri code, not an LLM
|
|
155
|
+
# tool. The LLM-facing +Workspace::Read+ does its own routing and
|
|
156
|
+
# returns "Error: ..." observations; that's a separate concern.
|
|
157
|
+
#
|
|
158
|
+
# * Path doesn't exist → +Errno::ENOENT+.
|
|
159
|
+
# * Path is a directory → +ArgumentError+.
|
|
160
|
+
# * Image (PNG / JPEG / GIF / WebP per {.detect_mime}) →
|
|
161
|
+
# +ArgumentError+; images aren't text.
|
|
162
|
+
# * Binary content (per {.binary?}) and not a recognised MIME →
|
|
163
|
+
# +ArgumentError+.
|
|
164
|
+
# * Malformed PDF — +pdf-reader+'s
|
|
165
|
+
# +MalformedPDFError+ / +UnsupportedFeatureError+ /
|
|
166
|
+
# +InvalidPageError+ are re-raised as a +RuntimeError+ with the
|
|
167
|
+
# path included so callers don't need to know pdf-reader's
|
|
168
|
+
# exception hierarchy.
|
|
169
|
+
#
|
|
170
|
+
# @param path [Pathname] file to read.
|
|
171
|
+
# @return [String] UTF-8 text. May be empty (empty text file, or
|
|
172
|
+
# scanned-image PDF).
|
|
173
|
+
# @raise [ArgumentError] if +path+ isn't a +Pathname+, points at
|
|
174
|
+
# a directory, is an image, or is binary.
|
|
175
|
+
# @raise [Errno::ENOENT] if +path+ doesn't exist.
|
|
176
|
+
# @raise [RuntimeError] on a malformed / unsupported PDF.
|
|
177
|
+
def read_as_text(path)
|
|
178
|
+
raise ArgumentError, "expected Pathname, got #{path.class}" unless path.is_a?(Pathname)
|
|
179
|
+
raise Errno::ENOENT, path.to_s unless path.exist?
|
|
180
|
+
raise ArgumentError, "#{path} is a directory" if path.directory?
|
|
181
|
+
|
|
182
|
+
mime = detect_mime(path)
|
|
183
|
+
return read_pdf_text(path) if mime == 'application/pdf'
|
|
184
|
+
raise ArgumentError, "#{path} is an image (#{mime}); cannot extract as text" if mime&.start_with?('image/')
|
|
185
|
+
raise ArgumentError, "#{path} appears to be binary; cannot extract as text" if binary?(path)
|
|
186
|
+
|
|
187
|
+
path.read(encoding: Encoding::UTF_8)
|
|
188
|
+
end
|
|
189
|
+
|
|
190
|
+
# Walk a PDF page-by-page via +pdf-reader+, returning a single
|
|
191
|
+
# String with non-empty page texts joined by blank lines. Catches
|
|
192
|
+
# the three +PDF::Reader+ exceptions Workspace::Read also handles
|
|
193
|
+
# and re-raises them as +RuntimeError+ with the path included.
|
|
194
|
+
#
|
|
195
|
+
# @param path [Pathname]
|
|
196
|
+
# @return [String]
|
|
197
|
+
# @raise [RuntimeError] on malformed / unsupported PDF.
|
|
198
|
+
def read_pdf_text(path)
|
|
199
|
+
pages = path.open('rb') do |io|
|
|
200
|
+
::PDF::Reader.new(io).pages.map { |p| p.text.strip }
|
|
201
|
+
end
|
|
202
|
+
pages.reject(&:empty?).join("\n\n")
|
|
203
|
+
rescue ::PDF::Reader::MalformedPDFError,
|
|
204
|
+
::PDF::Reader::UnsupportedFeatureError,
|
|
205
|
+
::PDF::Reader::InvalidPageError => e
|
|
206
|
+
raise "Cannot extract PDF text from #{path}: " \
|
|
207
|
+
"#{e.class.name.split('::').last}: #{e.message}"
|
|
208
|
+
end
|
|
209
|
+
private_class_method :read_pdf_text
|
|
210
|
+
|
|
211
|
+
# Coerce an +input+ argument into a bytes String for the sniffs.
|
|
212
|
+
# +String+ inputs are returned as-is (caller already sampled);
|
|
213
|
+
# +Pathname+ inputs are opened in binary mode and up to
|
|
214
|
+
# {SAMPLE_BYTES} are read off the front. Empty files come back
|
|
215
|
+
# as an empty String — {.binary?} treats that as not-binary and
|
|
216
|
+
# {.detect_mime} returns +nil+ for it, which is what the
|
|
217
|
+
# empty-text path wants.
|
|
218
|
+
#
|
|
219
|
+
# @param input [String, Pathname]
|
|
220
|
+
# @return [String] raw bytes (ASCII-8BIT encoding for the path
|
|
221
|
+
# case; whatever the caller passed for the bytes case)
|
|
222
|
+
# @raise [ArgumentError] if +input+ is neither a +String+ nor a
|
|
223
|
+
# +Pathname+ — refuses to guess, since a bare String could be
|
|
224
|
+
# either a path or actual bytes.
|
|
225
|
+
def sample_of(input)
|
|
226
|
+
case input
|
|
227
|
+
when String
|
|
228
|
+
input
|
|
229
|
+
when Pathname
|
|
230
|
+
input.open('rb') { |io| io.read(SAMPLE_BYTES) || +'' }
|
|
231
|
+
else
|
|
232
|
+
raise ArgumentError, "expected String bytes or Pathname, got #{input.class}"
|
|
233
|
+
end
|
|
234
|
+
end
|
|
235
|
+
private_class_method :sample_of
|
|
236
|
+
end
|
|
237
|
+
end
|
data/lib/pikuri/subprocess.rb
CHANGED
|
@@ -72,10 +72,17 @@ module Pikuri
|
|
|
72
72
|
# interpretation is wanted; +argv+ is passed to +exec+
|
|
73
73
|
# directly, so no implicit shell expansion happens here.
|
|
74
74
|
# @param chdir [String, Pathname] working directory
|
|
75
|
+
# @param env [Hash{String=>String}] extra environment variables to
|
|
76
|
+
# set in the child process. The child otherwise inherits the
|
|
77
|
+
# parent's full environment; entries in +env+ override or add to
|
|
78
|
+
# it. Default +{}+ (pure inheritance). Used by {Code::Bash} to
|
|
79
|
+
# thread {Pikuri::Workspace::Filesystem#env} (host git identity,
|
|
80
|
+
# etc.) into a bash subprocess whose sandbox would otherwise
|
|
81
|
+
# strip the host's config files.
|
|
75
82
|
# @return [Subprocess] handle — call {#wait} to block for the
|
|
76
83
|
# direct child to exit and read the captured output
|
|
77
|
-
def self.spawn(*argv, chdir:)
|
|
78
|
-
stdin, io, wait_thr = Open3.popen2e(*argv, chdir: chdir.to_s, pgroup: true)
|
|
84
|
+
def self.spawn(*argv, chdir:, env: {})
|
|
85
|
+
stdin, io, wait_thr = Open3.popen2e(env, *argv, chdir: chdir.to_s, pgroup: true)
|
|
79
86
|
stdin.close
|
|
80
87
|
register(wait_thr.pid)
|
|
81
88
|
new(io: io, wait_thr: wait_thr)
|