@princetheprogrammerbtw/husk 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +76 -11
- package/dist/cli/index.js +2 -3
- package/dist/cli/index.js.map +1 -1
- package/dist/index.d.ts +270 -2
- package/dist/index.js +308 -2
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -525,6 +525,48 @@ declare class OpenAIProvider implements Provider {
|
|
|
525
525
|
chat(request: ChatRequest): Promise<ChatResponse>;
|
|
526
526
|
}
|
|
527
527
|
|
|
528
|
+
/**
|
|
529
|
+
* Husk — Ollama provider adapter.
|
|
530
|
+
*
|
|
531
|
+
* Wraps Ollama's OpenAI-compatible Chat Completions API. Because Ollama
|
|
532
|
+
* exposes the exact same wire format as OpenAI, we can reuse the OpenAI
|
|
533
|
+
* adapter internally — only the default model name, base URL, and the
|
|
534
|
+
* provider 'name' field differ.
|
|
535
|
+
*
|
|
536
|
+
* Why this exists: local models (llama3.2, deepseek-r1, qwen2.5, etc.)
|
|
537
|
+
* are a first-class use case. Privacy, cost, and offline-ability all
|
|
538
|
+
* matter. Ollama is the dominant local-model runtime and uses the
|
|
539
|
+
* OpenAI API surface, so the adapter is a thin shell.
|
|
540
|
+
*
|
|
541
|
+
* Defaults:
|
|
542
|
+
* - model: 'llama3.2' (override via constructor)
|
|
543
|
+
* - baseURL: 'http://localhost:11434/v1' (override for remote Ollama)
|
|
544
|
+
* - apiKey: 'ollama' (Ollama ignores the value but the OpenAI SDK
|
|
545
|
+
* requires a non-empty string)
|
|
546
|
+
*
|
|
547
|
+
* Usage:
|
|
548
|
+
* const agent = new Agent({ model: new OllamaProvider() });
|
|
549
|
+
* const result = await agent.run('Explain quantum entanglement');
|
|
550
|
+
*
|
|
551
|
+
* For a list of models: `ollama list` (in your terminal).
|
|
552
|
+
*/
|
|
553
|
+
|
|
554
|
+
interface OllamaProviderOptions {
|
|
555
|
+
/** Model id (run `ollama list` to see what's pulled locally). Default: 'llama3.2'. */
|
|
556
|
+
readonly model?: string;
|
|
557
|
+
/** Ollama server URL. Default: 'http://localhost:11434/v1'. */
|
|
558
|
+
readonly baseURL?: string;
|
|
559
|
+
/** API key — Ollama ignores this but the OpenAI SDK requires it. Default: 'ollama'. */
|
|
560
|
+
readonly apiKey?: string;
|
|
561
|
+
}
|
|
562
|
+
declare class OllamaProvider implements Provider {
|
|
563
|
+
readonly name = "ollama";
|
|
564
|
+
readonly model: string;
|
|
565
|
+
private readonly inner;
|
|
566
|
+
constructor(options?: OllamaProviderOptions);
|
|
567
|
+
chat(request: Parameters<Provider['chat']>[0]): ReturnType<Provider['chat']>;
|
|
568
|
+
}
|
|
569
|
+
|
|
528
570
|
/**
|
|
529
571
|
* Husk — tool registry helpers.
|
|
530
572
|
*
|
|
@@ -688,6 +730,232 @@ interface GrepInput {
|
|
|
688
730
|
}
|
|
689
731
|
declare const Grep: ToolDefinition<GrepInput>;
|
|
690
732
|
|
|
733
|
+
/**
|
|
734
|
+
* Husk — eval runner types and API.
|
|
735
|
+
*
|
|
736
|
+
* The eval runner lets users assert that an agent's output meets
|
|
737
|
+
* expectations. Three primitives:
|
|
738
|
+
*
|
|
739
|
+
* 1. EvalCase — an input + the expected outcome (an assertion or a set of them)
|
|
740
|
+
* 2. Assertion — a function that takes the agent's result and returns pass/fail
|
|
741
|
+
* 3. EvalSuite — a named collection of eval cases, runnable as a unit
|
|
742
|
+
*
|
|
743
|
+
* The design choice: assertions are plain async functions, not a DSL.
|
|
744
|
+
* Users can use the 6 built-ins (equals, contains, matches, fn, notContains, lengthBetween) or
|
|
745
|
+
* write their own. The DSL is intentionally tiny — a heavy DSL
|
|
746
|
+
* (think Jest matchers) is a maintainability trap.
|
|
747
|
+
*
|
|
748
|
+
* Example:
|
|
749
|
+
*
|
|
750
|
+
* const suite = defineSuite({
|
|
751
|
+
* name: 'hello-agent',
|
|
752
|
+
* cases: [
|
|
753
|
+
* {
|
|
754
|
+
* name: 'answers geography',
|
|
755
|
+
* input: 'What is the capital of France? Answer in one word.',
|
|
756
|
+
* assertions: [
|
|
757
|
+
* contains('Paris'),
|
|
758
|
+
* matches(/^[A-Z][a-z]+$/), // single capitalized word
|
|
759
|
+
* ],
|
|
760
|
+
* },
|
|
761
|
+
* ],
|
|
762
|
+
* });
|
|
763
|
+
*
|
|
764
|
+
* const results = await runSuite(suite, () => new Agent({ model: ... }));
|
|
765
|
+
* console.log(`${results.passed}/${results.total} passed`);
|
|
766
|
+
*/
|
|
767
|
+
|
|
768
|
+
/**
|
|
769
|
+
* A function that checks whether an agent's output meets a criterion.
|
|
770
|
+
* Returns a pass/fail with an optional message explaining the failure.
|
|
771
|
+
*/
|
|
772
|
+
type Assertion = (result: AgentResult) => AssertionResult | Promise<AssertionResult>;
|
|
773
|
+
interface AssertionResult {
|
|
774
|
+
/** Whether the assertion passed. */
|
|
775
|
+
readonly pass: boolean;
|
|
776
|
+
/** Human-readable name shown in eval reports. */
|
|
777
|
+
readonly name: string;
|
|
778
|
+
/** Explanation of the outcome. Optional in the type, but should be set whenever pass is false to explain why. */
|
|
779
|
+
readonly message?: string;
|
|
780
|
+
}
|
|
781
|
+
/** Output exactly equals the expected string. */
|
|
782
|
+
declare function equals(expected: string): Assertion;
|
|
783
|
+
/** Output contains the expected substring (case-sensitive). */
|
|
784
|
+
declare function contains(needle: string): Assertion;
|
|
785
|
+
/** Output matches the expected regex. */
|
|
786
|
+
declare function matches(pattern: RegExp): Assertion;
|
|
787
|
+
/** Output passes a custom predicate. Use this for shape-based checks. */
|
|
788
|
+
declare function fn(name: string, predicate: (output: string) => boolean, message?: string): Assertion;
|
|
789
|
+
/** Output does NOT contain the given substring. */
|
|
790
|
+
declare function notContains(needle: string): Assertion;
|
|
791
|
+
/** Output length is within bounds. */
|
|
792
|
+
declare function lengthBetween(min: number, max: number): Assertion;
|
|
793
|
+
interface EvalCase {
|
|
794
|
+
/** Human-readable name shown in eval reports. */
|
|
795
|
+
readonly name: string;
|
|
796
|
+
/** The input to pass to agent.run(). */
|
|
797
|
+
readonly input: string;
|
|
798
|
+
/** Assertions to run on the result. All must pass for the case to pass. */
|
|
799
|
+
readonly assertions: readonly Assertion[];
|
|
800
|
+
/**
|
|
801
|
+
* Optional max iterations override. Lets you cap runaway agents per-case
|
|
802
|
+
* without affecting other cases in the suite. NOTE(review): the bundled runCase does not appear to read this field — confirm it is wired up.
|
|
803
|
+
*/
|
|
804
|
+
readonly maxIterations?: number;
|
|
805
|
+
}
|
|
806
|
+
interface EvalSuite {
|
|
807
|
+
/** Suite name shown in reports. */
|
|
808
|
+
readonly name: string;
|
|
809
|
+
/** Cases in this suite, run sequentially. */
|
|
810
|
+
readonly cases: readonly EvalCase[];
|
|
811
|
+
}
|
|
812
|
+
interface CaseResult {
|
|
813
|
+
readonly caseName: string;
|
|
814
|
+
readonly passed: boolean;
|
|
815
|
+
readonly assertionResults: readonly AssertionResult[];
|
|
816
|
+
readonly agentResult: AgentResult;
|
|
817
|
+
readonly durationMs: number;
|
|
818
|
+
}
|
|
819
|
+
interface SuiteResult {
|
|
820
|
+
readonly suiteName: string;
|
|
821
|
+
readonly results: readonly CaseResult[];
|
|
822
|
+
readonly passed: number;
|
|
823
|
+
readonly total: number;
|
|
824
|
+
readonly durationMs: number;
|
|
825
|
+
}
|
|
826
|
+
|
|
827
|
+
/**
|
|
828
|
+
* Husk — eval runner.
|
|
829
|
+
*
|
|
830
|
+
* Takes an EvalSuite + a factory that returns an Agent, runs each
|
|
831
|
+
* case sequentially, applies the assertions, and reports results.
|
|
832
|
+
*
|
|
833
|
+
* Why a factory (not an Agent instance): each case might want its
|
|
834
|
+
* own agent configuration. The factory pattern gives the user full
|
|
835
|
+
* control without forcing a specific shape.
|
|
836
|
+
*
|
|
837
|
+
* Why sequential (not parallel): LLM calls compete for rate limits
|
|
838
|
+
* and cost $$$. Sequential gives predictable billing and easier
|
|
839
|
+
* debugging. Parallel mode is planned for v0.3.0.
|
|
840
|
+
*
|
|
841
|
+
* Failure handling: an agent run that throws an error is reported
|
|
842
|
+
* as a case failure (not a runner crash). The error message is
|
|
843
|
+
* included in the assertion results so the user can see what broke.
|
|
844
|
+
*/
|
|
845
|
+
|
|
846
|
+
/**
|
|
847
|
+
* A factory that produces a fresh Agent per case. Called once per
|
|
848
|
+
* case so each case can have isolated memory, config, etc.
|
|
849
|
+
*/
|
|
850
|
+
type AgentFactory = () => Agent | Promise<Agent>;
|
|
851
|
+
interface RunSuiteOptions {
|
|
852
|
+
/** Stop on first failing case. Default: false (run all cases regardless). */
|
|
853
|
+
readonly failFast?: boolean;
|
|
854
|
+
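/** Runner-level hooks for logging/progress: onCaseStart is called with the case name before each case runs, onCaseEnd with the result after it finishes. Default: none (silent). */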
|
|
855
|
+
readonly onCaseStart?: (caseName: string) => void;
|
|
856
|
+
readonly onCaseEnd?: (result: CaseResult) => void;
|
|
857
|
+
}
|
|
858
|
+
declare function runSuite(suite: EvalSuite, factory: AgentFactory, options?: RunSuiteOptions): Promise<SuiteResult>;
|
|
859
|
+
/**
|
|
860
|
+
* Build a suite with less boilerplate. Equivalent to constructing
|
|
861
|
+
* the object inline, but reads more clearly at the call site.
|
|
862
|
+
*/
|
|
863
|
+
declare function defineSuite(suite: {
|
|
864
|
+
name: string;
|
|
865
|
+
cases: readonly EvalCase[];
|
|
866
|
+
}): EvalSuite;
|
|
867
|
+
|
|
868
|
+
/**
|
|
869
|
+
* Husk — observability types (tracer interface).
|
|
870
|
+
*
|
|
871
|
+
* A minimal, OTel-inspired tracer interface. Husk's events are mapped
|
|
872
|
+
* to spans by the mapper in ./tracer.ts. Users can plug in the real
|
|
873
|
+
* @opentelemetry/api tracer via the adapter (see ./otel-adapter.ts)
|
|
874
|
+
* or any other compatible backend.
|
|
875
|
+
*
|
|
876
|
+
* Design choice: we don't depend on @opentelemetry/api directly. The
|
|
877
|
+
* interface here is a strict subset of OTel's Span interface (just
|
|
878
|
+
* what's needed for agent observability). Keeping the dep out of
|
|
879
|
+
* Husk's core means users who don't need OTel pay nothing for it.
|
|
880
|
+
*
|
|
881
|
+
* For users who want full OTel:
|
|
882
|
+
* import { trace } from '@opentelemetry/api';
|
|
883
|
+
* import { toOtelTracer } from '@princetheprogrammerbtw/husk/otel-adapter';
|
|
884
|
+
* agent.onAny(toOtelTracer(trace.getTracer('husk')).onEvent);
|
|
885
|
+
*/
|
|
886
|
+
type SpanKind = 'internal' | 'client' | 'server';
|
|
887
|
+
interface SpanContext {
|
|
888
|
+
/** Unique trace id (all spans in one agent.run share this). */
|
|
889
|
+
readonly traceId: string;
|
|
890
|
+
/** Unique span id. */
|
|
891
|
+
readonly spanId: string;
|
|
892
|
+
/** Parent span id, if any. */
|
|
893
|
+
readonly parentSpanId?: string;
|
|
894
|
+
}
|
|
895
|
+
interface SpanOptions {
|
|
896
|
+
readonly name: string;
|
|
897
|
+
readonly kind?: SpanKind;
|
|
898
|
+
readonly attributes?: Readonly<Record<string, unknown>>;
|
|
899
|
+
readonly startTimeNs?: bigint;
|
|
900
|
+
}
|
|
901
|
+
interface Span {
|
|
902
|
+
readonly context: SpanContext;
|
|
903
|
+
/** Record an event (timestamped annotation) on the span. */
|
|
904
|
+
addEvent(name: string, attributes?: Record<string, unknown>): void;
|
|
905
|
+
/** Set or update an attribute on the span. */
|
|
906
|
+
setAttribute(key: string, value: string | number | boolean | null): void;
|
|
907
|
+
/** Record an exception. */
|
|
908
|
+
recordException(err: Error): void;
|
|
909
|
+
/** Mark the span as failed. */
|
|
910
|
+
setStatus(status: 'ok' | 'error', message?: string): void;
|
|
911
|
+
/** End the span. Must be called exactly once. */
|
|
912
|
+
end(endTimeNs?: bigint): void;
|
|
913
|
+
}
|
|
914
|
+
interface Tracer {
|
|
915
|
+
/**
|
|
916
|
+
* Start a new span. If parent is provided, the new span becomes a
|
|
917
|
+
* child of it. Returns the new span; caller is responsible for
|
|
918
|
+
* calling .end() on it.
|
|
919
|
+
*/
|
|
920
|
+
startSpan(options: SpanOptions, parent?: SpanContext): Span;
|
|
921
|
+
}
|
|
922
|
+
/**
|
|
923
|
+
* A tracer that does nothing. Used when no real tracer is configured.
|
|
924
|
+
* Near-zero overhead — every span method is a no-op, so the cost is one virtual
|
|
925
|
+
* call per event.
|
|
926
|
+
*/
|
|
927
|
+
declare class NoopTracer implements Tracer {
|
|
928
|
+
startSpan(_options: SpanOptions, _parent?: SpanContext): Span;
|
|
929
|
+
}
|
|
930
|
+
|
|
931
|
+
/**
|
|
932
|
+
* Husk — agent event → tracer mapper.
|
|
933
|
+
*
|
|
934
|
+
* Translates the typed AgentEvent stream into tracer spans. The top-
|
|
935
|
+
* level 'agent:start' begins a trace, each iteration becomes a child
|
|
936
|
+
* span, and tool calls become their own spans under the iteration.
|
|
937
|
+
*
|
|
938
|
+
* Design: spans are created in the order their start events arrive. Tool spans nest under
|
|
939
|
+
* the iteration span. The end of the agent run ends the trace span.
|
|
940
|
+
*
|
|
941
|
+
* Usage:
|
|
942
|
+
* const mapper = new EventTracer(myTracer);
|
|
943
|
+
* agent.onAny(mapper.onEvent); // onEvent is an arrow-function property, so no .bind() is needed
|
|
944
|
+
* await agent.run(...); // emits spans to myTracer
|
|
945
|
+
*/
|
|
946
|
+
|
|
947
|
+
declare class EventTracer {
|
|
948
|
+
private readonly tracer;
|
|
949
|
+
private traceSpan;
|
|
950
|
+
private iterationSpan;
|
|
951
|
+
private toolSpans;
|
|
952
|
+
constructor(tracer: Tracer);
|
|
953
|
+
/**
|
|
954
|
+
* Event handler to register with the agent: `agent.onAny(tracer.onEvent)`. It is an arrow-function property, so no `.bind()` is needed.
|
|
955
|
+
*/
|
|
956
|
+
onEvent: AgentEventHandler;
|
|
957
|
+
}
|
|
958
|
+
|
|
691
959
|
/**
|
|
692
960
|
* Husk — public API entry point.
|
|
693
961
|
*
|
|
@@ -697,6 +965,6 @@ declare const Grep: ToolDefinition<GrepInput>;
|
|
|
697
965
|
*
|
|
698
966
|
* Re-exports are added incrementally as features land (see commit history).
|
|
699
967
|
*/
|
|
700
|
-
declare const VERSION = "0.0
|
|
968
|
+
declare const VERSION = "0.1.0";
|
|
701
969
|
|
|
702
|
-
export { Agent, type AgentConfig, type AgentEvent, AgentEventEmitter, type AgentEventHandler, type AgentResult, AnthropicProvider, type AnthropicProviderOptions, Bash, type BashInput, type ChatChunk, type ChatRequest, type ChatResponse, ConsoleLogger, type ContentBlock, Edit, type EditInput, type Example, FileStore, type FileStoreOptions, Grep, type GrepInput, InMemoryStore, type JSONSchema, type JSONSchemaField, type LogLevel, type Logger, type MemoryStore, type Message, type MessageContent, OpenAIProvider, type OpenAIProviderOptions, type Provider, Read, type ReadInput, type Role, type SteeringConfig, type StopReason, type TextBlock, type TokenUsage, type ToolContext, type ToolDefinition, type ToolResult, type ToolResultBlock, type ToolUseBlock, VERSION, Write, type WriteInput, arrayField, booleanField, buildExampleMessages, buildSystemPrompt, defineTool, integerField, logEventsTo, numberField, objectField, objectSchema, stringField };
|
|
970
|
+
export { Agent, type AgentConfig, type AgentEvent, AgentEventEmitter, type AgentEventHandler, type AgentFactory, type AgentResult, AnthropicProvider, type AnthropicProviderOptions, type Assertion, type AssertionResult, Bash, type BashInput, type CaseResult, type ChatChunk, type ChatRequest, type ChatResponse, ConsoleLogger, type ContentBlock, Edit, type EditInput, type EvalCase, type EvalSuite, EventTracer, type Example, FileStore, type FileStoreOptions, Grep, type GrepInput, InMemoryStore, type JSONSchema, type JSONSchemaField, type LogLevel, type Logger, type MemoryStore, type Message, type MessageContent, NoopTracer, OllamaProvider, type OllamaProviderOptions, OpenAIProvider, type OpenAIProviderOptions, type Provider, Read, type ReadInput, type Role, type RunSuiteOptions, type Span, type SpanContext, type SpanKind, type SpanOptions, type SteeringConfig, type StopReason, type SuiteResult, type TextBlock, type TokenUsage, type ToolContext, type ToolDefinition, type ToolResult, type ToolResultBlock, type ToolUseBlock, type Tracer, VERSION, Write, type WriteInput, arrayField, booleanField, buildExampleMessages, buildSystemPrompt, contains, defineSuite, defineTool, equals, fn, integerField, lengthBetween, logEventsTo, matches, notContains, numberField, objectField, objectSchema, runSuite, stringField };
|
package/dist/index.js
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
1
2
|
import { promises } from 'fs';
|
|
2
3
|
import { join, resolve, dirname } from 'path';
|
|
3
4
|
import Anthropic from '@anthropic-ai/sdk';
|
|
@@ -820,6 +821,27 @@ function mapStopReason2(reason) {
|
|
|
820
821
|
}
|
|
821
822
|
}
|
|
822
823
|
|
|
824
|
+
// src/providers/ollama.ts
|
|
825
|
+
var DEFAULT_BASE_URL = "http://localhost:11434/v1";
|
|
826
|
+
var DEFAULT_MODEL = "llama3.2";
|
|
827
|
+
var PLACEHOLDER_API_KEY = "ollama";
|
|
828
|
+
var OllamaProvider = class {
|
|
829
|
+
name = "ollama";
|
|
830
|
+
model;
|
|
831
|
+
inner;
|
|
832
|
+
constructor(options = {}) {
|
|
833
|
+
this.model = options.model ?? DEFAULT_MODEL;
|
|
834
|
+
this.inner = new OpenAIProvider({
|
|
835
|
+
apiKey: options.apiKey ?? PLACEHOLDER_API_KEY,
|
|
836
|
+
model: this.model,
|
|
837
|
+
baseURL: options.baseURL ?? DEFAULT_BASE_URL
|
|
838
|
+
});
|
|
839
|
+
}
|
|
840
|
+
chat(request) {
|
|
841
|
+
return this.inner.chat(request);
|
|
842
|
+
}
|
|
843
|
+
};
|
|
844
|
+
|
|
823
845
|
// src/tools/registry.ts
|
|
824
846
|
function defineTool(tool) {
|
|
825
847
|
return {
|
|
@@ -1102,9 +1124,293 @@ function truncateOutput(output, limit) {
|
|
|
1102
1124
|
... (${lines.length - limit} more matches truncated)`;
|
|
1103
1125
|
}
|
|
1104
1126
|
|
|
1127
|
+
// src/evals/types.ts
|
|
1128
|
+
function equals(expected) {
|
|
1129
|
+
return (result) => {
|
|
1130
|
+
const pass = result.output === expected;
|
|
1131
|
+
return pass ? { name: `equals(${JSON.stringify(expected).slice(0, 40)})`, pass: true } : {
|
|
1132
|
+
name: `equals(${JSON.stringify(expected).slice(0, 40)})`,
|
|
1133
|
+
pass: false,
|
|
1134
|
+
message: `Expected ${JSON.stringify(expected)}, got ${JSON.stringify(result.output).slice(0, 200)}`
|
|
1135
|
+
};
|
|
1136
|
+
};
|
|
1137
|
+
}
|
|
1138
|
+
function contains(needle) {
|
|
1139
|
+
return (result) => {
|
|
1140
|
+
const pass = result.output.includes(needle);
|
|
1141
|
+
return pass ? { name: `contains(${JSON.stringify(needle).slice(0, 40)})`, pass: true } : {
|
|
1142
|
+
name: `contains(${JSON.stringify(needle).slice(0, 40)})`,
|
|
1143
|
+
pass: false,
|
|
1144
|
+
message: `Expected output to contain ${JSON.stringify(needle)}, got ${JSON.stringify(result.output).slice(0, 200)}`
|
|
1145
|
+
};
|
|
1146
|
+
};
|
|
1147
|
+
}
|
|
1148
|
+
function matches(pattern) {
|
|
1149
|
+
return (result) => {
|
|
1150
|
+
const m = pattern.exec(result.output);
|
|
1151
|
+
return {
|
|
1152
|
+
name: `matches(${pattern})`,
|
|
1153
|
+
pass: m !== null,
|
|
1154
|
+
...m === null ? {
|
|
1155
|
+
message: `Output did not match ${pattern}: ${JSON.stringify(result.output).slice(0, 200)}`
|
|
1156
|
+
} : {}
|
|
1157
|
+
};
|
|
1158
|
+
};
|
|
1159
|
+
}
|
|
1160
|
+
function fn(name, predicate, message) {
|
|
1161
|
+
return (result) => {
|
|
1162
|
+
const pass = predicate(result.output);
|
|
1163
|
+
return {
|
|
1164
|
+
name,
|
|
1165
|
+
pass,
|
|
1166
|
+
...pass ? {} : { message: message ?? `Predicate ${name} failed` }
|
|
1167
|
+
};
|
|
1168
|
+
};
|
|
1169
|
+
}
|
|
1170
|
+
function notContains(needle) {
|
|
1171
|
+
return (result) => {
|
|
1172
|
+
const pass = !result.output.includes(needle);
|
|
1173
|
+
return pass ? { name: `notContains(${JSON.stringify(needle).slice(0, 40)})`, pass: true } : {
|
|
1174
|
+
name: `notContains(${JSON.stringify(needle).slice(0, 40)})`,
|
|
1175
|
+
pass: false,
|
|
1176
|
+
message: `Output should not contain ${JSON.stringify(needle)} but did: ${JSON.stringify(result.output).slice(0, 200)}`
|
|
1177
|
+
};
|
|
1178
|
+
};
|
|
1179
|
+
}
|
|
1180
|
+
function lengthBetween(min, max) {
|
|
1181
|
+
return (result) => {
|
|
1182
|
+
const len = result.output.length;
|
|
1183
|
+
const pass = len >= min && len <= max;
|
|
1184
|
+
return pass ? { name: `lengthBetween(${min}, ${max})`, pass: true } : {
|
|
1185
|
+
name: `lengthBetween(${min}, ${max})`,
|
|
1186
|
+
pass: false,
|
|
1187
|
+
message: `Output length ${len} not in [${min}, ${max}]`
|
|
1188
|
+
};
|
|
1189
|
+
};
|
|
1190
|
+
}
|
|
1191
|
+
|
|
1192
|
+
// src/evals/runner.ts
|
|
1193
|
+
async function runSuite(suite, factory, options = {}) {
|
|
1194
|
+
const start = Date.now();
|
|
1195
|
+
const results = [];
|
|
1196
|
+
let passed = 0;
|
|
1197
|
+
for (const c of suite.cases) {
|
|
1198
|
+
options.onCaseStart?.(c.name);
|
|
1199
|
+
const caseResult = await runCase(c, factory);
|
|
1200
|
+
results.push(caseResult);
|
|
1201
|
+
if (caseResult.passed) passed += 1;
|
|
1202
|
+
options.onCaseEnd?.(caseResult);
|
|
1203
|
+
if (options.failFast && !caseResult.passed) {
|
|
1204
|
+
break;
|
|
1205
|
+
}
|
|
1206
|
+
}
|
|
1207
|
+
return {
|
|
1208
|
+
suiteName: suite.name,
|
|
1209
|
+
results,
|
|
1210
|
+
passed,
|
|
1211
|
+
total: suite.cases.length,
|
|
1212
|
+
durationMs: Date.now() - start
|
|
1213
|
+
};
|
|
1214
|
+
}
|
|
1215
|
+
async function runCase(c, factory) {
|
|
1216
|
+
const start = Date.now();
|
|
1217
|
+
const agent = await factory();
|
|
1218
|
+
let agentResult;
|
|
1219
|
+
try {
|
|
1220
|
+
agentResult = await agent.run(c.input);
|
|
1221
|
+
} catch (err) {
|
|
1222
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
1223
|
+
const errorAssertionResult = {
|
|
1224
|
+
pass: false,
|
|
1225
|
+
name: "agent.run",
|
|
1226
|
+
message: `agent.run threw: ${message}`
|
|
1227
|
+
};
|
|
1228
|
+
return {
|
|
1229
|
+
caseName: c.name,
|
|
1230
|
+
passed: false,
|
|
1231
|
+
assertionResults: [errorAssertionResult],
|
|
1232
|
+
agentResult: {
|
|
1233
|
+
output: "",
|
|
1234
|
+
messages: [],
|
|
1235
|
+
iterations: 0,
|
|
1236
|
+
usage: { inputTokens: 0, outputTokens: 0 },
|
|
1237
|
+
durationMs: Date.now() - start
|
|
1238
|
+
},
|
|
1239
|
+
durationMs: Date.now() - start
|
|
1240
|
+
};
|
|
1241
|
+
}
|
|
1242
|
+
const assertionResults = [];
|
|
1243
|
+
for (const a of c.assertions) {
|
|
1244
|
+
const r = await a(agentResult);
|
|
1245
|
+
assertionResults.push(r);
|
|
1246
|
+
}
|
|
1247
|
+
const allPassed = assertionResults.every((r) => r.pass);
|
|
1248
|
+
return {
|
|
1249
|
+
caseName: c.name,
|
|
1250
|
+
passed: allPassed,
|
|
1251
|
+
assertionResults,
|
|
1252
|
+
agentResult,
|
|
1253
|
+
durationMs: Date.now() - start
|
|
1254
|
+
};
|
|
1255
|
+
}
|
|
1256
|
+
function defineSuite(suite) {
|
|
1257
|
+
return {
|
|
1258
|
+
name: suite.name,
|
|
1259
|
+
cases: suite.cases
|
|
1260
|
+
};
|
|
1261
|
+
}
|
|
1262
|
+
|
|
1263
|
+
// src/obs/tracer.ts
|
|
1264
|
+
var NoopTracer = class {
|
|
1265
|
+
startSpan(_options, _parent) {
|
|
1266
|
+
const ctx = {
|
|
1267
|
+
traceId: "0",
|
|
1268
|
+
spanId: "0"
|
|
1269
|
+
};
|
|
1270
|
+
return {
|
|
1271
|
+
context: ctx,
|
|
1272
|
+
addEvent: () => {
|
|
1273
|
+
},
|
|
1274
|
+
setAttribute: () => {
|
|
1275
|
+
},
|
|
1276
|
+
recordException: () => {
|
|
1277
|
+
},
|
|
1278
|
+
setStatus: () => {
|
|
1279
|
+
},
|
|
1280
|
+
end: () => {
|
|
1281
|
+
}
|
|
1282
|
+
};
|
|
1283
|
+
}
|
|
1284
|
+
};
|
|
1285
|
+
|
|
1286
|
+
// src/obs/mapper.ts
|
|
1287
|
+
var EventTracer = class {
|
|
1288
|
+
tracer;
|
|
1289
|
+
traceSpan = null;
|
|
1290
|
+
iterationSpan = null;
|
|
1291
|
+
toolSpans = /* @__PURE__ */ new Map();
|
|
1292
|
+
constructor(tracer) {
|
|
1293
|
+
this.tracer = tracer;
|
|
1294
|
+
}
|
|
1295
|
+
/**
|
|
1296
|
+
* Event handler to register with the agent: `agent.onAny(tracer.onEvent)`. It is an arrow-function property, so no `.bind()` is needed.
|
|
1297
|
+
*/
|
|
1298
|
+
onEvent = (event) => {
|
|
1299
|
+
switch (event.type) {
|
|
1300
|
+
case "agent:start": {
|
|
1301
|
+
this.traceSpan = this.tracer.startSpan({
|
|
1302
|
+
name: "agent.run",
|
|
1303
|
+
kind: "internal",
|
|
1304
|
+
attributes: {
|
|
1305
|
+
"husk.input": event.input,
|
|
1306
|
+
"husk.session_id": event.sessionId
|
|
1307
|
+
}
|
|
1308
|
+
});
|
|
1309
|
+
break;
|
|
1310
|
+
}
|
|
1311
|
+
case "agent:iteration": {
|
|
1312
|
+
this.iterationSpan?.end();
|
|
1313
|
+
this.iterationSpan = this.tracer.startSpan(
|
|
1314
|
+
{
|
|
1315
|
+
name: `iteration.${event.iteration}`,
|
|
1316
|
+
kind: "internal",
|
|
1317
|
+
attributes: { "husk.iteration": event.iteration }
|
|
1318
|
+
},
|
|
1319
|
+
this.traceSpan?.context
|
|
1320
|
+
);
|
|
1321
|
+
break;
|
|
1322
|
+
}
|
|
1323
|
+
case "provider:request": {
|
|
1324
|
+
this.iterationSpan?.addEvent("provider.request", {
|
|
1325
|
+
"provider.model": event.request.model
|
|
1326
|
+
});
|
|
1327
|
+
break;
|
|
1328
|
+
}
|
|
1329
|
+
case "provider:response": {
|
|
1330
|
+
if (this.iterationSpan) {
|
|
1331
|
+
this.iterationSpan.setAttribute(
|
|
1332
|
+
"provider.input_tokens",
|
|
1333
|
+
event.response.usage.inputTokens
|
|
1334
|
+
);
|
|
1335
|
+
this.iterationSpan.setAttribute(
|
|
1336
|
+
"provider.output_tokens",
|
|
1337
|
+
event.response.usage.outputTokens
|
|
1338
|
+
);
|
|
1339
|
+
this.iterationSpan.setAttribute("provider.stop_reason", event.response.stopReason);
|
|
1340
|
+
this.iterationSpan.setAttribute("provider.duration_ms", event.durationMs);
|
|
1341
|
+
}
|
|
1342
|
+
break;
|
|
1343
|
+
}
|
|
1344
|
+
case "tool:call": {
|
|
1345
|
+
const span = this.tracer.startSpan(
|
|
1346
|
+
{
|
|
1347
|
+
name: `tool.${event.name}`,
|
|
1348
|
+
kind: "internal",
|
|
1349
|
+
attributes: {
|
|
1350
|
+
"tool.name": event.name,
|
|
1351
|
+
"tool.input": JSON.stringify(event.input)
|
|
1352
|
+
}
|
|
1353
|
+
},
|
|
1354
|
+
this.iterationSpan?.context ?? this.traceSpan?.context
|
|
1355
|
+
);
|
|
1356
|
+
this.toolSpans.set(event.id, span);
|
|
1357
|
+
break;
|
|
1358
|
+
}
|
|
1359
|
+
case "tool:result": {
|
|
1360
|
+
const span = this.toolSpans.get(event.id);
|
|
1361
|
+
if (span) {
|
|
1362
|
+
span.setAttribute("tool.is_error", event.result.isError ?? false);
|
|
1363
|
+
span.setAttribute("tool.duration_ms", event.durationMs);
|
|
1364
|
+
if (event.result.isError) {
|
|
1365
|
+
span.setStatus("error", event.result.output);
|
|
1366
|
+
} else {
|
|
1367
|
+
span.setStatus("ok");
|
|
1368
|
+
}
|
|
1369
|
+
span.end();
|
|
1370
|
+
this.toolSpans.delete(event.id);
|
|
1371
|
+
}
|
|
1372
|
+
break;
|
|
1373
|
+
}
|
|
1374
|
+
case "agent:end": {
|
|
1375
|
+
this.iterationSpan?.end();
|
|
1376
|
+
this.iterationSpan = null;
|
|
1377
|
+
if (this.traceSpan) {
|
|
1378
|
+
this.traceSpan.setAttribute("husk.iterations", event.iterations);
|
|
1379
|
+
this.traceSpan.setAttribute("husk.duration_ms", event.durationMs);
|
|
1380
|
+
this.traceSpan.setStatus("ok");
|
|
1381
|
+
this.traceSpan.end();
|
|
1382
|
+
this.traceSpan = null;
|
|
1383
|
+
}
|
|
1384
|
+
break;
|
|
1385
|
+
}
|
|
1386
|
+
case "agent:error": {
|
|
1387
|
+
if (this.traceSpan) {
|
|
1388
|
+
this.traceSpan.recordException(event.error);
|
|
1389
|
+
this.traceSpan.setStatus("error", event.error.message);
|
|
1390
|
+
this.traceSpan.end();
|
|
1391
|
+
this.traceSpan = null;
|
|
1392
|
+
}
|
|
1393
|
+
this.iterationSpan?.end();
|
|
1394
|
+
this.iterationSpan = null;
|
|
1395
|
+
for (const span of this.toolSpans.values()) {
|
|
1396
|
+
span.end();
|
|
1397
|
+
}
|
|
1398
|
+
this.toolSpans.clear();
|
|
1399
|
+
break;
|
|
1400
|
+
}
|
|
1401
|
+
case "agent:message": {
|
|
1402
|
+
this.iterationSpan?.addEvent("message", {
|
|
1403
|
+
"message.role": event.message.role
|
|
1404
|
+
});
|
|
1405
|
+
break;
|
|
1406
|
+
}
|
|
1407
|
+
}
|
|
1408
|
+
};
|
|
1409
|
+
};
|
|
1410
|
+
|
|
1105
1411
|
// src/index.ts
|
|
1106
|
-
var VERSION = "0.0
|
|
1412
|
+
var VERSION = "0.1.0";
|
|
1107
1413
|
|
|
1108
|
-
export { Agent, AgentEventEmitter, AnthropicProvider, Bash, ConsoleLogger, Edit, FileStore, Grep, InMemoryStore, OpenAIProvider, Read, VERSION, Write, arrayField, booleanField, buildExampleMessages, buildSystemPrompt, defineTool, integerField, logEventsTo, numberField, objectField, objectSchema, stringField };
|
|
1414
|
+
export { Agent, AgentEventEmitter, AnthropicProvider, Bash, ConsoleLogger, Edit, EventTracer, FileStore, Grep, InMemoryStore, NoopTracer, OllamaProvider, OpenAIProvider, Read, VERSION, Write, arrayField, booleanField, buildExampleMessages, buildSystemPrompt, contains, defineSuite, defineTool, equals, fn, integerField, lengthBetween, logEventsTo, matches, notContains, numberField, objectField, objectSchema, runSuite, stringField };
|
|
1109
1415
|
//# sourceMappingURL=index.js.map
|
|
1110
1416
|
//# sourceMappingURL=index.js.map
|