orchestrated 0.1.9 → 0.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.d.ts CHANGED
@@ -10,6 +10,7 @@
10
10
  import type { Batch } from 'openai/resources';
11
11
  import type { ChatCompletionCreateParamsBase } from 'openai/resources/chat/completions.mjs';
12
12
  import { ClientOptions } from 'openai';
13
+ import { default as default_2 } from 'openai';
13
14
  import type { Metadata } from 'openai/resources';
14
15
  import OpenAI from 'openai';
15
16
  import type { z } from 'zod';
@@ -175,7 +176,9 @@ declare interface BatchResult {
175
176
  };
176
177
  }
177
178
 
178
- export declare const Behavioral: (args: unknown) => Promise<Score>;
179
+ export declare const Behavioral: ((args: unknown) => Promise<Score>) & {
180
+ definition?: SerializableScorerDefinition;
181
+ };
179
182
 
180
183
  /**
181
184
  * Bundle reference with fingerprint
@@ -204,7 +207,9 @@ export declare const colors: {
204
207
  yellow: string;
205
208
  };
206
209
 
207
- export declare const ContentSafety: (args: unknown) => Promise<Score>;
210
+ export declare const ContentSafety: ((args: unknown) => Promise<Score>) & {
211
+ definition?: SerializableScorerDefinition;
212
+ };
208
213
 
209
214
  /**
210
215
  * Context builder with progressive type refinement
@@ -284,7 +289,9 @@ declare interface DataSourceMetadata {
284
289
  ctx: ResolvedDataCtx;
285
290
  }
286
291
 
287
- export declare const Effectiveness: (args: unknown) => Promise<Score>;
292
+ export declare const Effectiveness: ((args: unknown) => Promise<Score>) & {
293
+ definition?: SerializableScorerDefinition;
294
+ };
288
295
 
289
296
  /**
290
297
  * Eval overload for SerializableEvaluation (from API responses or definitions.json)
@@ -421,16 +428,36 @@ export declare interface EvalOptions<EvalReport = boolean> {
421
428
  * Callback for pending batch UI (optional, CLI injects ink renderer)
422
429
  */
423
430
  onPendingBatch?: PendingBatchCallback;
431
+ /**
432
+ * Execution mode for evaluations:
433
+ * - "batch": Use batching for expensive LLM scorers (default, cost-effective)
434
+ * - "sync": Execute scorers synchronously without batching (faster, real-time)
435
+ */
436
+ execute?: "batch" | "sync";
437
+ /**
438
+ * Callback for streaming individual results as they complete (sync mode only)
439
+ * Called after each test case is evaluated
440
+ * Useful for real-time progress updates and streaming to clients
441
+ */
442
+ onResult?: (result: EvalResult) => void | Promise<void>;
424
443
  /**
425
444
  * Optional batch client for managing batch requests.
445
+ * Only used when execute is "batch".
426
446
  * If not provided, a default BatchClient will be created.
427
447
  */
428
448
  batchClient?: BatchClient;
429
449
  /**
430
450
  * Optional eval client for LLM scoring operations.
451
+ * Only used when execute is "batch".
431
452
  * If not provided, a default EvalClient will be created.
432
453
  */
433
454
  evalClient?: EvalClient;
455
+ /**
456
+ * Optional OpenAI client for sync LLM operations.
457
+ * Only used when execute is "sync".
458
+ * If not provided, a default OpenAI client will be created.
459
+ */
460
+ openaiClient?: default_2;
434
461
  __schedule?: string;
435
462
  }
436
463
 
@@ -446,6 +473,9 @@ export declare interface EvalResult<Input = any, Output = any, Expected = any> {
446
473
  tags?: string[];
447
474
  id?: string;
448
475
  hasPendingBatch?: boolean;
476
+ ctx: {
477
+ [k: string]: any;
478
+ };
449
479
  }
450
480
 
451
481
  /**
package/index.js CHANGED
@@ -21089,7 +21089,7 @@ async function initState(partial2 = {}, skipAuth = false) {
21089
21089
  throw new Error("State is immutable and already initialized. Use resetState() for testing.");
21090
21090
  }
21091
21091
  if (globalThis.__ORCHESTRATED_SHARED_STATE__) {
21092
- const sharedState = globalThis.__ORCHESTRATED_SHARED_STATE__;
21092
+ const { lazyLoad, ...sharedState } = globalThis.__ORCHESTRATED_SHARED_STATE__;
21093
21093
  globalState = Object.freeze({ ...sharedState });
21094
21094
  isInitialized = true;
21095
21095
  return;
@@ -21148,7 +21148,8 @@ async function initState(partial2 = {}, skipAuth = false) {
21148
21148
  }
21149
21149
  function getState() {
21150
21150
  if (!isInitialized && globalThis.__ORCHESTRATED_SHARED_STATE__) {
21151
- return globalThis.__ORCHESTRATED_SHARED_STATE__;
21151
+ const { lazyLoad, ...shareableState } = globalThis.__ORCHESTRATED_SHARED_STATE__;
21152
+ return shareableState;
21152
21153
  }
21153
21154
  if (!isInitialized) {
21154
21155
  throw new Error("State not initialized. Call await initState() before using getState().");
@@ -21242,10 +21243,10 @@ var init_data_source = __esm(() => {
21242
21243
  id: exports_external.string(),
21243
21244
  ctx: exports_external.object({
21244
21245
  systemPrompt: exports_external.string().optional()
21245
- }),
21246
+ }).loose(),
21246
21247
  input: exports_external.string(),
21247
21248
  output: exports_external.string()
21248
- }))
21249
+ }).loose())
21249
21250
  }).passthrough();
21250
21251
  });
21251
21252
 
@@ -88550,10 +88551,34 @@ function buildPromptScorer(config2) {
88550
88551
  enumerable: false,
88551
88552
  configurable: true
88552
88553
  });
88554
+ const serializedSchema = config2.parameters ? serializeZodSchema(config2.parameters) : {
88555
+ type: "zod",
88556
+ definition: JSON.stringify({ type: "object" })
88557
+ };
88558
+ const definition = {
88559
+ type: "prompt",
88560
+ name: config2.name,
88561
+ slug: config2.slug || config2.name.toLowerCase().replace(/[_\s]+/g, "-"),
88562
+ description: config2.description || `Prompt-based scorer: ${config2.name}`,
88563
+ schema: serializedSchema,
88564
+ promptTemplate: config2.promptTemplate,
88565
+ choiceScores: config2.choiceScores,
88566
+ model: config2.model,
88567
+ useCoT: config2.useCoT,
88568
+ temperature: config2.temperature,
88569
+ metadata: config2.metadata
88570
+ };
88571
+ Object.defineProperty(scorerFunction, "definition", {
88572
+ value: definition,
88573
+ writable: false,
88574
+ enumerable: false,
88575
+ configurable: true
88576
+ });
88553
88577
  return scorerFunction;
88554
88578
  }
88555
88579
  var init_scorer = __esm(() => {
88556
88580
  init_jsdist();
88581
+ init_schema_serializer();
88557
88582
  });
88558
88583
 
88559
88584
  // src/serialization/types.ts
@@ -117088,6 +117113,22 @@ async function serializeScorer(scorer, evalName, index) {
117088
117113
  }
117089
117114
  if (typeof scorer === "function") {
117090
117115
  const scorerName = scorer.name;
117116
+ console.log({
117117
+ scorerName,
117118
+ scorer
117119
+ });
117120
+ const definition = scorer.definition;
117121
+ if (definition && typeof definition === "object" && definition.type) {
117122
+ const alreadyRegistered = registry2.scorers.find((s) => s.name === definition.name);
117123
+ if (!alreadyRegistered) {
117124
+ registry2.scorers.push(definition);
117125
+ }
117126
+ return {
117127
+ type: "custom_scorer",
117128
+ slug: definition.slug || definition.name.toLowerCase(),
117129
+ fingerprint: definition.fingerprint
117130
+ };
117131
+ }
117091
117132
  const registered = registry2.scorers.find((s) => s.name === scorerName);
117092
117133
  if (registered) {
117093
117134
  if (registered.type === "custom_scorer") {
@@ -117108,8 +117149,8 @@ async function serializeScorer(scorer, evalName, index) {
117108
117149
  fingerprint: registered.fingerprint
117109
117150
  };
117110
117151
  }
117111
- const inlineName = `${evalName}_Scorer_${index}`;
117112
- const slug = inlineName.toLowerCase().replace(/_/g, "-");
117152
+ const inlineName = scorerName || `${evalName}_Scorer_${index}`;
117153
+ const slug = inlineName.toLowerCase().replace(/[_\s]+/g, "-");
117113
117154
  const handlerName = `${inlineName}Handler`;
117114
117155
  registerHandler(handlerName, scorer, {
117115
117156
  location: "eval",
@@ -117626,8 +117667,37 @@ var traced = {
117626
117667
  };
117627
117668
 
117628
117669
  // src/evaluator/core.ts
/**
 * Serializes a value to a canonical string suitable for checksumming.
 *
 * Object keys are emitted in sorted order, so two structurally equal values
 * produce the same string regardless of property insertion order. The output
 * is NOT JSON and is not meant to be parsed back.
 *
 * Encoding:
 * - `null` -> `null`; `undefined` -> `undefined`
 * - strings -> JSON-quoted; numbers and booleans -> `String(value)`
 * - valid `Date` -> unquoted ISO-8601 string; invalid `Date` -> `Invalid Date`
 * - arrays -> `[a,b,...]`
 * - other objects -> `{"key":value,...}` over sorted own enumerable string keys
 * - anything else (functions, symbols, bigints) -> `String(value)`
 * - a reference back to an object that is currently being serialized
 *   (a cycle) -> `[Circular]`
 *
 * @param obj - Value to serialize.
 * @returns Canonical string form of `obj`.
 */
function deterministicStringify(obj) {
  // Objects on the current descent path. Only used to detect cycles; an object
  // that is merely referenced twice (not nested in itself) is serialized twice.
  const ancestors = new Set();

  function render(value) {
    if (value === null)
      return "null";
    if (value === undefined)
      return "undefined";
    if (typeof value === "string")
      return JSON.stringify(value);
    if (typeof value === "number" || typeof value === "boolean")
      return String(value);
    if (value instanceof Date) {
      // toISOString() throws RangeError on an invalid Date; emit a fixed token
      // instead so a single bad timestamp cannot abort checksum generation.
      return Number.isNaN(value.getTime()) ? "Invalid Date" : value.toISOString();
    }
    if (typeof value !== "object")
      return String(value);
    if (ancestors.has(value)) {
      // Unquoted on purpose: the string "[Circular]" serializes with quotes,
      // so the two cannot be confused.
      return "[Circular]";
    }
    ancestors.add(value);
    try {
      if (Array.isArray(value)) {
        // Explicit arrow keeps map's (index, array) arguments out of render.
        const items = value.map((item) => render(item));
        return `[${items.join(",")}]`;
      }
      const pairs = Object.keys(value)
        .sort()
        .map((key) => `${JSON.stringify(key)}:${render(value[key])}`);
      return `{${pairs.join(",")}}`;
    } finally {
      // Pop from the path so sibling references to the same object still
      // serialize in full.
      ancestors.delete(value);
    }
  }

  return render(obj);
}
117629
117692
  function generateTestCasesChecksum(testCases) {
117630
- const content = JSON.stringify(testCases, null, 0);
117693
+ const stableData = testCases.map((testCase) => ({
117694
+ id: testCase.id,
117695
+ input: testCase.input,
117696
+ output: testCase.output,
117697
+ expected: testCase.expected,
117698
+ ctx: testCase.ctx
117699
+ }));
117700
+ const content = deterministicStringify(stableData);
117631
117701
  return createHash2("sha256").update(content).digest("hex");
117632
117702
  }
117633
117703
  function generateCaseId(dataCase) {
@@ -117745,10 +117815,20 @@ function getEvaluationOptions(name, options, state) {
117745
117815
  const returnResults = merged.returnResults ?? true;
117746
117816
  const scorerFailAsZero = merged.scorerFailAsZero ?? false;
117747
117817
  const progress = !jsonl && createProgressTracker ? createProgressTracker(name) : new NullProgressTracker;
117748
- const batchClient = merged.batchClient || new BatchClient;
117749
- const evalClient = merged.evalClient || new EvalClient({
117750
- batchClient
117751
- });
117818
+ const execute = merged.execute ?? "batch";
117819
+ let batchClient;
117820
+ let evalClient;
117821
+ let openaiClient;
117822
+ if (execute === "batch") {
117823
+ batchClient = merged.batchClient || new BatchClient;
117824
+ evalClient = merged.evalClient || new EvalClient({
117825
+ batchClient
117826
+ });
117827
+ } else {
117828
+ const OpenAI4 = __require("openai").default;
117829
+ openaiClient = merged.openaiClient || new OpenAI4;
117830
+ }
117831
+ const onResult = merged.onResult;
117752
117832
  return {
117753
117833
  state,
117754
117834
  reporter,
@@ -117759,12 +117839,15 @@ function getEvaluationOptions(name, options, state) {
117759
117839
  progress,
117760
117840
  createProgressTracker: createProgressTracker || (() => new NullProgressTracker),
117761
117841
  onPendingBatch,
117842
+ execute,
117762
117843
  batchClient,
117763
- evalClient
117844
+ evalClient,
117845
+ openaiClient,
117846
+ onResult
117764
117847
  };
117765
117848
  }
117766
117849
  async function evaluateDataCase(dataCase, evaluator, _ctx, options) {
117767
- const evalClient = options.evalClient;
117850
+ const client2 = options.execute === "sync" ? options.openaiClient : options.evalClient;
117768
117851
  const caseId = generateCaseId(dataCase);
117769
117852
  const dataCaseCtx = dataCase.ctx;
117770
117853
  const ctx = _ctx.mutate({
@@ -117787,7 +117870,7 @@ async function evaluateDataCase(dataCase, evaluator, _ctx, options) {
117787
117870
  expected: dataCase.expected,
117788
117871
  tags: dataCase.tags,
117789
117872
  id: caseId,
117790
- client: evalClient
117873
+ client: client2
117791
117874
  };
117792
117875
  const fieldsToExclude = new Set(["state", "ctx", "tags", "id", "client"]);
117793
117876
  const argsForStorage = Object.fromEntries(Object.entries(scorerArgs).filter(([key]) => !fieldsToExclude.has(key)));
@@ -117801,7 +117884,8 @@ async function evaluateDataCase(dataCase, evaluator, _ctx, options) {
117801
117884
  tags: dataCase.tags,
117802
117885
  id: caseId,
117803
117886
  hasPendingBatch: result.hasPendingBatch,
117804
- error: result.error
117887
+ error: result.error,
117888
+ ctx: argsForStorage
117805
117889
  };
117806
117890
  if (verbose) {
117807
117891
  console.dir(result.scores, { depth: null });
@@ -117817,7 +117901,25 @@ function getScorerName(scorer2, index) {
117817
117901
  return scorer2;
117818
117902
  }
117819
117903
  if (typeof scorer2 === "function") {
117820
- return scorer2.name;
117904
+ return scorer2.name || `Scorer ${index ?? 0}`;
117905
+ }
117906
+ if (scorer2 && typeof scorer2 === "object") {
117907
+ if (scorer2.type === "internal" && scorer2.name) {
117908
+ return scorer2.name;
117909
+ }
117910
+ if (scorer2.type === "custom_scorer" && scorer2.slug) {
117911
+ try {
117912
+ const registry2 = getRegistry2();
117913
+ const scorerDef = registry2.scorers.find((s) => s.slug === scorer2.slug);
117914
+ if (scorerDef && scorerDef.name) {
117915
+ return scorerDef.name;
117916
+ }
117917
+ } catch (e) {}
117918
+ return scorer2.slug.split("-").map((word) => word.charAt(0).toUpperCase() + word.slice(1)).join(" ");
117919
+ }
117920
+ if (scorer2.name) {
117921
+ return scorer2.name;
117922
+ }
117821
117923
  }
117822
117924
  return `Scorer ${index ?? 0}`;
117823
117925
  }
@@ -118011,7 +118113,15 @@ async function executeScorer(scorer2, scorerArgs) {
118011
118113
  return scorerFn(scorerArgs);
118012
118114
  }
118013
118115
  async function runEval(name, evaluator, options) {
118014
- const { state, verbose, jsonl, returnResults, progress, batchClient } = options;
118116
+ const {
118117
+ state,
118118
+ verbose,
118119
+ jsonl,
118120
+ returnResults,
118121
+ progress,
118122
+ execute,
118123
+ batchClient
118124
+ } = options;
118015
118125
  const ctx = init({
118016
118126
  ...evaluator.ctx ?? {},
118017
118127
  state,
@@ -118023,9 +118133,11 @@ async function runEval(name, evaluator, options) {
118023
118133
  [ATTR_DATASET_SOURCE_TYPE]: data.ctx.sourceType,
118024
118134
  [ATTR_EVAL_EXECUTION_METADATA_TEST_CASE_COUNT]: data.ctx.caseCount
118025
118135
  });
118026
- await batchClient.initialize(name, data.ctx.checksum, data.ctx.dataSourceType);
118027
- if (batchClient.hasPendingBatch) {
118028
- return batchClient.getPending();
118136
+ if (execute === "batch" && batchClient) {
118137
+ await batchClient.initialize(name, data.ctx.checksum, data.ctx.dataSourceType);
118138
+ if (batchClient.hasPendingBatch) {
118139
+ return batchClient.getPending();
118140
+ }
118029
118141
  }
118030
118142
  const results = [];
118031
118143
  const errors6 = new Map;
@@ -118036,6 +118148,9 @@ async function runEval(name, evaluator, options) {
118036
118148
  }), options);
118037
118149
  results.push(caseResult.result);
118038
118150
  aggregateScorerErrors(errors6, caseResult.errors);
118151
+ if (execute === "sync" && options.onResult) {
118152
+ await options.onResult(caseResult.result);
118153
+ }
118039
118154
  if (!jsonl) {
118040
118155
  progress.increment();
118041
118156
  }
@@ -118044,9 +118159,12 @@ async function runEval(name, evaluator, options) {
118044
118159
  if (!jsonl) {
118045
118160
  progress.stop();
118046
118161
  }
118047
- const batch = await batchClient.submit();
118048
- if (verbose) {
118049
- console.log(batch);
118162
+ let batch;
118163
+ if (execute === "batch" && batchClient) {
118164
+ batch = await batchClient.submit();
118165
+ if (verbose) {
118166
+ console.log(batch);
118167
+ }
118050
118168
  }
118051
118169
  const summary = createEvaluationSummary(name, results, errors6);
118052
118170
  const result = {
@@ -118262,12 +118380,15 @@ class ScorerRegistry {
118262
118380
  }
118263
118381
  return buildPromptScorer({
118264
118382
  name: config2.name,
118383
+ slug: config2.slug,
118384
+ description: config2.description,
118265
118385
  promptTemplate: config2.promptTemplate,
118266
118386
  choiceScores: config2.choiceScores,
118267
118387
  model: config2.model,
118268
118388
  useCoT: config2.useCoT,
118269
118389
  temperature: config2.temperature,
118270
- parameters: config2.parameters
118390
+ parameters: config2.parameters,
118391
+ metadata: config2.metadata
118271
118392
  });
118272
118393
  }
118273
118394
  createCustomScorer(config2, inLazyMode) {
@@ -118430,4 +118551,4 @@ export {
118430
118551
  Behavioral
118431
118552
  };
118432
118553
 
118433
- //# debugId=7CBAC3512DE449E364756E2164756E21
118554
+ //# debugId=5F5616B0EFE8647164756E2164756E21