npm - opik - Versions diffs - 2.0.26 → 2.0.28 - Mend

opik 2.0.26 → 2.0.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/dist/index.d.cts CHANGED Viewed

@@ -6255,9 +6255,11 @@ interface ServiceTogglesConfig {
     ollamaProviderEnabled: boolean;
     collaboratorsTabEnabled: boolean;
     v2WorkspaceAllowlistIds: string[];
+    v1WorkspaceAllowlistIds: string[];
     forceWorkspaceVersion: string;
     defaultPageSize?: number;
     v2WorkspaceAllowlist?: string;
+    v1WorkspaceAllowlist?: string;
 }
 interface Span$1 {
@@ -12943,7 +12945,11 @@ type EvaluationResult = {
     experimentId: string;
     /** Name of the experiment */
     experimentName?: string;
-    /** Test results for all evaluated items */
+    /**
+     * Test results for all evaluated items, including failed ones.
+     * Items whose task threw will have a synthetic score named
+     * {@link TASK_ERROR_SCORE_NAME} with `scoringFailed: true`.
+     */
     testResults: EvaluationTestResult[];
     /** Optional URL to view detailed results in the Opik platform */
     resultUrl?: string;
@@ -12963,6 +12969,18 @@ type EvaluationError = {
     /** Original error object, if available */
     error?: Error;
 };
+/**
+ * Reserved score name injected into failed task runs.
+ *
+ * When a task throws, the engine adds a synthetic score with this name and
+ * `scoringFailed: true` so failed items remain visible in experiment results.
+ * Consumers can filter on this name to distinguish task-level failures from
+ * real metric scores.
+ *
+ * Note: coordinate with the Python SDK before renaming — picking a stable,
+ * collision-resistant name (OPIK-6437).
+ */
+declare const TASK_ERROR_SCORE_NAME = "__opik_task_error__";
 /**
  * Represents the result of a metric calculation.
  */
@@ -12973,7 +12991,12 @@ type EvaluationScoreResult = {
     value: number;
     /** Optional reason for the score */
     reason?: string;
-    /** Whether the scoring failed */
+    /**
+     * Whether the scoring failed due to a task-level error rather than a metric
+     * failure. When `true`, `name` will equal {@link TASK_ERROR_SCORE_NAME},
+     * which is a reserved name injected by the engine — user-defined metrics
+     * should never produce a score with that name.
+     */
     scoringFailed?: boolean;
     /** Optional category name for grouping scores */
     categoryName?: string;
@@ -16456,4 +16479,4 @@ interface DistributedTraceHeaders {
  */
 declare function getDistributedTraceHeaders(): DistributedTraceHeaders | null;
-export { AgentTaskCompletionJudge, AgentToolCorrectnessJudge, type AllProviderOptions, AnnotationQueuePublicScope as AnnotationQueueScope, AnswerRelevance, type AnthropicProviderOptions, BaseLLMJudgeMetric, BaseMetric, BaseSuiteEvaluator, ChatPrompt, ComplianceRiskJudge, type Config, ConfigMismatchError, ConfigNotFoundError, Contains, type CreateTestSuiteOptions, DEFAULT_EXECUTION_POLICY, Dataset, type DatasetPublic, DatasetVersion, DatasetVersionNotFoundError, type DatasetVersionPublic, DemographicBiasJudge, DialogueHelpfulnessJudge, type DistributedTraceHeaders, type ErrorInfo, type EvaluateOptions, type EvaluatePromptOptions, type EvaluateTestSuiteOptions, type EvaluationError, type EvaluationResult, type EvaluationScoreResult, type EvaluationTask, type EvaluationTestCase, type EvaluationTestResult, ExactMatch, type ExecutionPolicy, type FeedbackScoreData, type FewShotExampleAnswerRelevanceNoContext, type FewShotExampleAnswerRelevanceWithContext, type FewShotExampleHallucination, type FewShotExampleModeration, type FilterExpression, GEval, GEvalPreset, GenderBiasJudge, type GoogleProviderOptions, Hallucination, IsJson, type ItemResult, LLMJudge, type LLMJudgeConfig, type LLMJudgeModelSettings, type LLMJudgeOptions, type LLMJudgeResponseFormat, ModelConfigurationError, ModelError, ModelGenerationError, Moderation, OPIK_PARENT_SPAN_ID_HEADER, OPIK_TRACE_ID_HEADER, type OpenAIProviderOptions, OpikClient as Opik, type OpikAssistantMessage, OpikBaseModel, type OpikConfig, type OpikMessage, OpikQueryLanguage, SpanType as OpikSpanType, type OpikSystemMessage, type OpikToolMessage, type OpikUserMessage, type Param, PoliticalBiasJudge, Prompt, PromptType, PromptUncertaintyJudge, type ProviderOptionsForModel, QARelevanceJudge, type RawTestSuiteItem, RegexMatch, RegionalBiasJudge, type RegistryEntry, ReligiousBiasJudge, ResponseSchema, type RunTestsOptions, SYSTEM_PROMPT, type ScoringKeyMappingType, Span, SpanType, SummarizationCoherenceJudge, SummarizationConsistencyJudge, type SupportedModelId, TestSuite, type TestSuiteItem, TestSuiteResult, ThreadsAnnotationQueue, Trace, TracesAnnotationQueue, USER_PROMPT_TEMPLATE, type UpdateTestSuiteItem, type UpdateTestSuiteOptions, Usefulness, VercelAIChatModel, activateRunner, agentConfigContext, buildSuiteResult, createModel, createModelFromInstance, deserializeEvaluators, detectProvider, disableLogger, evaluate, evaluatePrompt, evaluateTestSuite, flushAll, generateId, getDistributedTraceHeaders, getGlobalClient, getTrackContext, logger, resetGlobalClient, resolveEvaluators, resolveExecutionPolicy, resolveItemExecutionPolicy, resolveModel, runTests, serializeEvaluators, setGlobalClient, setLoggerLevel, track, validateEvaluators, validateExecutionPolicy };
+export { AgentTaskCompletionJudge, AgentToolCorrectnessJudge, type AllProviderOptions, AnnotationQueuePublicScope as AnnotationQueueScope, AnswerRelevance, type AnthropicProviderOptions, BaseLLMJudgeMetric, BaseMetric, BaseSuiteEvaluator, ChatPrompt, ComplianceRiskJudge, type Config, ConfigMismatchError, ConfigNotFoundError, Contains, type CreateTestSuiteOptions, DEFAULT_EXECUTION_POLICY, Dataset, type DatasetPublic, DatasetVersion, DatasetVersionNotFoundError, type DatasetVersionPublic, DemographicBiasJudge, DialogueHelpfulnessJudge, type DistributedTraceHeaders, type ErrorInfo, type EvaluateOptions, type EvaluatePromptOptions, type EvaluateTestSuiteOptions, type EvaluationError, type EvaluationResult, type EvaluationScoreResult, type EvaluationTask, type EvaluationTestCase, type EvaluationTestResult, ExactMatch, type ExecutionPolicy, type FeedbackScoreData, type FewShotExampleAnswerRelevanceNoContext, type FewShotExampleAnswerRelevanceWithContext, type FewShotExampleHallucination, type FewShotExampleModeration, type FilterExpression, GEval, GEvalPreset, GenderBiasJudge, type GoogleProviderOptions, Hallucination, IsJson, type ItemResult, LLMJudge, type LLMJudgeConfig, type LLMJudgeModelSettings, type LLMJudgeOptions, type LLMJudgeResponseFormat, ModelConfigurationError, ModelError, ModelGenerationError, Moderation, OPIK_PARENT_SPAN_ID_HEADER, OPIK_TRACE_ID_HEADER, type OpenAIProviderOptions, OpikClient as Opik, type OpikAssistantMessage, OpikBaseModel, type OpikConfig, type OpikMessage, OpikQueryLanguage, SpanType as OpikSpanType, type OpikSystemMessage, type OpikToolMessage, type OpikUserMessage, type Param, PoliticalBiasJudge, Prompt, PromptType, PromptUncertaintyJudge, type ProviderOptionsForModel, QARelevanceJudge, type RawTestSuiteItem, RegexMatch, RegionalBiasJudge, type RegistryEntry, ReligiousBiasJudge, ResponseSchema, type RunTestsOptions, SYSTEM_PROMPT, type ScoringKeyMappingType, Span, SpanType, SummarizationCoherenceJudge, SummarizationConsistencyJudge, type SupportedModelId, TASK_ERROR_SCORE_NAME, TestSuite, type TestSuiteItem, TestSuiteResult, ThreadsAnnotationQueue, Trace, TracesAnnotationQueue, USER_PROMPT_TEMPLATE, type UpdateTestSuiteItem, type UpdateTestSuiteOptions, Usefulness, VercelAIChatModel, activateRunner, agentConfigContext, buildSuiteResult, createModel, createModelFromInstance, deserializeEvaluators, detectProvider, disableLogger, evaluate, evaluatePrompt, evaluateTestSuite, flushAll, generateId, getDistributedTraceHeaders, getGlobalClient, getTrackContext, logger, resetGlobalClient, resolveEvaluators, resolveExecutionPolicy, resolveItemExecutionPolicy, resolveModel, runTests, serializeEvaluators, setGlobalClient, setLoggerLevel, track, validateEvaluators, validateExecutionPolicy };

package/dist/index.d.ts CHANGED Viewed

@@ -6255,9 +6255,11 @@ interface ServiceTogglesConfig {
     ollamaProviderEnabled: boolean;
     collaboratorsTabEnabled: boolean;
     v2WorkspaceAllowlistIds: string[];
+    v1WorkspaceAllowlistIds: string[];
     forceWorkspaceVersion: string;
     defaultPageSize?: number;
     v2WorkspaceAllowlist?: string;
+    v1WorkspaceAllowlist?: string;
 }
 interface Span$1 {
@@ -12943,7 +12945,11 @@ type EvaluationResult = {
     experimentId: string;
     /** Name of the experiment */
     experimentName?: string;
-    /** Test results for all evaluated items */
+    /**
+     * Test results for all evaluated items, including failed ones.
+     * Items whose task threw will have a synthetic score named
+     * {@link TASK_ERROR_SCORE_NAME} with `scoringFailed: true`.
+     */
     testResults: EvaluationTestResult[];
     /** Optional URL to view detailed results in the Opik platform */
     resultUrl?: string;
@@ -12963,6 +12969,18 @@ type EvaluationError = {
     /** Original error object, if available */
     error?: Error;
 };
+/**
+ * Reserved score name injected into failed task runs.
+ *
+ * When a task throws, the engine adds a synthetic score with this name and
+ * `scoringFailed: true` so failed items remain visible in experiment results.
+ * Consumers can filter on this name to distinguish task-level failures from
+ * real metric scores.
+ *
+ * Note: coordinate with the Python SDK before renaming — picking a stable,
+ * collision-resistant name (OPIK-6437).
+ */
+declare const TASK_ERROR_SCORE_NAME = "__opik_task_error__";
 /**
  * Represents the result of a metric calculation.
  */
@@ -12973,7 +12991,12 @@ type EvaluationScoreResult = {
     value: number;
     /** Optional reason for the score */
     reason?: string;
-    /** Whether the scoring failed */
+    /**
+     * Whether the scoring failed due to a task-level error rather than a metric
+     * failure. When `true`, `name` will equal {@link TASK_ERROR_SCORE_NAME},
+     * which is a reserved name injected by the engine — user-defined metrics
+     * should never produce a score with that name.
+     */
     scoringFailed?: boolean;
     /** Optional category name for grouping scores */
     categoryName?: string;
@@ -16456,4 +16479,4 @@ interface DistributedTraceHeaders {
  */
 declare function getDistributedTraceHeaders(): DistributedTraceHeaders | null;
-export { AgentTaskCompletionJudge, AgentToolCorrectnessJudge, type AllProviderOptions, AnnotationQueuePublicScope as AnnotationQueueScope, AnswerRelevance, type AnthropicProviderOptions, BaseLLMJudgeMetric, BaseMetric, BaseSuiteEvaluator, ChatPrompt, ComplianceRiskJudge, type Config, ConfigMismatchError, ConfigNotFoundError, Contains, type CreateTestSuiteOptions, DEFAULT_EXECUTION_POLICY, Dataset, type DatasetPublic, DatasetVersion, DatasetVersionNotFoundError, type DatasetVersionPublic, DemographicBiasJudge, DialogueHelpfulnessJudge, type DistributedTraceHeaders, type ErrorInfo, type EvaluateOptions, type EvaluatePromptOptions, type EvaluateTestSuiteOptions, type EvaluationError, type EvaluationResult, type EvaluationScoreResult, type EvaluationTask, type EvaluationTestCase, type EvaluationTestResult, ExactMatch, type ExecutionPolicy, type FeedbackScoreData, type FewShotExampleAnswerRelevanceNoContext, type FewShotExampleAnswerRelevanceWithContext, type FewShotExampleHallucination, type FewShotExampleModeration, type FilterExpression, GEval, GEvalPreset, GenderBiasJudge, type GoogleProviderOptions, Hallucination, IsJson, type ItemResult, LLMJudge, type LLMJudgeConfig, type LLMJudgeModelSettings, type LLMJudgeOptions, type LLMJudgeResponseFormat, ModelConfigurationError, ModelError, ModelGenerationError, Moderation, OPIK_PARENT_SPAN_ID_HEADER, OPIK_TRACE_ID_HEADER, type OpenAIProviderOptions, OpikClient as Opik, type OpikAssistantMessage, OpikBaseModel, type OpikConfig, type OpikMessage, OpikQueryLanguage, SpanType as OpikSpanType, type OpikSystemMessage, type OpikToolMessage, type OpikUserMessage, type Param, PoliticalBiasJudge, Prompt, PromptType, PromptUncertaintyJudge, type ProviderOptionsForModel, QARelevanceJudge, type RawTestSuiteItem, RegexMatch, RegionalBiasJudge, type RegistryEntry, ReligiousBiasJudge, ResponseSchema, type RunTestsOptions, SYSTEM_PROMPT, type ScoringKeyMappingType, Span, SpanType, SummarizationCoherenceJudge, SummarizationConsistencyJudge, type SupportedModelId, TestSuite, type TestSuiteItem, TestSuiteResult, ThreadsAnnotationQueue, Trace, TracesAnnotationQueue, USER_PROMPT_TEMPLATE, type UpdateTestSuiteItem, type UpdateTestSuiteOptions, Usefulness, VercelAIChatModel, activateRunner, agentConfigContext, buildSuiteResult, createModel, createModelFromInstance, deserializeEvaluators, detectProvider, disableLogger, evaluate, evaluatePrompt, evaluateTestSuite, flushAll, generateId, getDistributedTraceHeaders, getGlobalClient, getTrackContext, logger, resetGlobalClient, resolveEvaluators, resolveExecutionPolicy, resolveItemExecutionPolicy, resolveModel, runTests, serializeEvaluators, setGlobalClient, setLoggerLevel, track, validateEvaluators, validateExecutionPolicy };
+export { AgentTaskCompletionJudge, AgentToolCorrectnessJudge, type AllProviderOptions, AnnotationQueuePublicScope as AnnotationQueueScope, AnswerRelevance, type AnthropicProviderOptions, BaseLLMJudgeMetric, BaseMetric, BaseSuiteEvaluator, ChatPrompt, ComplianceRiskJudge, type Config, ConfigMismatchError, ConfigNotFoundError, Contains, type CreateTestSuiteOptions, DEFAULT_EXECUTION_POLICY, Dataset, type DatasetPublic, DatasetVersion, DatasetVersionNotFoundError, type DatasetVersionPublic, DemographicBiasJudge, DialogueHelpfulnessJudge, type DistributedTraceHeaders, type ErrorInfo, type EvaluateOptions, type EvaluatePromptOptions, type EvaluateTestSuiteOptions, type EvaluationError, type EvaluationResult, type EvaluationScoreResult, type EvaluationTask, type EvaluationTestCase, type EvaluationTestResult, ExactMatch, type ExecutionPolicy, type FeedbackScoreData, type FewShotExampleAnswerRelevanceNoContext, type FewShotExampleAnswerRelevanceWithContext, type FewShotExampleHallucination, type FewShotExampleModeration, type FilterExpression, GEval, GEvalPreset, GenderBiasJudge, type GoogleProviderOptions, Hallucination, IsJson, type ItemResult, LLMJudge, type LLMJudgeConfig, type LLMJudgeModelSettings, type LLMJudgeOptions, type LLMJudgeResponseFormat, ModelConfigurationError, ModelError, ModelGenerationError, Moderation, OPIK_PARENT_SPAN_ID_HEADER, OPIK_TRACE_ID_HEADER, type OpenAIProviderOptions, OpikClient as Opik, type OpikAssistantMessage, OpikBaseModel, type OpikConfig, type OpikMessage, OpikQueryLanguage, SpanType as OpikSpanType, type OpikSystemMessage, type OpikToolMessage, type OpikUserMessage, type Param, PoliticalBiasJudge, Prompt, PromptType, PromptUncertaintyJudge, type ProviderOptionsForModel, QARelevanceJudge, type RawTestSuiteItem, RegexMatch, RegionalBiasJudge, type RegistryEntry, ReligiousBiasJudge, ResponseSchema, type RunTestsOptions, SYSTEM_PROMPT, type ScoringKeyMappingType, Span, SpanType, SummarizationCoherenceJudge, SummarizationConsistencyJudge, type SupportedModelId, TASK_ERROR_SCORE_NAME, TestSuite, type TestSuiteItem, TestSuiteResult, ThreadsAnnotationQueue, Trace, TracesAnnotationQueue, USER_PROMPT_TEMPLATE, type UpdateTestSuiteItem, type UpdateTestSuiteOptions, Usefulness, VercelAIChatModel, activateRunner, agentConfigContext, buildSuiteResult, createModel, createModelFromInstance, deserializeEvaluators, detectProvider, disableLogger, evaluate, evaluatePrompt, evaluateTestSuite, flushAll, generateId, getDistributedTraceHeaders, getGlobalClient, getTrackContext, logger, resetGlobalClient, resolveEvaluators, resolveExecutionPolicy, resolveItemExecutionPolicy, resolveModel, runTests, serializeEvaluators, setGlobalClient, setLoggerLevel, track, validateEvaluators, validateExecutionPolicy };

package/dist/index.js CHANGED Viewed

	@@ -1 +1 @@
1	- import {Ea}from'./chunk-~~4SYOSTVG~~.js';export{ra as AgentTaskCompletionJudge,qa as AgentToolCorrectnessJudge,ea as AnswerRelevance,aa as BaseLLMJudgeMetric,B as BaseMetric,C as BaseSuiteEvaluator,p as ChatPrompt,ta as ComplianceRiskJudge,i as ConfigMismatchError,h as ConfigNotFoundError,Z as Contains,y as DEFAULT_EXECUTION_POLICY,j as Dataset,f as DatasetVersion,g as DatasetVersionNotFoundError,la as DemographicBiasJudge,ja as DialogueHelpfulnessJudge,Y as ExactMatch,fa as GEval,ga as GEvalPreset,na as GenderBiasJudge,da as Hallucination,$ as IsJson,P as LLMJudge,G as ModelConfigurationError,E as ModelError,F as ModelGenerationError,ba as Moderation,Ba as OPIK_PARENT_SPAN_ID_HEADER,Aa as OPIK_TRACE_ID_HEADER,za as Opik,D as OpikBaseModel,q as OpikQueryLanguage,d as OpikSpanType,ma as PoliticalBiasJudge,o as Prompt,k as PromptType,sa as PromptUncertaintyJudge,ka as QARelevanceJudge,_ as RegexMatch,pa as RegionalBiasJudge,oa as ReligiousBiasJudge,O as ResponseSchema,M as SYSTEM_PROMPT,ia as SummarizationCoherenceJudge,ha as SummarizationConsistencyJudge,xa as TestSuite,z as TestSuiteResult,s as ThreadsAnnotationQueue,r as TracesAnnotationQueue,N as USER_PROMPT_TEMPLATE,ca as Usefulness,I as VercelAIChatModel,v as activateRunner,t as agentConfigContext,A as buildSuiteResult,J as createModel,K as createModelFromInstance,R as deserializeEvaluators,H as detectProvider,c as disableLogger,W as evaluate,X as evaluatePrompt,U as evaluateTestSuite,u as flushAll,e as generateId,Ca as getDistributedTraceHeaders,l as getGlobalClient,w as getTrackContext,a as logger,n as resetGlobalClient,ua as resolveEvaluators,S as resolveExecutionPolicy,T as resolveItemExecutionPolicy,L as resolveModel,V as runTests,Q as serializeEvaluators,m as setGlobalClient,b as setLoggerLevel,x as track,va as validateEvaluators,wa as validateExecutionPolicy,Da as z}from'./chunk-~~4SYOSTVG~~.js';Ea();
1	+ import {Fa}from'./chunk-MGI4PR45.js';export{sa as AgentTaskCompletionJudge,ra as AgentToolCorrectnessJudge,fa as AnswerRelevance,ba as BaseLLMJudgeMetric,C as BaseMetric,D as BaseSuiteEvaluator,p as ChatPrompt,ua as ComplianceRiskJudge,i as ConfigMismatchError,h as ConfigNotFoundError,_ as Contains,y as DEFAULT_EXECUTION_POLICY,j as Dataset,f as DatasetVersion,g as DatasetVersionNotFoundError,ma as DemographicBiasJudge,ka as DialogueHelpfulnessJudge,Z as ExactMatch,ga as GEval,ha as GEvalPreset,oa as GenderBiasJudge,ea as Hallucination,aa as IsJson,Q as LLMJudge,H as ModelConfigurationError,F as ModelError,G as ModelGenerationError,ca as Moderation,Ca as OPIK_PARENT_SPAN_ID_HEADER,Ba as OPIK_TRACE_ID_HEADER,Aa as Opik,E as OpikBaseModel,q as OpikQueryLanguage,d as OpikSpanType,na as PoliticalBiasJudge,o as Prompt,k as PromptType,ta as PromptUncertaintyJudge,la as QARelevanceJudge,$ as RegexMatch,qa as RegionalBiasJudge,pa as ReligiousBiasJudge,P as ResponseSchema,N as SYSTEM_PROMPT,ja as SummarizationCoherenceJudge,ia as SummarizationConsistencyJudge,B as TASK_ERROR_SCORE_NAME,ya as TestSuite,z as TestSuiteResult,s as ThreadsAnnotationQueue,r as TracesAnnotationQueue,O as USER_PROMPT_TEMPLATE,da as Usefulness,J as VercelAIChatModel,v as activateRunner,t as agentConfigContext,A as buildSuiteResult,K as createModel,L as createModelFromInstance,S as deserializeEvaluators,I as detectProvider,c as disableLogger,X as evaluate,Y as evaluatePrompt,V as evaluateTestSuite,u as flushAll,e as generateId,Da as getDistributedTraceHeaders,l as getGlobalClient,w as getTrackContext,a as logger,n as resetGlobalClient,va as resolveEvaluators,T as resolveExecutionPolicy,U as resolveItemExecutionPolicy,M as resolveModel,W as runTests,R as serializeEvaluators,m as setGlobalClient,b as setLoggerLevel,x as track,wa as validateEvaluators,xa as validateExecutionPolicy,Ea as z}from'./chunk-MGI4PR45.js';Fa();

package/dist/suite-IS535BRQ.js ADDED Viewed

	@@ -0,0 +1 @@
1	+ import {za}from'./chunk-MGI4PR45.js';export{y as DEFAULT_EXECUTION_POLICY,ya as TestSuite,z as TestSuiteResult,A as buildSuiteResult,S as deserializeEvaluators,V as evaluateTestSuite,T as resolveExecutionPolicy,U as resolveItemExecutionPolicy,W as runTests,R as serializeEvaluators}from'./chunk-MGI4PR45.js';za();

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "opik",
   "description": "Opik TypeScript and JavaScript SDK",
-  "version": "2.0.26",
+  "version": "2.0.28",
   "repository": {
     "type": "git",
     "url": "git+https://github.com/comet-ml/opik.git",