opik 2.0.26 → 2.0.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-MGI4PR45.js +404 -0
- package/dist/index.cjs +36 -36
- package/dist/index.d.cts +26 -3
- package/dist/index.d.ts +26 -3
- package/dist/index.js +1 -1
- package/dist/suite-IS535BRQ.js +1 -0
- package/package.json +1 -1
- package/dist/chunk-4SYOSTVG.js +0 -404
- package/dist/suite-BZFAPKHI.js +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -6255,9 +6255,11 @@ interface ServiceTogglesConfig {
|
|
|
6255
6255
|
ollamaProviderEnabled: boolean;
|
|
6256
6256
|
collaboratorsTabEnabled: boolean;
|
|
6257
6257
|
v2WorkspaceAllowlistIds: string[];
|
|
6258
|
+
v1WorkspaceAllowlistIds: string[];
|
|
6258
6259
|
forceWorkspaceVersion: string;
|
|
6259
6260
|
defaultPageSize?: number;
|
|
6260
6261
|
v2WorkspaceAllowlist?: string;
|
|
6262
|
+
v1WorkspaceAllowlist?: string;
|
|
6261
6263
|
}
|
|
6262
6264
|
|
|
6263
6265
|
interface Span$1 {
|
|
@@ -12943,7 +12945,11 @@ type EvaluationResult = {
|
|
|
12943
12945
|
experimentId: string;
|
|
12944
12946
|
/** Name of the experiment */
|
|
12945
12947
|
experimentName?: string;
|
|
12946
|
-
/**
|
|
12948
|
+
/**
|
|
12949
|
+
* Test results for all evaluated items, including failed ones.
|
|
12950
|
+
* Items whose task threw will have a synthetic score named
|
|
12951
|
+
* {@link TASK_ERROR_SCORE_NAME} with `scoringFailed: true`.
|
|
12952
|
+
*/
|
|
12947
12953
|
testResults: EvaluationTestResult[];
|
|
12948
12954
|
/** Optional URL to view detailed results in the Opik platform */
|
|
12949
12955
|
resultUrl?: string;
|
|
@@ -12963,6 +12969,18 @@ type EvaluationError = {
|
|
|
12963
12969
|
/** Original error object, if available */
|
|
12964
12970
|
error?: Error;
|
|
12965
12971
|
};
|
|
12972
|
+
/**
|
|
12973
|
+
* Reserved score name injected into failed task runs.
|
|
12974
|
+
*
|
|
12975
|
+
* When a task throws, the engine adds a synthetic score with this name and
|
|
12976
|
+
* `scoringFailed: true` so failed items remain visible in experiment results.
|
|
12977
|
+
* Consumers can filter on this name to distinguish task-level failures from
|
|
12978
|
+
* real metric scores.
|
|
12979
|
+
*
|
|
12980
|
+
* Note: coordinate with the Python SDK before renaming; the goal is a stable,
|
|
12981
|
+
* collision-resistant name (OPIK-6437).
|
|
12982
|
+
*/
|
|
12983
|
+
declare const TASK_ERROR_SCORE_NAME = "__opik_task_error__";
|
|
12966
12984
|
/**
|
|
12967
12985
|
* Represents the result of a metric calculation.
|
|
12968
12986
|
*/
|
|
@@ -12973,7 +12991,12 @@ type EvaluationScoreResult = {
|
|
|
12973
12991
|
value: number;
|
|
12974
12992
|
/** Optional reason for the score */
|
|
12975
12993
|
reason?: string;
|
|
12976
|
-
/**
|
|
12994
|
+
/**
|
|
12995
|
+
* Whether the scoring failed due to a task-level error rather than a metric
|
|
12996
|
+
* failure. When `true`, `name` will equal {@link TASK_ERROR_SCORE_NAME},
|
|
12997
|
+
* which is a reserved name injected by the engine — user-defined metrics
|
|
12998
|
+
* should never produce a score with that name.
|
|
12999
|
+
*/
|
|
12977
13000
|
scoringFailed?: boolean;
|
|
12978
13001
|
/** Optional category name for grouping scores */
|
|
12979
13002
|
categoryName?: string;
|
|
@@ -16456,4 +16479,4 @@ interface DistributedTraceHeaders {
|
|
|
16456
16479
|
*/
|
|
16457
16480
|
declare function getDistributedTraceHeaders(): DistributedTraceHeaders | null;
|
|
16458
16481
|
|
|
16459
|
-
export { AgentTaskCompletionJudge, AgentToolCorrectnessJudge, type AllProviderOptions, AnnotationQueuePublicScope as AnnotationQueueScope, AnswerRelevance, type AnthropicProviderOptions, BaseLLMJudgeMetric, BaseMetric, BaseSuiteEvaluator, ChatPrompt, ComplianceRiskJudge, type Config, ConfigMismatchError, ConfigNotFoundError, Contains, type CreateTestSuiteOptions, DEFAULT_EXECUTION_POLICY, Dataset, type DatasetPublic, DatasetVersion, DatasetVersionNotFoundError, type DatasetVersionPublic, DemographicBiasJudge, DialogueHelpfulnessJudge, type DistributedTraceHeaders, type ErrorInfo, type EvaluateOptions, type EvaluatePromptOptions, type EvaluateTestSuiteOptions, type EvaluationError, type EvaluationResult, type EvaluationScoreResult, type EvaluationTask, type EvaluationTestCase, type EvaluationTestResult, ExactMatch, type ExecutionPolicy, type FeedbackScoreData, type FewShotExampleAnswerRelevanceNoContext, type FewShotExampleAnswerRelevanceWithContext, type FewShotExampleHallucination, type FewShotExampleModeration, type FilterExpression, GEval, GEvalPreset, GenderBiasJudge, type GoogleProviderOptions, Hallucination, IsJson, type ItemResult, LLMJudge, type LLMJudgeConfig, type LLMJudgeModelSettings, type LLMJudgeOptions, type LLMJudgeResponseFormat, ModelConfigurationError, ModelError, ModelGenerationError, Moderation, OPIK_PARENT_SPAN_ID_HEADER, OPIK_TRACE_ID_HEADER, type OpenAIProviderOptions, OpikClient as Opik, type OpikAssistantMessage, OpikBaseModel, type OpikConfig, type OpikMessage, OpikQueryLanguage, SpanType as OpikSpanType, type OpikSystemMessage, type OpikToolMessage, type OpikUserMessage, type Param, PoliticalBiasJudge, Prompt, PromptType, PromptUncertaintyJudge, type ProviderOptionsForModel, QARelevanceJudge, type RawTestSuiteItem, RegexMatch, RegionalBiasJudge, type RegistryEntry, ReligiousBiasJudge, ResponseSchema, type RunTestsOptions, SYSTEM_PROMPT, type ScoringKeyMappingType, Span, SpanType, SummarizationCoherenceJudge, 
SummarizationConsistencyJudge, type SupportedModelId, TestSuite, type TestSuiteItem, TestSuiteResult, ThreadsAnnotationQueue, Trace, TracesAnnotationQueue, USER_PROMPT_TEMPLATE, type UpdateTestSuiteItem, type UpdateTestSuiteOptions, Usefulness, VercelAIChatModel, activateRunner, agentConfigContext, buildSuiteResult, createModel, createModelFromInstance, deserializeEvaluators, detectProvider, disableLogger, evaluate, evaluatePrompt, evaluateTestSuite, flushAll, generateId, getDistributedTraceHeaders, getGlobalClient, getTrackContext, logger, resetGlobalClient, resolveEvaluators, resolveExecutionPolicy, resolveItemExecutionPolicy, resolveModel, runTests, serializeEvaluators, setGlobalClient, setLoggerLevel, track, validateEvaluators, validateExecutionPolicy };
|
|
16482
|
+
export { AgentTaskCompletionJudge, AgentToolCorrectnessJudge, type AllProviderOptions, AnnotationQueuePublicScope as AnnotationQueueScope, AnswerRelevance, type AnthropicProviderOptions, BaseLLMJudgeMetric, BaseMetric, BaseSuiteEvaluator, ChatPrompt, ComplianceRiskJudge, type Config, ConfigMismatchError, ConfigNotFoundError, Contains, type CreateTestSuiteOptions, DEFAULT_EXECUTION_POLICY, Dataset, type DatasetPublic, DatasetVersion, DatasetVersionNotFoundError, type DatasetVersionPublic, DemographicBiasJudge, DialogueHelpfulnessJudge, type DistributedTraceHeaders, type ErrorInfo, type EvaluateOptions, type EvaluatePromptOptions, type EvaluateTestSuiteOptions, type EvaluationError, type EvaluationResult, type EvaluationScoreResult, type EvaluationTask, type EvaluationTestCase, type EvaluationTestResult, ExactMatch, type ExecutionPolicy, type FeedbackScoreData, type FewShotExampleAnswerRelevanceNoContext, type FewShotExampleAnswerRelevanceWithContext, type FewShotExampleHallucination, type FewShotExampleModeration, type FilterExpression, GEval, GEvalPreset, GenderBiasJudge, type GoogleProviderOptions, Hallucination, IsJson, type ItemResult, LLMJudge, type LLMJudgeConfig, type LLMJudgeModelSettings, type LLMJudgeOptions, type LLMJudgeResponseFormat, ModelConfigurationError, ModelError, ModelGenerationError, Moderation, OPIK_PARENT_SPAN_ID_HEADER, OPIK_TRACE_ID_HEADER, type OpenAIProviderOptions, OpikClient as Opik, type OpikAssistantMessage, OpikBaseModel, type OpikConfig, type OpikMessage, OpikQueryLanguage, SpanType as OpikSpanType, type OpikSystemMessage, type OpikToolMessage, type OpikUserMessage, type Param, PoliticalBiasJudge, Prompt, PromptType, PromptUncertaintyJudge, type ProviderOptionsForModel, QARelevanceJudge, type RawTestSuiteItem, RegexMatch, RegionalBiasJudge, type RegistryEntry, ReligiousBiasJudge, ResponseSchema, type RunTestsOptions, SYSTEM_PROMPT, type ScoringKeyMappingType, Span, SpanType, SummarizationCoherenceJudge, 
SummarizationConsistencyJudge, type SupportedModelId, TASK_ERROR_SCORE_NAME, TestSuite, type TestSuiteItem, TestSuiteResult, ThreadsAnnotationQueue, Trace, TracesAnnotationQueue, USER_PROMPT_TEMPLATE, type UpdateTestSuiteItem, type UpdateTestSuiteOptions, Usefulness, VercelAIChatModel, activateRunner, agentConfigContext, buildSuiteResult, createModel, createModelFromInstance, deserializeEvaluators, detectProvider, disableLogger, evaluate, evaluatePrompt, evaluateTestSuite, flushAll, generateId, getDistributedTraceHeaders, getGlobalClient, getTrackContext, logger, resetGlobalClient, resolveEvaluators, resolveExecutionPolicy, resolveItemExecutionPolicy, resolveModel, runTests, serializeEvaluators, setGlobalClient, setLoggerLevel, track, validateEvaluators, validateExecutionPolicy };
|
package/dist/index.d.ts
CHANGED
|
@@ -6255,9 +6255,11 @@ interface ServiceTogglesConfig {
|
|
|
6255
6255
|
ollamaProviderEnabled: boolean;
|
|
6256
6256
|
collaboratorsTabEnabled: boolean;
|
|
6257
6257
|
v2WorkspaceAllowlistIds: string[];
|
|
6258
|
+
v1WorkspaceAllowlistIds: string[];
|
|
6258
6259
|
forceWorkspaceVersion: string;
|
|
6259
6260
|
defaultPageSize?: number;
|
|
6260
6261
|
v2WorkspaceAllowlist?: string;
|
|
6262
|
+
v1WorkspaceAllowlist?: string;
|
|
6261
6263
|
}
|
|
6262
6264
|
|
|
6263
6265
|
interface Span$1 {
|
|
@@ -12943,7 +12945,11 @@ type EvaluationResult = {
|
|
|
12943
12945
|
experimentId: string;
|
|
12944
12946
|
/** Name of the experiment */
|
|
12945
12947
|
experimentName?: string;
|
|
12946
|
-
/**
|
|
12948
|
+
/**
|
|
12949
|
+
* Test results for all evaluated items, including failed ones.
|
|
12950
|
+
* Items whose task threw will have a synthetic score named
|
|
12951
|
+
* {@link TASK_ERROR_SCORE_NAME} with `scoringFailed: true`.
|
|
12952
|
+
*/
|
|
12947
12953
|
testResults: EvaluationTestResult[];
|
|
12948
12954
|
/** Optional URL to view detailed results in the Opik platform */
|
|
12949
12955
|
resultUrl?: string;
|
|
@@ -12963,6 +12969,18 @@ type EvaluationError = {
|
|
|
12963
12969
|
/** Original error object, if available */
|
|
12964
12970
|
error?: Error;
|
|
12965
12971
|
};
|
|
12972
|
+
/**
|
|
12973
|
+
* Reserved score name injected into failed task runs.
|
|
12974
|
+
*
|
|
12975
|
+
* When a task throws, the engine adds a synthetic score with this name and
|
|
12976
|
+
* `scoringFailed: true` so failed items remain visible in experiment results.
|
|
12977
|
+
* Consumers can filter on this name to distinguish task-level failures from
|
|
12978
|
+
* real metric scores.
|
|
12979
|
+
*
|
|
12980
|
+
* Note: coordinate with the Python SDK before renaming; the goal is a stable,
|
|
12981
|
+
* collision-resistant name (OPIK-6437).
|
|
12982
|
+
*/
|
|
12983
|
+
declare const TASK_ERROR_SCORE_NAME = "__opik_task_error__";
|
|
12966
12984
|
/**
|
|
12967
12985
|
* Represents the result of a metric calculation.
|
|
12968
12986
|
*/
|
|
@@ -12973,7 +12991,12 @@ type EvaluationScoreResult = {
|
|
|
12973
12991
|
value: number;
|
|
12974
12992
|
/** Optional reason for the score */
|
|
12975
12993
|
reason?: string;
|
|
12976
|
-
/**
|
|
12994
|
+
/**
|
|
12995
|
+
* Whether the scoring failed due to a task-level error rather than a metric
|
|
12996
|
+
* failure. When `true`, `name` will equal {@link TASK_ERROR_SCORE_NAME},
|
|
12997
|
+
* which is a reserved name injected by the engine — user-defined metrics
|
|
12998
|
+
* should never produce a score with that name.
|
|
12999
|
+
*/
|
|
12977
13000
|
scoringFailed?: boolean;
|
|
12978
13001
|
/** Optional category name for grouping scores */
|
|
12979
13002
|
categoryName?: string;
|
|
@@ -16456,4 +16479,4 @@ interface DistributedTraceHeaders {
|
|
|
16456
16479
|
*/
|
|
16457
16480
|
declare function getDistributedTraceHeaders(): DistributedTraceHeaders | null;
|
|
16458
16481
|
|
|
16459
|
-
export { AgentTaskCompletionJudge, AgentToolCorrectnessJudge, type AllProviderOptions, AnnotationQueuePublicScope as AnnotationQueueScope, AnswerRelevance, type AnthropicProviderOptions, BaseLLMJudgeMetric, BaseMetric, BaseSuiteEvaluator, ChatPrompt, ComplianceRiskJudge, type Config, ConfigMismatchError, ConfigNotFoundError, Contains, type CreateTestSuiteOptions, DEFAULT_EXECUTION_POLICY, Dataset, type DatasetPublic, DatasetVersion, DatasetVersionNotFoundError, type DatasetVersionPublic, DemographicBiasJudge, DialogueHelpfulnessJudge, type DistributedTraceHeaders, type ErrorInfo, type EvaluateOptions, type EvaluatePromptOptions, type EvaluateTestSuiteOptions, type EvaluationError, type EvaluationResult, type EvaluationScoreResult, type EvaluationTask, type EvaluationTestCase, type EvaluationTestResult, ExactMatch, type ExecutionPolicy, type FeedbackScoreData, type FewShotExampleAnswerRelevanceNoContext, type FewShotExampleAnswerRelevanceWithContext, type FewShotExampleHallucination, type FewShotExampleModeration, type FilterExpression, GEval, GEvalPreset, GenderBiasJudge, type GoogleProviderOptions, Hallucination, IsJson, type ItemResult, LLMJudge, type LLMJudgeConfig, type LLMJudgeModelSettings, type LLMJudgeOptions, type LLMJudgeResponseFormat, ModelConfigurationError, ModelError, ModelGenerationError, Moderation, OPIK_PARENT_SPAN_ID_HEADER, OPIK_TRACE_ID_HEADER, type OpenAIProviderOptions, OpikClient as Opik, type OpikAssistantMessage, OpikBaseModel, type OpikConfig, type OpikMessage, OpikQueryLanguage, SpanType as OpikSpanType, type OpikSystemMessage, type OpikToolMessage, type OpikUserMessage, type Param, PoliticalBiasJudge, Prompt, PromptType, PromptUncertaintyJudge, type ProviderOptionsForModel, QARelevanceJudge, type RawTestSuiteItem, RegexMatch, RegionalBiasJudge, type RegistryEntry, ReligiousBiasJudge, ResponseSchema, type RunTestsOptions, SYSTEM_PROMPT, type ScoringKeyMappingType, Span, SpanType, SummarizationCoherenceJudge, 
SummarizationConsistencyJudge, type SupportedModelId, TestSuite, type TestSuiteItem, TestSuiteResult, ThreadsAnnotationQueue, Trace, TracesAnnotationQueue, USER_PROMPT_TEMPLATE, type UpdateTestSuiteItem, type UpdateTestSuiteOptions, Usefulness, VercelAIChatModel, activateRunner, agentConfigContext, buildSuiteResult, createModel, createModelFromInstance, deserializeEvaluators, detectProvider, disableLogger, evaluate, evaluatePrompt, evaluateTestSuite, flushAll, generateId, getDistributedTraceHeaders, getGlobalClient, getTrackContext, logger, resetGlobalClient, resolveEvaluators, resolveExecutionPolicy, resolveItemExecutionPolicy, resolveModel, runTests, serializeEvaluators, setGlobalClient, setLoggerLevel, track, validateEvaluators, validateExecutionPolicy };
|
|
16482
|
+
export { AgentTaskCompletionJudge, AgentToolCorrectnessJudge, type AllProviderOptions, AnnotationQueuePublicScope as AnnotationQueueScope, AnswerRelevance, type AnthropicProviderOptions, BaseLLMJudgeMetric, BaseMetric, BaseSuiteEvaluator, ChatPrompt, ComplianceRiskJudge, type Config, ConfigMismatchError, ConfigNotFoundError, Contains, type CreateTestSuiteOptions, DEFAULT_EXECUTION_POLICY, Dataset, type DatasetPublic, DatasetVersion, DatasetVersionNotFoundError, type DatasetVersionPublic, DemographicBiasJudge, DialogueHelpfulnessJudge, type DistributedTraceHeaders, type ErrorInfo, type EvaluateOptions, type EvaluatePromptOptions, type EvaluateTestSuiteOptions, type EvaluationError, type EvaluationResult, type EvaluationScoreResult, type EvaluationTask, type EvaluationTestCase, type EvaluationTestResult, ExactMatch, type ExecutionPolicy, type FeedbackScoreData, type FewShotExampleAnswerRelevanceNoContext, type FewShotExampleAnswerRelevanceWithContext, type FewShotExampleHallucination, type FewShotExampleModeration, type FilterExpression, GEval, GEvalPreset, GenderBiasJudge, type GoogleProviderOptions, Hallucination, IsJson, type ItemResult, LLMJudge, type LLMJudgeConfig, type LLMJudgeModelSettings, type LLMJudgeOptions, type LLMJudgeResponseFormat, ModelConfigurationError, ModelError, ModelGenerationError, Moderation, OPIK_PARENT_SPAN_ID_HEADER, OPIK_TRACE_ID_HEADER, type OpenAIProviderOptions, OpikClient as Opik, type OpikAssistantMessage, OpikBaseModel, type OpikConfig, type OpikMessage, OpikQueryLanguage, SpanType as OpikSpanType, type OpikSystemMessage, type OpikToolMessage, type OpikUserMessage, type Param, PoliticalBiasJudge, Prompt, PromptType, PromptUncertaintyJudge, type ProviderOptionsForModel, QARelevanceJudge, type RawTestSuiteItem, RegexMatch, RegionalBiasJudge, type RegistryEntry, ReligiousBiasJudge, ResponseSchema, type RunTestsOptions, SYSTEM_PROMPT, type ScoringKeyMappingType, Span, SpanType, SummarizationCoherenceJudge, 
SummarizationConsistencyJudge, type SupportedModelId, TASK_ERROR_SCORE_NAME, TestSuite, type TestSuiteItem, TestSuiteResult, ThreadsAnnotationQueue, Trace, TracesAnnotationQueue, USER_PROMPT_TEMPLATE, type UpdateTestSuiteItem, type UpdateTestSuiteOptions, Usefulness, VercelAIChatModel, activateRunner, agentConfigContext, buildSuiteResult, createModel, createModelFromInstance, deserializeEvaluators, detectProvider, disableLogger, evaluate, evaluatePrompt, evaluateTestSuite, flushAll, generateId, getDistributedTraceHeaders, getGlobalClient, getTrackContext, logger, resetGlobalClient, resolveEvaluators, resolveExecutionPolicy, resolveItemExecutionPolicy, resolveModel, runTests, serializeEvaluators, setGlobalClient, setLoggerLevel, track, validateEvaluators, validateExecutionPolicy };
|
package/dist/index.js
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import {Fa}from'./chunk-MGI4PR45.js';export{sa as AgentTaskCompletionJudge,ra as AgentToolCorrectnessJudge,fa as AnswerRelevance,ba as BaseLLMJudgeMetric,C as BaseMetric,D as BaseSuiteEvaluator,p as ChatPrompt,ua as ComplianceRiskJudge,i as ConfigMismatchError,h as ConfigNotFoundError,_ as Contains,y as DEFAULT_EXECUTION_POLICY,j as Dataset,f as DatasetVersion,g as DatasetVersionNotFoundError,ma as DemographicBiasJudge,ka as DialogueHelpfulnessJudge,Z as ExactMatch,ga as GEval,ha as GEvalPreset,oa as GenderBiasJudge,ea as Hallucination,aa as IsJson,Q as LLMJudge,H as ModelConfigurationError,F as ModelError,G as ModelGenerationError,ca as Moderation,Ca as OPIK_PARENT_SPAN_ID_HEADER,Ba as OPIK_TRACE_ID_HEADER,Aa as Opik,E as OpikBaseModel,q as OpikQueryLanguage,d as OpikSpanType,na as PoliticalBiasJudge,o as Prompt,k as PromptType,ta as PromptUncertaintyJudge,la as QARelevanceJudge,$ as RegexMatch,qa as RegionalBiasJudge,pa as ReligiousBiasJudge,P as ResponseSchema,N as SYSTEM_PROMPT,ja as SummarizationCoherenceJudge,ia as SummarizationConsistencyJudge,B as TASK_ERROR_SCORE_NAME,ya as TestSuite,z as TestSuiteResult,s as ThreadsAnnotationQueue,r as TracesAnnotationQueue,O as USER_PROMPT_TEMPLATE,da as Usefulness,J as VercelAIChatModel,v as activateRunner,t as agentConfigContext,A as buildSuiteResult,K as createModel,L as createModelFromInstance,S as deserializeEvaluators,I as detectProvider,c as disableLogger,X as evaluate,Y as evaluatePrompt,V as evaluateTestSuite,u as flushAll,e as generateId,Da as getDistributedTraceHeaders,l as getGlobalClient,w as getTrackContext,a as logger,n as resetGlobalClient,va as resolveEvaluators,T as resolveExecutionPolicy,U as resolveItemExecutionPolicy,M as resolveModel,W as runTests,R as serializeEvaluators,m as setGlobalClient,b as setLoggerLevel,x as track,wa as validateEvaluators,xa as validateExecutionPolicy,Ea as z}from'./chunk-MGI4PR45.js';Fa();
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
import {za}from'./chunk-MGI4PR45.js';export{y as DEFAULT_EXECUTION_POLICY,ya as TestSuite,z as TestSuiteResult,A as buildSuiteResult,S as deserializeEvaluators,V as evaluateTestSuite,T as resolveExecutionPolicy,U as resolveItemExecutionPolicy,W as runTests,R as serializeEvaluators}from'./chunk-MGI4PR45.js';za();
|