@arizeai/phoenix-evals 0.9.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +23 -23
- package/dist/esm/__generated__/default_templates/CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
- package/dist/esm/__generated__/default_templates/CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
- package/dist/esm/__generated__/default_templates/CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.js +58 -0
- package/dist/esm/__generated__/default_templates/CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
- package/dist/esm/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
- package/dist/esm/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.js +9 -1
- package/dist/esm/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
- package/dist/esm/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
- package/dist/esm/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.js +14 -3
- package/dist/esm/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
- package/dist/esm/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
- package/dist/esm/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js +16 -2
- package/dist/esm/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
- package/dist/esm/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
- package/dist/esm/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js +15 -1
- package/dist/esm/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
- package/dist/esm/__generated__/default_templates/REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
- package/dist/esm/__generated__/default_templates/REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
- package/dist/esm/__generated__/default_templates/REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.js +59 -0
- package/dist/esm/__generated__/default_templates/REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
- package/dist/esm/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
- package/dist/esm/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js +61 -11
- package/dist/esm/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
- package/dist/esm/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
- package/dist/esm/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.js +44 -8
- package/dist/esm/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
- package/dist/esm/__generated__/default_templates/index.d.ts +2 -0
- package/dist/esm/__generated__/default_templates/index.d.ts.map +1 -1
- package/dist/esm/__generated__/default_templates/index.js +2 -0
- package/dist/esm/__generated__/default_templates/index.js.map +1 -1
- package/dist/esm/__generated__/types.d.ts +1 -1
- package/dist/esm/__generated__/types.d.ts.map +1 -1
- package/dist/esm/core/EvaluatorBase.d.ts +2 -2
- package/dist/esm/core/EvaluatorBase.d.ts.map +1 -1
- package/dist/esm/core/FunctionEvaluator.d.ts +1 -1
- package/dist/esm/core/FunctionEvaluator.d.ts.map +1 -1
- package/dist/esm/core/FunctionEvaluator.js.map +1 -1
- package/dist/esm/helpers/asEvaluatorFn.d.ts +1 -1
- package/dist/esm/helpers/asEvaluatorFn.d.ts.map +1 -1
- package/dist/esm/helpers/asEvaluatorFn.js.map +1 -1
- package/dist/esm/helpers/createEvaluator.d.ts +2 -2
- package/dist/esm/helpers/createEvaluator.d.ts.map +1 -1
- package/dist/esm/helpers/createEvaluator.js.map +1 -1
- package/dist/esm/helpers/toEvaluationResult.d.ts +1 -1
- package/dist/esm/helpers/toEvaluationResult.d.ts.map +1 -1
- package/dist/esm/llm/ClassificationEvaluator.d.ts +3 -3
- package/dist/esm/llm/ClassificationEvaluator.d.ts.map +1 -1
- package/dist/esm/llm/ClassificationEvaluator.js.map +1 -1
- package/dist/esm/llm/LLMEvaluator.d.ts +1 -1
- package/dist/esm/llm/LLMEvaluator.d.ts.map +1 -1
- package/dist/esm/llm/createClassificationEvaluator.d.ts +1 -1
- package/dist/esm/llm/createClassificationEvaluator.d.ts.map +1 -1
- package/dist/esm/llm/createClassificationEvaluator.js.map +1 -1
- package/dist/esm/llm/createClassifierFn.d.ts +1 -1
- package/dist/esm/llm/createClassifierFn.d.ts.map +1 -1
- package/dist/esm/llm/createClassifierFn.js.map +1 -1
- package/dist/esm/llm/createConcisenessEvaluator.d.ts +43 -0
- package/dist/esm/llm/createConcisenessEvaluator.d.ts.map +1 -0
- package/dist/esm/llm/createConcisenessEvaluator.js +39 -0
- package/dist/esm/llm/createConcisenessEvaluator.js.map +1 -0
- package/dist/esm/llm/createCorrectnessEvaluator.d.ts +2 -2
- package/dist/esm/llm/createCorrectnessEvaluator.d.ts.map +1 -1
- package/dist/esm/llm/createCorrectnessEvaluator.js.map +1 -1
- package/dist/esm/llm/createDocumentRelevanceEvaluator.d.ts +2 -2
- package/dist/esm/llm/createDocumentRelevanceEvaluator.d.ts.map +1 -1
- package/dist/esm/llm/createDocumentRelevanceEvaluator.js.map +1 -1
- package/dist/esm/llm/createFaithfulnessEvaluator.d.ts +2 -2
- package/dist/esm/llm/createFaithfulnessEvaluator.d.ts.map +1 -1
- package/dist/esm/llm/createFaithfulnessEvaluator.js.map +1 -1
- package/dist/esm/llm/createHallucinationEvaluator.d.ts +2 -2
- package/dist/esm/llm/createHallucinationEvaluator.d.ts.map +1 -1
- package/dist/esm/llm/createHallucinationEvaluator.js.map +1 -1
- package/dist/esm/llm/createRefusalEvaluator.d.ts +44 -0
- package/dist/esm/llm/createRefusalEvaluator.d.ts.map +1 -0
- package/dist/esm/llm/createRefusalEvaluator.js +40 -0
- package/dist/esm/llm/createRefusalEvaluator.js.map +1 -0
- package/dist/esm/llm/createToolInvocationEvaluator.d.ts +2 -2
- package/dist/esm/llm/createToolInvocationEvaluator.d.ts.map +1 -1
- package/dist/esm/llm/createToolInvocationEvaluator.js.map +1 -1
- package/dist/esm/llm/createToolResponseHandlingEvaluator.d.ts +2 -2
- package/dist/esm/llm/createToolResponseHandlingEvaluator.d.ts.map +1 -1
- package/dist/esm/llm/createToolResponseHandlingEvaluator.js.map +1 -1
- package/dist/esm/llm/createToolSelectionEvaluator.d.ts +2 -2
- package/dist/esm/llm/createToolSelectionEvaluator.d.ts.map +1 -1
- package/dist/esm/llm/createToolSelectionEvaluator.js.map +1 -1
- package/dist/esm/llm/generateClassification.d.ts +2 -2
- package/dist/esm/llm/generateClassification.d.ts.map +1 -1
- package/dist/esm/llm/generateClassification.js +1 -1
- package/dist/esm/llm/generateClassification.js.map +1 -1
- package/dist/esm/llm/index.d.ts +2 -0
- package/dist/esm/llm/index.d.ts.map +1 -1
- package/dist/esm/llm/index.js +2 -0
- package/dist/esm/llm/index.js.map +1 -1
- package/dist/esm/template/applyTemplate.d.ts +1 -1
- package/dist/esm/template/applyTemplate.d.ts.map +1 -1
- package/dist/esm/template/applyTemplate.js +1 -1
- package/dist/esm/template/applyTemplate.js.map +1 -1
- package/dist/esm/template/getTemplateVariables.d.ts +1 -1
- package/dist/esm/template/getTemplateVariables.d.ts.map +1 -1
- package/dist/esm/template/getTemplateVariables.js.map +1 -1
- package/dist/esm/tsconfig.esm.tsbuildinfo +1 -1
- package/dist/esm/types/evals.d.ts +5 -5
- package/dist/esm/types/evals.d.ts.map +1 -1
- package/dist/esm/types/otel.d.ts +1 -1
- package/dist/esm/types/otel.d.ts.map +1 -1
- package/dist/esm/utils/bindEvaluator.d.ts +2 -2
- package/dist/esm/utils/bindEvaluator.d.ts.map +1 -1
- package/dist/esm/utils/objectMappingUtils.d.ts +1 -1
- package/dist/esm/utils/objectMappingUtils.d.ts.map +1 -1
- package/dist/esm/utils/objectMappingUtils.js.map +1 -1
- package/dist/src/__generated__/default_templates/CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
- package/dist/src/__generated__/default_templates/CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
- package/dist/src/__generated__/default_templates/CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.js +61 -0
- package/dist/src/__generated__/default_templates/CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
- package/dist/src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
- package/dist/src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.js +9 -1
- package/dist/src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
- package/dist/src/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
- package/dist/src/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.js +14 -3
- package/dist/src/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
- package/dist/src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
- package/dist/src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js +16 -2
- package/dist/src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
- package/dist/src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
- package/dist/src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js +15 -1
- package/dist/src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
- package/dist/src/__generated__/default_templates/REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
- package/dist/src/__generated__/default_templates/REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
- package/dist/src/__generated__/default_templates/REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.js +62 -0
- package/dist/src/__generated__/default_templates/REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
- package/dist/src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
- package/dist/src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js +61 -11
- package/dist/src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
- package/dist/src/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
- package/dist/src/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.js +44 -8
- package/dist/src/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
- package/dist/src/__generated__/default_templates/index.d.ts +2 -0
- package/dist/src/__generated__/default_templates/index.d.ts.map +1 -1
- package/dist/src/__generated__/default_templates/index.js +5 -1
- package/dist/src/__generated__/default_templates/index.js.map +1 -1
- package/dist/src/__generated__/types.d.ts +1 -1
- package/dist/src/__generated__/types.d.ts.map +1 -1
- package/dist/src/core/EvaluatorBase.d.ts +2 -2
- package/dist/src/core/EvaluatorBase.d.ts.map +1 -1
- package/dist/src/core/FunctionEvaluator.d.ts +1 -1
- package/dist/src/core/FunctionEvaluator.d.ts.map +1 -1
- package/dist/src/core/FunctionEvaluator.js.map +1 -1
- package/dist/src/helpers/asEvaluatorFn.d.ts +1 -1
- package/dist/src/helpers/asEvaluatorFn.d.ts.map +1 -1
- package/dist/src/helpers/asEvaluatorFn.js.map +1 -1
- package/dist/src/helpers/createEvaluator.d.ts +2 -2
- package/dist/src/helpers/createEvaluator.d.ts.map +1 -1
- package/dist/src/helpers/createEvaluator.js.map +1 -1
- package/dist/src/helpers/toEvaluationResult.d.ts +1 -1
- package/dist/src/helpers/toEvaluationResult.d.ts.map +1 -1
- package/dist/src/llm/ClassificationEvaluator.d.ts +3 -3
- package/dist/src/llm/ClassificationEvaluator.d.ts.map +1 -1
- package/dist/src/llm/ClassificationEvaluator.js.map +1 -1
- package/dist/src/llm/LLMEvaluator.d.ts +1 -1
- package/dist/src/llm/LLMEvaluator.d.ts.map +1 -1
- package/dist/src/llm/createClassificationEvaluator.d.ts +1 -1
- package/dist/src/llm/createClassificationEvaluator.d.ts.map +1 -1
- package/dist/src/llm/createClassificationEvaluator.js.map +1 -1
- package/dist/src/llm/createClassifierFn.d.ts +1 -1
- package/dist/src/llm/createClassifierFn.d.ts.map +1 -1
- package/dist/src/llm/createClassifierFn.js.map +1 -1
- package/dist/src/llm/createConcisenessEvaluator.d.ts +43 -0
- package/dist/src/llm/createConcisenessEvaluator.d.ts.map +1 -0
- package/dist/src/llm/createConcisenessEvaluator.js +50 -0
- package/dist/src/llm/createConcisenessEvaluator.js.map +1 -0
- package/dist/src/llm/createCorrectnessEvaluator.d.ts +2 -2
- package/dist/src/llm/createCorrectnessEvaluator.d.ts.map +1 -1
- package/dist/src/llm/createCorrectnessEvaluator.js.map +1 -1
- package/dist/src/llm/createDocumentRelevanceEvaluator.d.ts +2 -2
- package/dist/src/llm/createDocumentRelevanceEvaluator.d.ts.map +1 -1
- package/dist/src/llm/createDocumentRelevanceEvaluator.js.map +1 -1
- package/dist/src/llm/createFaithfulnessEvaluator.d.ts +2 -2
- package/dist/src/llm/createFaithfulnessEvaluator.d.ts.map +1 -1
- package/dist/src/llm/createFaithfulnessEvaluator.js.map +1 -1
- package/dist/src/llm/createHallucinationEvaluator.d.ts +2 -2
- package/dist/src/llm/createHallucinationEvaluator.d.ts.map +1 -1
- package/dist/src/llm/createHallucinationEvaluator.js.map +1 -1
- package/dist/src/llm/createRefusalEvaluator.d.ts +44 -0
- package/dist/src/llm/createRefusalEvaluator.d.ts.map +1 -0
- package/dist/src/llm/createRefusalEvaluator.js +51 -0
- package/dist/src/llm/createRefusalEvaluator.js.map +1 -0
- package/dist/src/llm/createToolInvocationEvaluator.d.ts +2 -2
- package/dist/src/llm/createToolInvocationEvaluator.d.ts.map +1 -1
- package/dist/src/llm/createToolInvocationEvaluator.js.map +1 -1
- package/dist/src/llm/createToolResponseHandlingEvaluator.d.ts +2 -2
- package/dist/src/llm/createToolResponseHandlingEvaluator.d.ts.map +1 -1
- package/dist/src/llm/createToolResponseHandlingEvaluator.js.map +1 -1
- package/dist/src/llm/createToolSelectionEvaluator.d.ts +2 -2
- package/dist/src/llm/createToolSelectionEvaluator.d.ts.map +1 -1
- package/dist/src/llm/createToolSelectionEvaluator.js.map +1 -1
- package/dist/src/llm/generateClassification.d.ts +2 -2
- package/dist/src/llm/generateClassification.d.ts.map +1 -1
- package/dist/src/llm/generateClassification.js +1 -1
- package/dist/src/llm/generateClassification.js.map +1 -1
- package/dist/src/llm/index.d.ts +2 -0
- package/dist/src/llm/index.d.ts.map +1 -1
- package/dist/src/llm/index.js +2 -0
- package/dist/src/llm/index.js.map +1 -1
- package/dist/src/template/applyTemplate.d.ts +1 -1
- package/dist/src/template/applyTemplate.d.ts.map +1 -1
- package/dist/src/template/applyTemplate.js +1 -1
- package/dist/src/template/applyTemplate.js.map +1 -1
- package/dist/src/template/getTemplateVariables.d.ts +1 -1
- package/dist/src/template/getTemplateVariables.d.ts.map +1 -1
- package/dist/src/template/getTemplateVariables.js.map +1 -1
- package/dist/src/types/evals.d.ts +5 -5
- package/dist/src/types/evals.d.ts.map +1 -1
- package/dist/src/types/otel.d.ts +1 -1
- package/dist/src/types/otel.d.ts.map +1 -1
- package/dist/src/utils/bindEvaluator.d.ts +2 -2
- package/dist/src/utils/bindEvaluator.d.ts.map +1 -1
- package/dist/src/utils/objectMappingUtils.d.ts +1 -1
- package/dist/src/utils/objectMappingUtils.d.ts.map +1 -1
- package/dist/src/utils/objectMappingUtils.js.map +1 -1
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +38 -39
- package/src/__generated__/default_templates/CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.ts +60 -0
- package/src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.ts +9 -1
- package/src/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.ts +14 -3
- package/src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.ts +16 -2
- package/src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.ts +15 -1
- package/src/__generated__/default_templates/REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.ts +61 -0
- package/src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.ts +61 -11
- package/src/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.ts +44 -8
- package/src/__generated__/default_templates/index.ts +2 -0
- package/src/__generated__/types.ts +1 -1
- package/src/core/EvaluatorBase.ts +2 -2
- package/src/core/FunctionEvaluator.ts +5 -2
- package/src/helpers/asEvaluatorFn.ts +1 -2
- package/src/helpers/createEvaluator.ts +2 -3
- package/src/helpers/toEvaluationResult.ts +1 -1
- package/src/llm/ClassificationEvaluator.ts +4 -5
- package/src/llm/LLMEvaluator.ts +1 -1
- package/src/llm/createClassificationEvaluator.ts +1 -2
- package/src/llm/createClassifierFn.ts +1 -2
- package/src/llm/createConcisenessEvaluator.ts +71 -0
- package/src/llm/createCorrectnessEvaluator.ts +2 -3
- package/src/llm/createDocumentRelevanceEvaluator.ts +2 -3
- package/src/llm/createFaithfulnessEvaluator.ts +2 -3
- package/src/llm/createHallucinationEvaluator.ts +2 -3
- package/src/llm/createRefusalEvaluator.ts +70 -0
- package/src/llm/createToolInvocationEvaluator.ts +2 -3
- package/src/llm/createToolResponseHandlingEvaluator.ts +2 -3
- package/src/llm/createToolSelectionEvaluator.ts +2 -3
- package/src/llm/generateClassification.ts +5 -5
- package/src/llm/index.ts +2 -0
- package/src/template/applyTemplate.ts +2 -3
- package/src/template/getTemplateVariables.ts +2 -2
- package/src/types/evals.ts +5 -5
- package/src/types/otel.ts +1 -1
- package/src/utils/bindEvaluator.ts +2 -2
- package/src/utils/objectMappingUtils.ts +2 -2
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"evals.d.ts","sourceRoot":"","sources":["../../../src/types/evals.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,aAAa,EAAE,MAAM,
|
|
1
|
+
{"version":3,"file":"evals.d.ts","sourceRoot":"","sources":["../../../src/types/evals.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,IAAI,CAAC;AAExC,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,QAAQ,CAAC;AAC5C,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,QAAQ,CAAC;AAC5C,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAEnD;;GAEG;AACH,MAAM,WAAW,aAAa,CAAC,UAAU,EAAE,SAAS;IAClD,MAAM,EAAE,UAAU,CAAC;IACnB,QAAQ,CAAC,EAAE,UAAU,CAAC;IACtB,KAAK,CAAC,EAAE,SAAS,CAAC;IAClB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;CACxB;AAED,MAAM,WAAW,OAAO;IACtB,KAAK,EAAE,aAAa,CAAC;CACtB;AAGD,MAAM,WAAW,iBAAkB,SAAQ,OAAO;CAAG;AAErD;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;OAGG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IACf;;;OAGG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IACf;;;OAGG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,oBAAoB;IACnC,KAAK,EAAE,MAAM,CAAC;IACd,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,oBAAoB;IACnC,KAAK,EAAE,MAAM,CAAC;IACd,KAAK,EAAE,MAAM,CAAC;CACf;AAED;;GAEG;AACH,MAAM,MAAM,wBAAwB,GAAG,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;AAE9D;;GAEG;AACH,MAAM,WAAW,oBAAqB,SAAQ,aAAa;IAIzD,KAAK,EAAE,aAAa,CAAC;IACrB;;;OAGG;IACH,OAAO,EAAE,wBAAwB,CAAC;IAClC;;OAEG;IACH,cAAc,EAAE,cAAc,CAAC;CAChC;AAED,MAAM,WAAW,mBAAmB,CAClC,WAAW,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CACrE,SAAQ,aAAa;IACrB;;;OAGG;IACH,IAAI,EAAE,MAAM,CAAC;IACb;;OAEG;IACH,IAAI,EAAE,cAAc,CAAC;IACrB;;;OAGG;IACH,qBAAqB,CAAC,EAAE,qBAAqB,CAAC;IAC9C;;OAEG;IACH,YAAY,CAAC,EAAE,aAAa,CAAC,WAAW,CAAC,CAAC;CAC3C;AAED,MAAM,MAAM,sBAAsB,CAAC,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,IAC3E,IAAI,CAAC,mBAAmB,CAAC,UAAU,CAAC,EAAE,MAAM,CAAC,CAAC;AAEhD,MAAM,WAAW,iCAAiC,CAChD,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAE1C,SAAQ,oBAAoB,EAAE,sBAAsB,CAAC,UAAU,CAAC;IAChE;;OAEG;IACH,cAAc,EAAE,cAAc,CAAC;CAChC;AAED,MAAM,MAAM,WAAW,CAAC,WAAW,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,IAAI,CACrE,IAAI,EAAE,WAAW,KACd,OAAO,CAAC,gBAAgB,CAAC,CAAC;AAE/B;;GAEG;AACH,MAAM,MAAM,cAAc,GAAG,KAAK,GAAG,MAAM,CAAC;AAE5C;;;GAGG;AACH,MAAM,MAAM,qBAAqB,GAAG,UAAU,GAAG,UAAU,GAAG,SAAS,CAAC;AAExE;;GAEG;AACH,UAAU,oBAAoB;IAC5B;;OAEG;IACH,IAAI,EAAE,MAAM,CAAC;IACb;;OAEG;IACH,IAAI,EAAE,cAAc,CAAC;IACrB;;;OAGG;IACH,qBAAqB,CAAC,EAAE,qBAAqB,CAAC;CAC/C;AAED;;;GAGG;AACH,MAAM,WAAW,kBAAkB,CACjC,WAAW,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAC3C,SAAQ,oBAAoB;IAC5B;;OAEG;IACH,QAAQ,EAAE,WAAW,CAAC,WAAW,CAAC,CAAC;CACpC"}
|
package/dist/esm/types/otel.d.ts
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"otel.d.ts","sourceRoot":"","sources":["../../../src/types/otel.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;
|
|
1
|
+
{"version":3,"file":"otel.d.ts","sourceRoot":"","sources":["../../../src/types/otel.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAEjD,MAAM,MAAM,eAAe,GAAG;IAC5B;;;;OAIG;IACH,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB;;;OAGG;IACH,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB,CAAC;AAEF,MAAM,MAAM,aAAa,GAAG;IAC1B,SAAS,CAAC,EAAE,eAAe,CAAC;CAC7B,CAAC"}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { EvaluatorBase } from "../core/EvaluatorBase.js";
|
|
2
|
-
import { ObjectMapping } from "../types/data.js";
|
|
1
|
+
import type { EvaluatorBase } from "../core/EvaluatorBase.js";
|
|
2
|
+
import type { ObjectMapping } from "../types/data.js";
|
|
3
3
|
/**
|
|
4
4
|
* Context for binding an evaluator with input mapping configuration.
|
|
5
5
|
*
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"bindEvaluator.d.ts","sourceRoot":"","sources":["../../../src/utils/bindEvaluator.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,aAAa,EAAE,MAAM,uBAAuB,CAAC;
|
|
1
|
+
{"version":3,"file":"bindEvaluator.d.ts","sourceRoot":"","sources":["../../../src/utils/bindEvaluator.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,uBAAuB,CAAC;AAC3D,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,eAAe,CAAC;AAEnD;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA2CG;AACH,MAAM,MAAM,cAAc,CAAC,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,IAAI;IACvE;;;;;;;;;;;;OAYG;IACH,YAAY,EAAE,aAAa,CAAC,UAAU,CAAC,CAAC;CACzC,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA0JG;AACH,wBAAgB,aAAa,CAAC,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EACtE,SAAS,EAAE,aAAa,CAAC,UAAU,CAAC,EACpC,OAAO,EAAE,cAAc,CAAC,UAAU,CAAC,GAClC,aAAa,CAAC,UAAU,CAAC,CAM3B"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"objectMappingUtils.d.ts","sourceRoot":"","sources":["../../../src/utils/objectMappingUtils.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"objectMappingUtils.d.ts","sourceRoot":"","sources":["../../../src/utils/objectMappingUtils.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,aAAa,EAAe,MAAM,eAAe,CAAC;AAEhE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkKG;AACH,wBAAgB,WAAW,CAAC,QAAQ,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EAClE,IAAI,EAAE,QAAQ,EACd,OAAO,EAAE,aAAa,CAAC,QAAQ,CAAC,GAC/B,QAAQ,CAUV"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"objectMappingUtils.js","sourceRoot":"","sources":["../../../src/utils/objectMappingUtils.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"objectMappingUtils.js","sourceRoot":"","sources":["../../../src/utils/objectMappingUtils.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AAIzC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkKG;AACH,MAAM,UAAU,WAAW,CACzB,IAAc,EACd,OAAgC;IAEhC,OAAO;QACL,GAAG,IAAI;QACP,GAAG,MAAM,CAAC,WAAW,CACnB,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,EAAE,KAAK,CAAC,EAAE,EAAE,CAAC;YAC5C,GAAG;YACH,oBAAoB,CAAC,IAAI,EAAE,KAAK,CAAC;SAClC,CAAC,CACH;KACF,CAAC;AACJ,CAAC;AAED;;;;;;;;;;;GAWG;AACH,SAAS,oBAAoB,CAC3B,IAAc,EACd,WAAkC;IAElC,OAAO,OAAO,WAAW,KAAK,UAAU;QACtC,CAAC,CAAC,WAAW,CAAC,IAAI,CAAC;QACnB,CAAC,CAAC,QAAQ,CAAC,EAAE,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC;AAC/D,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,2CAA2C,EAAE,6BAuDzD,CAAC"}
|
package/dist/src/__generated__/default_templates/CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.js
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// This file is generated. Do not edit by hand.
|
|
3
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
4
|
+
exports.CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG = void 0;
|
|
5
|
+
exports.CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG = {
|
|
6
|
+
name: "conciseness",
|
|
7
|
+
description: "Evaluate whether model outputs are concise and free of unnecessary content.",
|
|
8
|
+
optimizationDirection: "MAXIMIZE",
|
|
9
|
+
template: [
|
|
10
|
+
{
|
|
11
|
+
role: "user",
|
|
12
|
+
content: `
|
|
13
|
+
You are an expert evaluator assessing the conciseness of model outputs. Your task is to determine whether a response uses the minimum number of words necessary to fully answer the question.
|
|
14
|
+
|
|
15
|
+
<rubric>
|
|
16
|
+
|
|
17
|
+
CONCISE - The response:
|
|
18
|
+
|
|
19
|
+
- Contains only the exact information requested
|
|
20
|
+
- Uses the minimum number of words necessary to convey the complete answer
|
|
21
|
+
- Omits pleasantries, hedging language, and unnecessary context
|
|
22
|
+
- Excludes meta-commentary about the answer or the model's capabilities
|
|
23
|
+
- Avoids redundant information or restatements
|
|
24
|
+
- Does not include explanations unless explicitly requested
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
VERBOSE - The response contains any of:
|
|
28
|
+
|
|
29
|
+
- Unnecessary pleasantries, greetings, or filler phrases (e.g., "Great question!", "Sure!", "I'd be happy to help")
|
|
30
|
+
- Hedging language or excessive qualifiers (e.g., "It's worth noting that...", "It's important to understand that...")
|
|
31
|
+
- Meta-commentary about the response itself or the model's capabilities
|
|
32
|
+
- Redundant restatements of the same information
|
|
33
|
+
- Unsolicited explanations, context, or caveats beyond what was asked
|
|
34
|
+
- Unnecessary formatting, bullet points, or structure for simple answers
|
|
35
|
+
|
|
36
|
+
</rubric>
|
|
37
|
+
|
|
38
|
+
<data>
|
|
39
|
+
|
|
40
|
+
<input>
|
|
41
|
+
{{input}}
|
|
42
|
+
</input>
|
|
43
|
+
|
|
44
|
+
<output>
|
|
45
|
+
{{output}}
|
|
46
|
+
</output>
|
|
47
|
+
|
|
48
|
+
</data>
|
|
49
|
+
|
|
50
|
+
Evaluate only the conciseness of the response. Do not assess correctness, helpfulness, or quality of information. Focus solely on whether the response uses more words than necessary to answer the question.
|
|
51
|
+
|
|
52
|
+
Is the output concise or verbose?
|
|
53
|
+
`,
|
|
54
|
+
},
|
|
55
|
+
],
|
|
56
|
+
choices: {
|
|
57
|
+
"concise": 1,
|
|
58
|
+
"verbose": 0
|
|
59
|
+
},
|
|
60
|
+
};
|
|
61
|
+
//# sourceMappingURL=CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.js.map
|
package/dist/src/__generated__/default_templates/CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":";AAAA,+CAA+C;;;AAIlC,QAAA,2CAA2C,GAAkC;IACxF,IAAI,EAAE,aAAa;IACnB,WAAW,EAAE,6EAA6E;IAC1F,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAyCd;SACI;KACF;IACD,OAAO,EAAE;QACT,SAAS,EAAE,CAAC;QACZ,SAAS,EAAE,CAAC;KACb;CACA,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,2CAA2C,EAAE,
|
|
1
|
+
{"version":3,"file":"CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,2CAA2C,EAAE,6BAsDzD,CAAC"}
|
package/dist/src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.js
CHANGED
|
@@ -4,7 +4,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
|
|
|
4
4
|
exports.CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG = void 0;
|
|
5
5
|
exports.CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG = {
|
|
6
6
|
name: "correctness",
|
|
7
|
-
description: "Assess
|
|
7
|
+
description: "Assess general correctness and completeness of model outputs.",
|
|
8
8
|
optimizationDirection: "MAXIMIZE",
|
|
9
9
|
template: [
|
|
10
10
|
{
|
|
@@ -13,29 +13,37 @@ exports.CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG = {
|
|
|
13
13
|
You are an expert evaluator labeling model outputs for correctness. Your task is to assign a classification based on the following criteria:
|
|
14
14
|
|
|
15
15
|
<rubric>
|
|
16
|
+
|
|
16
17
|
CORRECT - The response:
|
|
18
|
+
|
|
17
19
|
- Provides accurate and complete information with no factual errors
|
|
18
20
|
- Addresses all parts of the question
|
|
19
21
|
- Is logically consistent with no contradictions
|
|
20
22
|
- Uses precise, domain-appropriate terminology
|
|
21
23
|
- Avoids ambiguous or misleading language
|
|
22
24
|
|
|
25
|
+
|
|
23
26
|
INCORRECT - The response contains any of:
|
|
27
|
+
|
|
24
28
|
- Factual errors or inaccuracies
|
|
25
29
|
- Incomplete or partial answers
|
|
26
30
|
- Misleading or ambiguous statements
|
|
27
31
|
- Incorrect terminology
|
|
28
32
|
- Logical inconsistencies
|
|
29
33
|
- Missing key information
|
|
34
|
+
|
|
30
35
|
</rubric>
|
|
31
36
|
|
|
32
37
|
<data>
|
|
38
|
+
|
|
33
39
|
<input>
|
|
34
40
|
{{input}}
|
|
35
41
|
</input>
|
|
42
|
+
|
|
36
43
|
<output>
|
|
37
44
|
{{output}}
|
|
38
45
|
</output>
|
|
46
|
+
|
|
39
47
|
</data>
|
|
40
48
|
|
|
41
49
|
Carefully read the input and output and check for factual accuracy and completeness. Focus on correctness of information rather than verboseness or style.
|
package/dist/src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":";AAAA,+CAA+C;;;AAIlC,QAAA,2CAA2C,GAAkC;IACxF,IAAI,EAAE,aAAa;IACnB,WAAW,EAAE
|
|
1
|
+
{"version":3,"file":"CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":";AAAA,+CAA+C;;;AAIlC,QAAA,2CAA2C,GAAkC;IACxF,IAAI,EAAE,aAAa;IACnB,WAAW,EAAE,+DAA+D;IAC5E,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAwCd;SACI;KACF;IACD,OAAO,EAAE;QACT,SAAS,EAAE,CAAC;QACZ,WAAW,EAAE,CAAC;KACf;CACA,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,kDAAkD,EAAE,
|
|
1
|
+
{"version":3,"file":"DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,kDAAkD,EAAE,6BAsChE,CAAC"}
|
|
@@ -4,24 +4,35 @@ Object.defineProperty(exports, "__esModule", { value: true });
|
|
|
4
4
|
exports.DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG = void 0;
|
|
5
5
|
exports.DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG = {
|
|
6
6
|
name: "document_relevance",
|
|
7
|
-
description: "
|
|
7
|
+
description: "For determining if a document is relevant to a given question.",
|
|
8
8
|
optimizationDirection: "MAXIMIZE",
|
|
9
9
|
template: [
|
|
10
10
|
{
|
|
11
11
|
role: "user",
|
|
12
12
|
content: `
|
|
13
|
-
You are comparing a document to a question and trying to determine
|
|
13
|
+
You are comparing a document to a question and trying to determine
|
|
14
|
+
if the document text contains information relevant to answering the
|
|
15
|
+
question. Here is the data:
|
|
14
16
|
|
|
15
17
|
<data>
|
|
18
|
+
|
|
16
19
|
<question>
|
|
17
20
|
{{input}}
|
|
18
21
|
</question>
|
|
22
|
+
|
|
19
23
|
<document_text>
|
|
20
24
|
{{documentText}}
|
|
21
25
|
</document_text>
|
|
26
|
+
|
|
22
27
|
</data>
|
|
23
28
|
|
|
24
|
-
Compare the question above to the document text. You must determine
|
|
29
|
+
Compare the question above to the document text. You must determine
|
|
30
|
+
whether the document text contains information that can answer the
|
|
31
|
+
question. Please focus on whether the very specific question can be
|
|
32
|
+
answered by the information in the document text. Your response must be
|
|
33
|
+
either "relevant" or "unrelated". "unrelated" means that the document
|
|
34
|
+
text does not contain an answer to the question. "relevant" means the
|
|
35
|
+
document text contains an answer to the question.
|
|
25
36
|
`,
|
|
26
37
|
},
|
|
27
38
|
],
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":";AAAA,+CAA+C;;;AAIlC,QAAA,kDAAkD,GAAkC;IAC/F,IAAI,EAAE,oBAAoB;IAC1B,WAAW,EAAE,
|
|
1
|
+
{"version":3,"file":"DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":";AAAA,+CAA+C;;;AAIlC,QAAA,kDAAkD,GAAkC;IAC/F,IAAI,EAAE,oBAAoB;IAC1B,WAAW,EAAE,gEAAgE;IAC7E,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;CAwBd;SACI;KACF;IACD,OAAO,EAAE;QACT,UAAU,EAAE,CAAC;QACb,WAAW,EAAE,CAAC;KACf;CACA,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,4CAA4C,EAAE,
|
|
1
|
+
{"version":3,"file":"FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,4CAA4C,EAAE,6BA4C1D,CAAC"}
|
package/dist/src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js
CHANGED
|
@@ -4,24 +4,38 @@ Object.defineProperty(exports, "__esModule", { value: true });
|
|
|
4
4
|
exports.FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG = void 0;
|
|
5
5
|
exports.FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG = {
|
|
6
6
|
name: "faithfulness",
|
|
7
|
-
description: "
|
|
7
|
+
description: "For determining if a response is faithful to the context.",
|
|
8
8
|
optimizationDirection: "MAXIMIZE",
|
|
9
9
|
template: [
|
|
10
10
|
{
|
|
11
11
|
role: "user",
|
|
12
12
|
content: `
|
|
13
|
-
In this task, you will be presented with a query, some context and a response. The response is generated to the question based on the context. The response may contain false information. You must use the context to determine if the response to the question contains false information, if the response is unfaithful to the facts.
|
|
13
|
+
In this task, you will be presented with a query, some context and a response. The response is generated to the question based on the context. The response may contain false information. You must use the context to determine if the response to the question contains false information, if the response is unfaithful to the facts.
|
|
14
|
+
|
|
15
|
+
Your objective is to determine whether the response text contains factual information and is faithful to the context. An 'unfaithful' response refers to a response that is not based on the context or assumes information that is not available in the context.
|
|
16
|
+
|
|
17
|
+
Your response should be a single word: either 'faithful' or 'unfaithful', and it should not include any other text or characters.
|
|
18
|
+
|
|
19
|
+
'unfaithful' indicates that the response provides factually inaccurate information to the query based on the context.
|
|
20
|
+
|
|
21
|
+
'faithful' indicates that the response to the question is correct relative to the context, and does not contain made up information.
|
|
22
|
+
|
|
23
|
+
Please read the query and context carefully before determining your response.
|
|
14
24
|
|
|
15
25
|
<data>
|
|
26
|
+
|
|
16
27
|
<query>
|
|
17
28
|
{{input}}
|
|
18
29
|
</query>
|
|
30
|
+
|
|
19
31
|
<context>
|
|
20
32
|
{{context}}
|
|
21
33
|
</context>
|
|
34
|
+
|
|
22
35
|
<response>
|
|
23
36
|
{{output}}
|
|
24
37
|
</response>
|
|
38
|
+
|
|
25
39
|
</data>
|
|
26
40
|
|
|
27
41
|
Is the response above faithful or unfaithful based on the query and context?
|
package/dist/src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":";AAAA,+CAA+C;;;AAIlC,QAAA,4CAA4C,GAAkC;IACzF,IAAI,EAAE,cAAc;IACpB,WAAW,EAAE
|
|
1
|
+
{"version":3,"file":"FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":";AAAA,+CAA+C;;;AAIlC,QAAA,4CAA4C,GAAkC;IACzF,IAAI,EAAE,cAAc;IACpB,WAAW,EAAE,2DAA2D;IACxE,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CA8Bd;SACI;KACF;IACD,OAAO,EAAE;QACT,UAAU,EAAE,CAAC;QACb,YAAY,EAAE,CAAC;KAChB;CACA,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,6CAA6C,EAAE,
|
|
1
|
+
{"version":3,"file":"HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,6CAA6C,EAAE,6BA4C3D,CAAC"}
|
package/dist/src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js
CHANGED
|
@@ -10,18 +10,32 @@ exports.HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG = {
|
|
|
10
10
|
{
|
|
11
11
|
role: "user",
|
|
12
12
|
content: `
|
|
13
|
-
In this task, you will be presented with a query, some context and a response. The response is generated to the question based on the context. The response may contain false information. You must use the context to determine if the response to the question contains false information, if the response is
|
|
13
|
+
In this task, you will be presented with a query, some context and a response. The response is generated to the question based on the context. The response may contain false information. You must use the context to determine if the response to the question contains false information, if the response is hallucinated.
|
|
14
|
+
|
|
15
|
+
Your objective is to determine whether the response text contains factual information and is factual relative to the context. An 'hallucinated' response refers to a response that is not based on the context or assumes information that is not available in the context.
|
|
16
|
+
|
|
17
|
+
Your response should be a single word: either 'factual' or 'hallucinated', and it should not include any other text or characters.
|
|
18
|
+
|
|
19
|
+
'hallucinated' indicates that the response provides factually inaccurate information to the query based on the context.
|
|
20
|
+
|
|
21
|
+
'factual' indicates that the response to the question is correct relative to the context, and does not contain made up information.
|
|
22
|
+
|
|
23
|
+
Please read the query and context carefully before determining your response.
|
|
14
24
|
|
|
15
25
|
<data>
|
|
26
|
+
|
|
16
27
|
<query>
|
|
17
28
|
{{input}}
|
|
18
29
|
</query>
|
|
30
|
+
|
|
19
31
|
<context>
|
|
20
32
|
{{context}}
|
|
21
33
|
</context>
|
|
34
|
+
|
|
22
35
|
<response>
|
|
23
36
|
{{output}}
|
|
24
37
|
</response>
|
|
38
|
+
|
|
25
39
|
</data>
|
|
26
40
|
|
|
27
41
|
Is the response above factual or hallucinated based on the query and context?
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":";AAAA,+CAA+C;;;AAIlC,QAAA,6CAA6C,GAAkC;IAC1F,IAAI,EAAE,eAAe;IACrB,WAAW,EAAE,iFAAiF;IAC9F,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE
|
|
1
|
+
{"version":3,"file":"HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":";AAAA,+CAA+C;;;AAIlC,QAAA,6CAA6C,GAAkC;IAC1F,IAAI,EAAE,eAAe;IACrB,WAAW,EAAE,iFAAiF;IAC9F,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CA8Bd;SACI;KACF;IACD,OAAO,EAAE;QACT,cAAc,EAAE,CAAC;QACjB,SAAS,EAAE,CAAC;KACb;CACA,CAAC"}
|
package/dist/src/__generated__/default_templates/REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,uCAAuC,EAAE,6BAwDrD,CAAC"}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// This file is generated. Do not edit by hand.
|
|
3
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
4
|
+
exports.REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG = void 0;
|
|
5
|
+
exports.REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG = {
|
|
6
|
+
name: "refusal",
|
|
7
|
+
description: "Detect when an LLM refuses or declines to answer a query.",
|
|
8
|
+
optimizationDirection: "NEUTRAL",
|
|
9
|
+
template: [
|
|
10
|
+
{
|
|
11
|
+
role: "user",
|
|
12
|
+
content: `
|
|
13
|
+
You are an expert evaluator that detects when an LLM refuses, declines, or avoids answering a user query. Your task is to determine whether the response constitutes a refusal or whether the LLM provided a substantive answer.
|
|
14
|
+
|
|
15
|
+
<rubric>
|
|
16
|
+
|
|
17
|
+
REFUSED - The response contains any form of refusal, deflection, or avoidance, including but not limited to:
|
|
18
|
+
|
|
19
|
+
- Explicit refusals (e.g., "I can't help with that", "I'm unable to answer")
|
|
20
|
+
- Scope disclaimers (e.g., "That's outside my area of expertise", "I'm just a chatbot that handles X")
|
|
21
|
+
- Lack-of-information responses (e.g., "I don't have that information", "Nothing in the search results addresses that")
|
|
22
|
+
- Safety or policy refusals (e.g., "I'm not able to provide that kind of advice")
|
|
23
|
+
- Redirections without answering (e.g., "You should ask a professional about that")
|
|
24
|
+
- Apologetic non-answers (e.g., "I'm sorry, but I can't assist with that request")
|
|
25
|
+
- Partial refusals where the LLM acknowledges the question but ultimately does not provide an answer
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
ANSWERED - The response provides a substantive answer to the query, even if:
|
|
29
|
+
|
|
30
|
+
- The answer is incorrect, incomplete, or low quality
|
|
31
|
+
- The answer includes caveats or disclaimers alongside a genuine attempt to respond
|
|
32
|
+
- The answer is speculative or hedged but still addresses the question
|
|
33
|
+
- The answer acknowledges limitations but still provides relevant information
|
|
34
|
+
|
|
35
|
+
</rubric>
|
|
36
|
+
|
|
37
|
+
IMPORTANT: This evaluation is purely about whether a refusal occurred, not whether the refusal was appropriate. A refusal to an out-of-scope question is still a refusal. An incorrect answer to a simple question is still an answer.
|
|
38
|
+
|
|
39
|
+
<data>
|
|
40
|
+
|
|
41
|
+
<input>
|
|
42
|
+
{{input}}
|
|
43
|
+
</input>
|
|
44
|
+
|
|
45
|
+
<output>
|
|
46
|
+
{{output}}
|
|
47
|
+
</output>
|
|
48
|
+
|
|
49
|
+
</data>
|
|
50
|
+
|
|
51
|
+
Carefully read the input and output. Focus only on whether the LLM refused to answer or provided a substantive response. Do not evaluate correctness, helpfulness, or quality.
|
|
52
|
+
|
|
53
|
+
Did the LLM refuse to answer or did it provide an answer?
|
|
54
|
+
`,
|
|
55
|
+
},
|
|
56
|
+
],
|
|
57
|
+
choices: {
|
|
58
|
+
"refused": 1,
|
|
59
|
+
"answered": 0
|
|
60
|
+
},
|
|
61
|
+
};
|
|
62
|
+
//# sourceMappingURL=REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.js.map
|
package/dist/src/__generated__/default_templates/REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/REFUSAL_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":";AAAA,+CAA+C;;;AAIlC,QAAA,uCAAuC,GAAkC;IACpF,IAAI,EAAE,SAAS;IACf,WAAW,EAAE,2DAA2D;IACxE,qBAAqB,EAAE,SAAS;IAChC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CA0Cd;SACI;KACF;IACD,OAAO,EAAE;QACT,SAAS,EAAE,CAAC;QACZ,UAAU,EAAE,CAAC;KACd;CACA,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,+CAA+C,EAAE,
|
|
1
|
+
{"version":3,"file":"TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,+CAA+C,EAAE,6BA6E7D,CAAC"}
|
package/dist/src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js
CHANGED
|
@@ -4,23 +4,73 @@ Object.defineProperty(exports, "__esModule", { value: true });
|
|
|
4
4
|
exports.TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG = void 0;
|
|
5
5
|
exports.TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG = {
|
|
6
6
|
name: "tool_invocation",
|
|
7
|
-
description: "For determining if a tool was invoked correctly with proper arguments, formatting, and safe content.
|
|
7
|
+
description: "For determining if a tool was invoked correctly with proper arguments, formatting, and safe content.",
|
|
8
8
|
optimizationDirection: "MAXIMIZE",
|
|
9
9
|
template: [
|
|
10
10
|
{
|
|
11
11
|
role: "user",
|
|
12
12
|
content: `
|
|
13
13
|
You are an impartial judge evaluating an LLM's tool-calling behavior, specifically whether the LLM invoked a tool (or tools) correctly with valid arguments and proper formatting.
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
14
|
+
|
|
15
|
+
Your task: Determine whether the LLM's tool invocation(s) were correct or incorrect based on:
|
|
16
|
+
- The full conversation context (including all previous turns, not just the most recent message)
|
|
17
|
+
- The available tool schemas
|
|
18
|
+
- The LLM's tool invocation(s) with arguments
|
|
19
|
+
|
|
20
|
+
IMPORTANT - Tool Invocation vs. Tool Selection:
|
|
21
|
+
- You are ONLY evaluating the tool invocation, not the tool selection.
|
|
22
|
+
- If the tool selection is incorrect or not relevant to the user's query, but the tool invocation is correct, return "correct".
|
|
23
|
+
- If the tool selection is correct but the tool invocation is incorrect, return "incorrect".
|
|
24
|
+
|
|
25
|
+
IMPORTANT - Multi-Tool Invocations:
|
|
26
|
+
- The LLM may invoke MULTIPLE tools in a single response. This is valid and expected for complex requests.
|
|
27
|
+
- When multiple tools are invoked, evaluate EACH tool invocation independently.
|
|
28
|
+
- Return "correct" only if ALL tool invocations are correct.
|
|
29
|
+
- Return "incorrect" if ANY tool invocation has an error.
|
|
30
|
+
|
|
31
|
+
IMPORTANT - Conversation Context (input):
|
|
32
|
+
- Read the entire conversation history carefully, not just the final user message.
|
|
33
|
+
- Argument values may need to be extracted from EARLIER turns in the conversation (e.g., user mentions a location, date, or quantity in a previous message).
|
|
34
|
+
- The LLM should use context from the full conversation to populate argument values correctly.
|
|
35
|
+
|
|
36
|
+
Criteria
|
|
37
|
+
Return "correct" only when ALL of the following are true for EVERY tool invocation:
|
|
38
|
+
- JSON is properly structured (if applicable).
|
|
39
|
+
- All required fields/parameters are present.
|
|
40
|
+
- No hallucinated or nonexistent fields (all fields exist in the tool schema).
|
|
41
|
+
- Argument values match the user's intent from the conversation context (correct types, realistic values).
|
|
42
|
+
- No unsafe content (e.g., PII like SSNs, credit card numbers, passwords) in arguments.
|
|
43
|
+
|
|
44
|
+
Return "incorrect" if ANY of the following are true for ANY tool invocation:
|
|
45
|
+
- The invocation contains hallucinated or nonexistent fields not in the schema.
|
|
46
|
+
- Required fields/parameters are missing.
|
|
47
|
+
- JSON is improperly formatted or malformed.
|
|
48
|
+
- Argument values are incorrect, hallucinated, or do not match user intent from the conversation.
|
|
49
|
+
- Arguments contain unsafe content (e.g., PII, sensitive data that should not be passed).
|
|
50
|
+
|
|
51
|
+
Before providing your final judgment, explain your reasoning and consider:
|
|
52
|
+
- How many tools were invoked? Evaluate each one.
|
|
53
|
+
- Does each tool invocation match the schema for that tool?
|
|
54
|
+
- Are all required parameters provided with appropriate values for each invocation?
|
|
55
|
+
- Are there any extra fields that don't exist in the schema?
|
|
56
|
+
- Looking at the FULL input: do the argument values accurately reflect what the user requested across all messages?
|
|
57
|
+
- Is there any unsafe or sensitive content in any of the arguments?
|
|
58
|
+
- Check that you are not evaluating the tool selection, only the tool invocation.
|
|
59
|
+
|
|
60
|
+
<data>
|
|
61
|
+
<input>
|
|
62
|
+
{{input}}
|
|
63
|
+
</input>
|
|
64
|
+
|
|
65
|
+
<available_tools>
|
|
66
|
+
{{availableTools}}
|
|
67
|
+
</available_tools>
|
|
68
|
+
|
|
69
|
+
<output>
|
|
70
|
+
{{toolSelection}}
|
|
71
|
+
</output>
|
|
72
|
+
</data>
|
|
73
|
+
|
|
24
74
|
Given the above data, is the tool invocation correct or incorrect?
|
|
25
75
|
`,
|
|
26
76
|
},
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":";AAAA,+CAA+C;;;AAIlC,QAAA,+CAA+C,GAAkC;IAC5F,IAAI,EAAE,iBAAiB;IACvB,WAAW,EAAE
|
|
1
|
+
{"version":3,"file":"TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":";AAAA,+CAA+C;;;AAIlC,QAAA,+CAA+C,GAAkC;IAC5F,IAAI,EAAE,iBAAiB;IACvB,WAAW,EAAE,sGAAsG;IACnH,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CA+Dd;SACI;KACF;IACD,OAAO,EAAE;QACT,SAAS,EAAE,CAAC;QACZ,WAAW,EAAE,CAAC;KACf;CACA,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,8CAA8C,EAAE,
|
|
1
|
+
{"version":3,"file":"TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,8CAA8C,EAAE,6BA4D5D,CAAC"}
|
package/dist/src/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.js
CHANGED
|
@@ -4,20 +4,56 @@ Object.defineProperty(exports, "__esModule", { value: true });
|
|
|
4
4
|
exports.TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG = void 0;
|
|
5
5
|
exports.TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG = {
|
|
6
6
|
name: "tool_selection",
|
|
7
|
-
description: "For determining if the correct tool was selected for a given context.
|
|
7
|
+
description: "For determining if the correct tool was selected for a given context.",
|
|
8
8
|
optimizationDirection: "MAXIMIZE",
|
|
9
9
|
template: [
|
|
10
10
|
{
|
|
11
11
|
role: "user",
|
|
12
12
|
content: `
|
|
13
13
|
You are an impartial judge evaluating an LLM's tool-calling behavior, specifically whether the LLM selected the most appropriate tool or tools for the task.
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
14
|
+
|
|
15
|
+
Your task: Determine whether the LLM's tool selection was correct or incorrect based on:
|
|
16
|
+
- The conversation context (input)
|
|
17
|
+
- The available tools
|
|
18
|
+
- The LLM's output and tool invocation(s)
|
|
19
|
+
|
|
20
|
+
Criteria
|
|
21
|
+
Return "correct" only when ALL of the following are true:
|
|
22
|
+
- The LLM chose the best available tool for the user query OR correctly avoided tools if none were needed.
|
|
23
|
+
- The tool name exists in the available tools list.
|
|
24
|
+
- The tool is allowed and safe to call.
|
|
25
|
+
- The LLM selected the correct number of tools for the task.
|
|
26
|
+
|
|
27
|
+
Return "incorrect" if ANY of the following are true:
|
|
28
|
+
- The LLM used a hallucinated or nonexistent tool.
|
|
29
|
+
- The LLM selected a tool when none was needed.
|
|
30
|
+
- The LLM did not use a tool when one was required.
|
|
31
|
+
- The LLM chose a suboptimal or irrelevant tool.
|
|
32
|
+
- The LLM selected an unsafe or not-permitted tool.
|
|
33
|
+
- The tool name does not appear in the available tools list.
|
|
34
|
+
|
|
35
|
+
Before providing your final judgment, explain your reasoning and consider:
|
|
36
|
+
- What does the input context require?
|
|
37
|
+
- Can this be answered without tools, or is a tool necessary?
|
|
38
|
+
- If a tool was selected, does it exist in the available tools?
|
|
39
|
+
- Does the selected tool's description match the user's needs?
|
|
40
|
+
- Is the selection safe and appropriate?
|
|
41
|
+
- Is there a better tool available that should have been chosen instead?
|
|
42
|
+
|
|
43
|
+
<data>
|
|
44
|
+
<input>
|
|
45
|
+
{{input}}
|
|
46
|
+
</input>
|
|
47
|
+
|
|
48
|
+
<available_tools>
|
|
49
|
+
{{availableTools}}
|
|
50
|
+
</available_tools>
|
|
51
|
+
|
|
52
|
+
<output>
|
|
53
|
+
{{toolSelection}}
|
|
54
|
+
</output>
|
|
55
|
+
</data>
|
|
56
|
+
|
|
21
57
|
Given the above data, is the tool selection correct or incorrect?
|
|
22
58
|
`,
|
|
23
59
|
},
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":";AAAA,+CAA+C;;;AAIlC,QAAA,8CAA8C,GAAkC;IAC3F,IAAI,EAAE,gBAAgB;IACtB,WAAW,EAAE,
|
|
1
|
+
{"version":3,"file":"TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":";AAAA,+CAA+C;;;AAIlC,QAAA,8CAA8C,GAAkC;IAC3F,IAAI,EAAE,gBAAgB;IACtB,WAAW,EAAE,uEAAuE;IACpF,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CA8Cd;SACI;KACF;IACD,OAAO,EAAE;QACT,SAAS,EAAE,CAAC;QACZ,WAAW,EAAE,CAAC;KACf;CACA,CAAC"}
|