judgeval 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. package/dist/cjs/common/logger.js +28 -24
  2. package/dist/cjs/common/logger.js.map +1 -1
  3. package/dist/cjs/common/tracer.js +80 -130
  4. package/dist/cjs/common/tracer.js.map +1 -1
  5. package/dist/cjs/constants.js +2 -1
  6. package/dist/cjs/constants.js.map +1 -1
  7. package/dist/cjs/data/datasets/eval-dataset-client.js +45 -0
  8. package/dist/cjs/data/datasets/eval-dataset-client.js.map +1 -1
  9. package/dist/cjs/e2etests/eval-operations.test.js +3 -3
  10. package/dist/cjs/exporters/otel-exporter.js +352 -0
  11. package/dist/cjs/exporters/otel-exporter.js.map +1 -0
  12. package/dist/cjs/judges/index.js +217 -0
  13. package/dist/cjs/judges/index.js.map +1 -0
  14. package/dist/cjs/run-evaluation.js +13 -13
  15. package/dist/cjs/run-evaluation.js.map +1 -1
  16. package/dist/cjs/scorers/metrics/answer-correctness/answer-correctness.js +610 -0
  17. package/dist/cjs/scorers/metrics/answer-correctness/answer-correctness.js.map +1 -0
  18. package/dist/cjs/scorers/metrics/answer-correctness/index.js +19 -0
  19. package/dist/cjs/scorers/metrics/answer-correctness/index.js.map +1 -0
  20. package/dist/cjs/scorers/metrics/answer-correctness/prompts.js +175 -0
  21. package/dist/cjs/scorers/metrics/answer-correctness/prompts.js.map +1 -0
  22. package/dist/cjs/scorers/metrics/answer-relevancy/answer-relevancy.js +525 -0
  23. package/dist/cjs/scorers/metrics/answer-relevancy/answer-relevancy.js.map +1 -0
  24. package/dist/cjs/scorers/metrics/answer-relevancy/index.js +19 -0
  25. package/dist/cjs/scorers/metrics/answer-relevancy/index.js.map +1 -0
  26. package/dist/cjs/scorers/metrics/answer-relevancy/prompts.js +179 -0
  27. package/dist/cjs/scorers/metrics/answer-relevancy/prompts.js.map +1 -0
  28. package/dist/cjs/scorers/metrics/faithfulness/faithfulness.js +524 -0
  29. package/dist/cjs/scorers/metrics/faithfulness/faithfulness.js.map +1 -0
  30. package/dist/cjs/scorers/metrics/faithfulness/index.js +19 -0
  31. package/dist/cjs/scorers/metrics/faithfulness/index.js.map +1 -0
  32. package/dist/cjs/scorers/metrics/faithfulness/prompts.js +232 -0
  33. package/dist/cjs/scorers/metrics/faithfulness/prompts.js.map +1 -0
  34. package/dist/cjs/scorers/metrics/hallucination/hallucination.js +390 -0
  35. package/dist/cjs/scorers/metrics/hallucination/hallucination.js.map +1 -0
  36. package/dist/cjs/scorers/metrics/hallucination/index.js +11 -0
  37. package/dist/cjs/scorers/metrics/hallucination/index.js.map +1 -0
  38. package/dist/cjs/scorers/metrics/hallucination/prompts.js +106 -0
  39. package/dist/cjs/scorers/metrics/hallucination/prompts.js.map +1 -0
  40. package/dist/cjs/scorers/metrics/instruction-adherence/index.js +19 -0
  41. package/dist/cjs/scorers/metrics/instruction-adherence/index.js.map +1 -0
  42. package/dist/cjs/scorers/metrics/instruction-adherence/instruction-adherence.js +382 -0
  43. package/dist/cjs/scorers/metrics/instruction-adherence/instruction-adherence.js.map +1 -0
  44. package/dist/cjs/scorers/metrics/instruction-adherence/prompts.js +124 -0
  45. package/dist/cjs/scorers/metrics/instruction-adherence/prompts.js.map +1 -0
  46. package/dist/esm/common/logger.js +16 -11
  47. package/dist/esm/common/logger.js.map +1 -1
  48. package/dist/esm/common/tracer.js +78 -128
  49. package/dist/esm/common/tracer.js.map +1 -1
  50. package/dist/esm/constants.js +1 -0
  51. package/dist/esm/constants.js.map +1 -1
  52. package/dist/esm/data/datasets/eval-dataset-client.js +46 -1
  53. package/dist/esm/data/datasets/eval-dataset-client.js.map +1 -1
  54. package/dist/esm/e2etests/eval-operations.test.js +3 -3
  55. package/dist/esm/exporters/otel-exporter.js +348 -0
  56. package/dist/esm/exporters/otel-exporter.js.map +1 -0
  57. package/dist/esm/judges/index.js +185 -0
  58. package/dist/esm/judges/index.js.map +1 -0
  59. package/dist/esm/scorers/metrics/answer-correctness/answer-correctness.js +601 -0
  60. package/dist/esm/scorers/metrics/answer-correctness/answer-correctness.js.map +1 -0
  61. package/dist/esm/scorers/metrics/answer-correctness/index.js +3 -0
  62. package/dist/esm/scorers/metrics/answer-correctness/index.js.map +1 -0
  63. package/dist/esm/scorers/metrics/answer-correctness/prompts.js +171 -0
  64. package/dist/esm/scorers/metrics/answer-correctness/prompts.js.map +1 -0
  65. package/dist/esm/scorers/metrics/answer-relevancy/answer-relevancy.js +521 -0
  66. package/dist/esm/scorers/metrics/answer-relevancy/answer-relevancy.js.map +1 -0
  67. package/dist/esm/scorers/metrics/answer-relevancy/index.js +3 -0
  68. package/dist/esm/scorers/metrics/answer-relevancy/index.js.map +1 -0
  69. package/dist/esm/scorers/metrics/answer-relevancy/prompts.js +175 -0
  70. package/dist/esm/scorers/metrics/answer-relevancy/prompts.js.map +1 -0
  71. package/dist/esm/scorers/metrics/faithfulness/faithfulness.js +520 -0
  72. package/dist/esm/scorers/metrics/faithfulness/faithfulness.js.map +1 -0
  73. package/dist/esm/scorers/metrics/faithfulness/index.js +3 -0
  74. package/dist/esm/scorers/metrics/faithfulness/index.js.map +1 -0
  75. package/dist/esm/scorers/metrics/faithfulness/prompts.js +228 -0
  76. package/dist/esm/scorers/metrics/faithfulness/prompts.js.map +1 -0
  77. package/dist/esm/scorers/metrics/hallucination/hallucination.js +386 -0
  78. package/dist/esm/scorers/metrics/hallucination/hallucination.js.map +1 -0
  79. package/dist/esm/scorers/metrics/hallucination/index.js +3 -0
  80. package/dist/esm/scorers/metrics/hallucination/index.js.map +1 -0
  81. package/dist/esm/scorers/metrics/hallucination/prompts.js +102 -0
  82. package/dist/esm/scorers/metrics/hallucination/prompts.js.map +1 -0
  83. package/dist/esm/scorers/metrics/instruction-adherence/index.js +3 -0
  84. package/dist/esm/scorers/metrics/instruction-adherence/index.js.map +1 -0
  85. package/dist/esm/scorers/metrics/instruction-adherence/instruction-adherence.js +378 -0
  86. package/dist/esm/scorers/metrics/instruction-adherence/instruction-adherence.js.map +1 -0
  87. package/dist/esm/scorers/metrics/instruction-adherence/prompts.js +120 -0
  88. package/dist/esm/scorers/metrics/instruction-adherence/prompts.js.map +1 -0
  89. package/dist/types/common/logger.d.ts +1 -1
  90. package/dist/types/constants.d.ts +1 -0
  91. package/dist/types/data/datasets/eval-dataset-client.d.ts +5 -0
  92. package/dist/types/exporters/otel-exporter.d.ts +16 -0
  93. package/dist/types/judges/index.d.ts +50 -0
  94. package/dist/types/scorers/metrics/answer-correctness/answer-correctness.d.ts +99 -0
  95. package/dist/types/scorers/metrics/answer-correctness/index.d.ts +2 -0
  96. package/dist/types/scorers/metrics/answer-correctness/prompts.d.ts +71 -0
  97. package/dist/types/scorers/metrics/answer-relevancy/answer-relevancy.d.ts +78 -0
  98. package/dist/types/scorers/metrics/answer-relevancy/index.d.ts +2 -0
  99. package/dist/types/scorers/metrics/answer-relevancy/prompts.d.ts +71 -0
  100. package/dist/types/scorers/metrics/faithfulness/faithfulness.d.ts +77 -0
  101. package/dist/types/scorers/metrics/faithfulness/index.d.ts +2 -0
  102. package/dist/types/scorers/metrics/faithfulness/prompts.d.ts +94 -0
  103. package/dist/types/scorers/metrics/hallucination/hallucination.d.ts +67 -0
  104. package/dist/types/scorers/metrics/hallucination/index.d.ts +3 -0
  105. package/dist/types/scorers/metrics/hallucination/prompts.d.ts +63 -0
  106. package/dist/types/scorers/metrics/instruction-adherence/index.d.ts +2 -0
  107. package/dist/types/scorers/metrics/instruction-adherence/instruction-adherence.d.ts +67 -0
  108. package/dist/types/scorers/metrics/instruction-adherence/prompts.d.ts +78 -0
  109. package/package.json +32 -14
@@ -0,0 +1,63 @@
1
+ import { z } from 'zod';
2
+ /**
3
+ * Schema for hallucination verdict
4
+ */
5
+ export declare const HallucinationVerdictSchema: z.ZodObject<{
6
+ verdict: z.ZodString;
7
+ reason: z.ZodString;
8
+ }, "strip", z.ZodTypeAny, {
9
+ reason: string;
10
+ verdict: string;
11
+ }, {
12
+ reason: string;
13
+ verdict: string;
14
+ }>;
15
+ export type HallucinationVerdict = z.infer<typeof HallucinationVerdictSchema>;
16
+ /**
17
+ * Schema for verdicts
18
+ */
19
+ export declare const VerdictsSchema: z.ZodObject<{
20
+ verdicts: z.ZodArray<z.ZodObject<{
21
+ verdict: z.ZodString;
22
+ reason: z.ZodString;
23
+ }, "strip", z.ZodTypeAny, {
24
+ reason: string;
25
+ verdict: string;
26
+ }, {
27
+ reason: string;
28
+ verdict: string;
29
+ }>, "many">;
30
+ }, "strip", z.ZodTypeAny, {
31
+ verdicts: {
32
+ reason: string;
33
+ verdict: string;
34
+ }[];
35
+ }, {
36
+ verdicts: {
37
+ reason: string;
38
+ verdict: string;
39
+ }[];
40
+ }>;
41
+ /**
42
+ * Schema for reason
43
+ */
44
+ export declare const ReasonSchema: z.ZodObject<{
45
+ reason: z.ZodString;
46
+ }, "strip", z.ZodTypeAny, {
47
+ reason: string;
48
+ }, {
49
+ reason: string;
50
+ }>;
51
+ /**
52
+ * Templates for hallucination scorer prompts
53
+ */
54
+ export declare class HallucinationTemplate {
55
+ /**
56
+ * Generate a prompt to evaluate hallucinations in the actual output
57
+ */
58
+ static generateVerdicts(actualOutput: string, contexts: string[]): string;
59
+ /**
60
+ * Generate a prompt to create a reason for the hallucination score
61
+ */
62
+ static generateReason(actualOutput: string, contexts: string[]): string;
63
+ }
@@ -0,0 +1,2 @@
1
+ export * from './instruction-adherence.js';
2
+ export * from './prompts.js';
@@ -0,0 +1,67 @@
1
+ import { Example } from '../../../data/example.js';
2
+ import { ScorerData } from '../../../data/result.js';
3
+ import { JudgevalScorer } from '../../base-scorer.js';
4
+ import { Judge } from '../../../judges/index.js';
5
+ /**
6
+ * InstructionAdherenceScorer evaluates how well an LLM follows instructions
7
+ * by extracting instructions from the input and checking if they are followed in the output.
8
+ *
9
+ * The score is the average of scores for each instruction (1 = followed, 0.5 = partially followed, 0 = not followed).
10
+ */
11
+ export declare class InstructionAdherenceScorer extends JudgevalScorer {
12
+ private model;
13
+ private using_native_model;
14
+ private _instructions;
15
+ private _verdicts;
16
+ /**
17
+ * Create a new InstructionAdherenceScorer
18
+ *
19
+ * @param threshold - Success threshold (default: 0.5)
20
+ * @param model - Model to use for evaluation (default: DefaultJudge)
21
+ * @param include_reason - Whether to include a reason for the score (default: true)
22
+ * @param async_mode - Whether to use async mode (default: false)
23
+ * @param strict_mode - Whether to use strict mode (default: false)
24
+ * @param verbose_mode - Whether to include verbose logs (default: false)
25
+ */
26
+ constructor(threshold?: number, model?: string | Judge | undefined, include_reason?: boolean, async_mode?: boolean, strict_mode?: boolean, verbose_mode?: boolean);
27
+ /**
28
+ * Extract instructions from input text
29
+ */
30
+ private _aGetInstructions;
31
+ /**
32
+ * Extract instructions from input text (synchronous)
33
+ */
34
+ private _getInstructions;
35
+ /**
36
+ * Generate verdicts for each instruction
37
+ */
38
+ private _aGetVerdicts;
39
+ /**
40
+ * Generate verdicts for each instruction (synchronous)
41
+ */
42
+ private _getVerdicts;
43
+ /**
44
+ * Calculate the instruction adherence score
45
+ */
46
+ private _computeScore;
47
+ /**
48
+ * Create verbose logs for debugging
49
+ */
50
+ private _createVerboseLogs;
51
+ /**
52
+ * Check if example has required parameters
53
+ */
54
+ private _checkExampleParams;
55
+ /**
56
+ * Score an example synchronously
57
+ */
58
+ syncScoreExample(example: Example): ScorerData;
59
+ /**
60
+ * Score an example asynchronously
61
+ */
62
+ scoreExample(example: Example): Promise<ScorerData>;
63
+ /**
64
+ * Get the name of the scorer
65
+ */
66
+ get name(): string;
67
+ }
@@ -0,0 +1,78 @@
1
+ import { z } from 'zod';
2
+ /**
3
+ * Schema for a single instruction adherence verdict
4
+ */
5
+ export declare const VerdictSchema: z.ZodObject<{
6
+ instruction: z.ZodString;
7
+ score: z.ZodNumber;
8
+ reason: z.ZodString;
9
+ }, "strip", z.ZodTypeAny, {
10
+ score: number;
11
+ reason: string;
12
+ instruction: string;
13
+ }, {
14
+ score: number;
15
+ reason: string;
16
+ instruction: string;
17
+ }>;
18
+ /**
19
+ * Schema for a list of verdicts
20
+ */
21
+ export declare const VerdictsSchema: z.ZodObject<{
22
+ verdicts: z.ZodArray<z.ZodObject<{
23
+ instruction: z.ZodString;
24
+ score: z.ZodNumber;
25
+ reason: z.ZodString;
26
+ }, "strip", z.ZodTypeAny, {
27
+ score: number;
28
+ reason: string;
29
+ instruction: string;
30
+ }, {
31
+ score: number;
32
+ reason: string;
33
+ instruction: string;
34
+ }>, "many">;
35
+ }, "strip", z.ZodTypeAny, {
36
+ verdicts: {
37
+ score: number;
38
+ reason: string;
39
+ instruction: string;
40
+ }[];
41
+ }, {
42
+ verdicts: {
43
+ score: number;
44
+ reason: string;
45
+ instruction: string;
46
+ }[];
47
+ }>;
48
+ /**
49
+ * Schema for a list of instructions
50
+ */
51
+ export declare const InstructionsSchema: z.ZodObject<{
52
+ instructions: z.ZodArray<z.ZodString, "many">;
53
+ }, "strip", z.ZodTypeAny, {
54
+ instructions: string[];
55
+ }, {
56
+ instructions: string[];
57
+ }>;
58
+ /**
59
+ * Type for a single verdict
60
+ */
61
+ export type InstructionAdherenceVerdict = z.infer<typeof VerdictSchema>;
62
+ /**
63
+ * Type for a list of instructions
64
+ */
65
+ export type InstructionList = z.infer<typeof InstructionsSchema>;
66
+ /**
67
+ * Templates for prompts used in the InstructionAdherenceScorer
68
+ */
69
+ export declare class InstructionAdherenceTemplate {
70
+ /**
71
+ * Generate a prompt to extract instructions from input text
72
+ */
73
+ static getInstructions(input: string): string;
74
+ /**
75
+ * Generate a prompt to evaluate adherence to instructions
76
+ */
77
+ static generateVerdicts(instructions: string[], actualOutput: string): string;
78
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "judgeval",
3
- "version": "0.2.4",
3
+ "version": "0.2.5",
4
4
  "description": "Judgment SDK for TypeScript/JavaScript",
5
5
  "main": "./dist/cjs/index.js",
6
6
  "module": "./dist/esm/index.js",
@@ -22,46 +22,61 @@
22
22
  "@langchain/core": "^0.3.44",
23
23
  "@langchain/langgraph": "^0.2.63",
24
24
  "@langchain/openai": "^0.5.5",
25
- "@supabase/supabase-js": "^2.43.4",
25
+ "@opentelemetry/api": "^1.8.0",
26
+ "@opentelemetry/core": "^1.30.1",
27
+ "@opentelemetry/sdk-trace-base": "^1.25.1",
28
+ "@supabase/supabase-js": "^2.42.5",
26
29
  "@types/node-fetch": "^2.6.12",
27
30
  "@types/uuid": "^10.0.0",
31
+ "ansi-colors": "^4.1.3",
28
32
  "axios": "^1.7.2",
29
33
  "chalk": "4.1.2",
34
+ "cli-progress": "^3.12.0",
30
35
  "csv-writer": "^1.6.0",
31
36
  "dotenv": "^16.4.7",
32
37
  "eventsource-parser": "^1.1.2",
33
38
  "json-schema-to-ts": "^3.1.0",
34
39
  "langchain": "^0.3.21",
35
40
  "node-fetch": "^2.7.0",
41
+ "openai": "^4.0.0",
36
42
  "papaparse": "^5.4.1",
43
+ "together-ai": "^0.11.1",
37
44
  "undici-types": "^6.21.0",
38
- "uuid": "^11.1.0",
39
- "winston": "^3.17.0",
40
- "ws": "^8.18.1",
41
- "cli-progress": "^3.12.0",
42
- "ansi-colors": "^4.1.3"
45
+ "uuid": "^10.0.0",
46
+ "winston": "^3.13.1",
47
+ "ws": "^8.18.1"
43
48
  },
44
49
  "peerDependencies": {
45
50
  "@anthropic-ai/sdk": "^0.22.0",
46
51
  "openai": "^4.0.0",
47
- "together-ai": "^0.7.0"
52
+ "react": "^18.2.0"
48
53
  },
49
54
  "devDependencies": {
55
+ "@ai-sdk/openai": "^1.3.16",
56
+ "@ai-sdk/provider": "0.0.26",
57
+ "@jest/globals": "^29.7.0",
58
+ "@opentelemetry/sdk-node": "^0.52.0",
59
+ "@opentelemetry/sdk-trace-node": "^1.8.0",
60
+ "@types/cli-progress": "^3.11.6",
50
61
  "@types/jest": "^29.5.12",
51
- "@types/node": "^20.12.12",
62
+ "@types/node": "^20.14.10",
63
+ "@types/papaparse": "^5.3.15",
64
+ "@types/uuid": "^10.0.0",
52
65
  "@typescript-eslint/eslint-plugin": "^7.10.0",
53
66
  "@typescript-eslint/parser": "^7.10.0",
67
+ "ai": "^3.2.16",
54
68
  "cross-env": "^7.0.3",
55
69
  "eslint": "^8.57.0",
56
70
  "eslint-config-prettier": "^9.1.0",
57
71
  "eslint-plugin-prettier": "^5.1.3",
58
72
  "jest": "^29.7.0",
73
+ "nodemon": "^3.1.4",
59
74
  "prettier": "^3.2.5",
60
- "ts-jest": "^29.1.2",
61
- "typedoc": "^0.25.13",
62
- "typescript": "^5.4.5",
63
- "@types/cli-progress": "^3.11.6",
64
- "@types/papaparse": "^5.3.15"
75
+ "ts-jest": "^29.1.5",
76
+ "ts-node": "^10.9.2",
77
+ "tsx": "^4.16.2",
78
+ "typedoc": "^0.26.0",
79
+ "typescript": "^5.5.3"
65
80
  },
66
81
  "scripts": {
67
82
  "build": "rm -rf dist && tsc -p tsconfig.cjs.json && tsc -p tsconfig.esm.json",
@@ -103,5 +118,8 @@
103
118
  "import": "./dist/esm/index.js",
104
119
  "require": "./dist/cjs/index.js"
105
120
  }
121
+ },
122
+ "overrides": {
123
+ "@ai-sdk/provider": "0.0.26"
106
124
  }
107
125
  }