@evalgate/sdk 2.2.2 → 2.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +32 -0
- package/README.md +40 -1
- package/dist/assertions.d.ts +194 -10
- package/dist/assertions.js +525 -73
- package/dist/batch.js +4 -4
- package/dist/cache.d.ts +5 -1
- package/dist/cache.js +5 -1
- package/dist/cli/baseline.d.ts +14 -0
- package/dist/cli/baseline.js +43 -3
- package/dist/cli/check.d.ts +5 -2
- package/dist/cli/check.js +20 -12
- package/dist/cli/compare.d.ts +80 -0
- package/dist/cli/compare.js +266 -0
- package/dist/cli/index.js +244 -101
- package/dist/cli/regression-gate.js +23 -0
- package/dist/cli/run.js +22 -0
- package/dist/cli/start.d.ts +26 -0
- package/dist/cli/start.js +130 -0
- package/dist/cli/templates.d.ts +24 -0
- package/dist/cli/templates.js +314 -0
- package/dist/cli/traces.d.ts +109 -0
- package/dist/cli/traces.js +152 -0
- package/dist/cli/upgrade.js +5 -0
- package/dist/cli/validate.d.ts +37 -0
- package/dist/cli/validate.js +252 -0
- package/dist/cli/watch.d.ts +19 -0
- package/dist/cli/watch.js +175 -0
- package/dist/client.js +6 -13
- package/dist/constants.d.ts +2 -0
- package/dist/constants.js +5 -0
- package/dist/errors.js +7 -0
- package/dist/export.js +2 -2
- package/dist/index.d.ts +10 -9
- package/dist/index.js +24 -7
- package/dist/integrations/anthropic.js +6 -6
- package/dist/integrations/openai.js +84 -61
- package/dist/logger.d.ts +3 -1
- package/dist/logger.js +2 -1
- package/dist/otel.d.ts +130 -0
- package/dist/otel.js +309 -0
- package/dist/pagination.d.ts +13 -2
- package/dist/pagination.js +28 -2
- package/dist/runtime/adapters/testsuite-to-dsl.js +1 -6
- package/dist/runtime/eval.d.ts +14 -4
- package/dist/runtime/eval.js +127 -2
- package/dist/runtime/executor.d.ts +3 -2
- package/dist/runtime/executor.js +3 -2
- package/dist/runtime/registry.d.ts +8 -3
- package/dist/runtime/registry.js +15 -4
- package/dist/runtime/run-report.d.ts +1 -1
- package/dist/runtime/run-report.js +7 -4
- package/dist/runtime/types.d.ts +38 -0
- package/dist/snapshot.d.ts +12 -0
- package/dist/snapshot.js +24 -1
- package/dist/testing.d.ts +8 -0
- package/dist/testing.js +45 -10
- package/dist/version.d.ts +2 -2
- package/dist/version.js +2 -2
- package/dist/workflows.d.ts +2 -0
- package/dist/workflows.js +184 -102
- package/package.json +8 -1
package/dist/assertions.js
CHANGED
|
@@ -26,7 +26,10 @@ exports.containsJSON = containsJSON;
|
|
|
26
26
|
exports.notContainsPII = notContainsPII;
|
|
27
27
|
exports.hasPII = hasPII;
|
|
28
28
|
exports.hasSentiment = hasSentiment;
|
|
29
|
+
exports.hasSentimentWithScore = hasSentimentWithScore;
|
|
29
30
|
exports.similarTo = similarTo;
|
|
31
|
+
exports.hasConsistency = hasConsistency;
|
|
32
|
+
exports.hasConsistencyAsync = hasConsistencyAsync;
|
|
30
33
|
exports.withinRange = withinRange;
|
|
31
34
|
exports.isValidEmail = isValidEmail;
|
|
32
35
|
exports.isValidURL = isValidURL;
|
|
@@ -35,19 +38,37 @@ exports.matchesSchema = matchesSchema;
|
|
|
35
38
|
exports.hasReadabilityScore = hasReadabilityScore;
|
|
36
39
|
exports.containsLanguage = containsLanguage;
|
|
37
40
|
exports.hasFactualAccuracy = hasFactualAccuracy;
|
|
41
|
+
exports.respondedWithinDuration = respondedWithinDuration;
|
|
42
|
+
exports.respondedWithinTimeSince = respondedWithinTimeSince;
|
|
38
43
|
exports.respondedWithinTime = respondedWithinTime;
|
|
39
44
|
exports.hasNoToxicity = hasNoToxicity;
|
|
40
45
|
exports.followsInstructions = followsInstructions;
|
|
41
46
|
exports.containsAllRequiredFields = containsAllRequiredFields;
|
|
42
47
|
exports.configureAssertions = configureAssertions;
|
|
43
48
|
exports.getAssertionConfig = getAssertionConfig;
|
|
49
|
+
exports.resetSentimentDeprecationWarning = resetSentimentDeprecationWarning;
|
|
44
50
|
exports.hasSentimentAsync = hasSentimentAsync;
|
|
45
51
|
exports.hasNoToxicityAsync = hasNoToxicityAsync;
|
|
46
52
|
exports.containsLanguageAsync = containsLanguageAsync;
|
|
47
53
|
exports.hasValidCodeSyntaxAsync = hasValidCodeSyntaxAsync;
|
|
48
54
|
exports.hasFactualAccuracyAsync = hasFactualAccuracyAsync;
|
|
49
55
|
exports.hasNoHallucinationsAsync = hasNoHallucinationsAsync;
|
|
56
|
+
exports.toSemanticallyContain = toSemanticallyContain;
|
|
57
|
+
exports.toSemanticallyContainLLM = toSemanticallyContainLLM;
|
|
50
58
|
exports.hasValidCodeSyntax = hasValidCodeSyntax;
|
|
59
|
+
/**
|
|
60
|
+
* Test if a term appears in text as a whole word (word-boundary match)
|
|
61
|
+
* or as a phrase (for multi-word terms). Single words use \b regex to
|
|
62
|
+
* avoid false positives like "hell" matching "hello".
|
|
63
|
+
*/
|
|
64
|
+
function textContainsTerm(lowerText, term) {
|
|
65
|
+
if (term.includes(" ")) {
|
|
66
|
+
// Multi-word phrases: substring match is correct
|
|
67
|
+
return lowerText.includes(term);
|
|
68
|
+
}
|
|
69
|
+
// Single words: word-boundary match
|
|
70
|
+
return new RegExp(`\\b${term.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}\\b`, "i").test(lowerText);
|
|
71
|
+
}
|
|
51
72
|
class AssertionError extends Error {
|
|
52
73
|
constructor(message, expected, actual) {
|
|
53
74
|
super(message);
|
|
@@ -234,9 +255,10 @@ class Expectation {
|
|
|
234
255
|
let parsedJson = null;
|
|
235
256
|
try {
|
|
236
257
|
parsedJson = JSON.parse(String(this.value));
|
|
237
|
-
const
|
|
238
|
-
|
|
239
|
-
|
|
258
|
+
const entries = Object.entries(schema);
|
|
259
|
+
passed = entries.every(([key, expectedValue]) => parsedJson !== null &&
|
|
260
|
+
key in parsedJson &&
|
|
261
|
+
JSON.stringify(parsedJson[key]) === JSON.stringify(expectedValue));
|
|
240
262
|
}
|
|
241
263
|
catch (_e) {
|
|
242
264
|
passed = false;
|
|
@@ -436,43 +458,64 @@ class Expectation {
|
|
|
436
458
|
};
|
|
437
459
|
}
|
|
438
460
|
/**
|
|
439
|
-
* Assert value contains code block
|
|
461
|
+
* Assert value contains code block or raw code
|
|
440
462
|
* @example expect(output).toContainCode()
|
|
463
|
+
* @example expect(output).toContainCode('typescript')
|
|
441
464
|
*/
|
|
442
|
-
toContainCode(message) {
|
|
465
|
+
toContainCode(language, message) {
|
|
443
466
|
const text = String(this.value);
|
|
444
|
-
const
|
|
467
|
+
const hasMarkdownBlock = language
|
|
468
|
+
? new RegExp(`\`\`\`${language}[\\s\\S]*?\`\`\``).test(text)
|
|
469
|
+
: /```[\s\S]*?```/.test(text);
|
|
470
|
+
const hasHtmlBlock = /<code>[\s\S]*?<\/code>/.test(text);
|
|
471
|
+
const hasRawCode = /\bfunction\s+\w+\s*\(/.test(text) ||
|
|
472
|
+
/\b(?:const|let|var)\s+\w+\s*=/.test(text) ||
|
|
473
|
+
/\bclass\s+\w+/.test(text) ||
|
|
474
|
+
/=>\s*[{(]/.test(text) ||
|
|
475
|
+
/\bimport\s+.*\bfrom\b/.test(text) ||
|
|
476
|
+
/\bexport\s+(?:default\s+)?(?:function|class|const)/.test(text) ||
|
|
477
|
+
/\breturn\s+.+;/.test(text);
|
|
478
|
+
const hasCodeBlock = hasMarkdownBlock || hasHtmlBlock || hasRawCode;
|
|
445
479
|
return {
|
|
446
480
|
name: "toContainCode",
|
|
447
481
|
passed: hasCodeBlock,
|
|
448
|
-
expected: "code block",
|
|
482
|
+
expected: language ? `code block (${language})` : "code block",
|
|
449
483
|
actual: text,
|
|
450
|
-
message: message ||
|
|
451
|
-
(hasCodeBlock ? "Contains code block" : "No code block found"),
|
|
484
|
+
message: message || (hasCodeBlock ? "Contains code" : "No code found"),
|
|
452
485
|
};
|
|
453
486
|
}
|
|
454
487
|
/**
|
|
455
|
-
*
|
|
456
|
-
*
|
|
488
|
+
* Blocklist check for 7 common profane words. Does NOT analyze tone,
|
|
489
|
+
* formality, or professional communication quality. For actual tone
|
|
490
|
+
* analysis, use an LLM-backed assertion.
|
|
491
|
+
* @see hasSentimentAsync for LLM-based tone checking
|
|
492
|
+
* @example expect(output).toHaveNoProfanity()
|
|
457
493
|
*/
|
|
458
|
-
|
|
494
|
+
toHaveNoProfanity(message) {
|
|
459
495
|
const text = String(this.value).toLowerCase();
|
|
460
496
|
const profanity = ["damn", "hell", "shit", "fuck", "ass", "bitch", "crap"];
|
|
461
|
-
const foundProfanity = profanity.filter((word) => text
|
|
497
|
+
const foundProfanity = profanity.filter((word) => textContainsTerm(text, word));
|
|
462
498
|
const passed = foundProfanity.length === 0;
|
|
463
499
|
return {
|
|
464
|
-
name: "
|
|
500
|
+
name: "toHaveNoProfanity",
|
|
465
501
|
passed,
|
|
466
|
-
expected: "
|
|
502
|
+
expected: "no profanity",
|
|
467
503
|
actual: foundProfanity.length > 0
|
|
468
504
|
? `Found: ${foundProfanity.join(", ")}`
|
|
469
|
-
: "
|
|
505
|
+
: "clean",
|
|
470
506
|
message: message ||
|
|
471
507
|
(passed
|
|
472
|
-
? "
|
|
473
|
-
: `
|
|
508
|
+
? "No profanity found"
|
|
509
|
+
: `Profanity detected: ${foundProfanity.join(", ")}`),
|
|
474
510
|
};
|
|
475
511
|
}
|
|
512
|
+
/**
|
|
513
|
+
* @deprecated Use {@link toHaveNoProfanity} instead. This method only
|
|
514
|
+
* checks for 7 profane words — it does not analyze professional tone.
|
|
515
|
+
*/
|
|
516
|
+
toBeProfessional(message) {
|
|
517
|
+
return this.toHaveNoProfanity(message);
|
|
518
|
+
}
|
|
476
519
|
/**
|
|
477
520
|
* Assert value has proper grammar (basic checks)
|
|
478
521
|
* @example expect(output).toHaveProperGrammar()
|
|
@@ -690,6 +733,120 @@ function hasSentiment(text, expected) {
|
|
|
690
733
|
return negativeCount > positiveCount;
|
|
691
734
|
return positiveCount === negativeCount; // neutral
|
|
692
735
|
}
|
|
736
|
+
/**
|
|
737
|
+
* Lexicon-based sentiment check with confidence score.
|
|
738
|
+
* Returns the detected sentiment, a confidence score (0–1), and whether
|
|
739
|
+
* it matches the expected sentiment.
|
|
740
|
+
*
|
|
741
|
+
* Confidence is derived from the magnitude of the word-count difference
|
|
742
|
+
* relative to the total sentiment-bearing words found.
|
|
743
|
+
*
|
|
744
|
+
* @example
|
|
745
|
+
* ```ts
|
|
746
|
+
* const { sentiment, confidence, matches } = hasSentimentWithScore(
|
|
747
|
+
* "This product is absolutely amazing and wonderful!",
|
|
748
|
+
* "positive",
|
|
749
|
+
* );
|
|
750
|
+
* // sentiment: "positive", confidence: ~0.9, matches: true
|
|
751
|
+
* ```
|
|
752
|
+
*/
|
|
753
|
+
function hasSentimentWithScore(text, expected) {
|
|
754
|
+
const lower = text.toLowerCase();
|
|
755
|
+
const positiveWords = [
|
|
756
|
+
"good",
|
|
757
|
+
"great",
|
|
758
|
+
"excellent",
|
|
759
|
+
"amazing",
|
|
760
|
+
"wonderful",
|
|
761
|
+
"fantastic",
|
|
762
|
+
"love",
|
|
763
|
+
"best",
|
|
764
|
+
"happy",
|
|
765
|
+
"helpful",
|
|
766
|
+
"awesome",
|
|
767
|
+
"superb",
|
|
768
|
+
"outstanding",
|
|
769
|
+
"brilliant",
|
|
770
|
+
"perfect",
|
|
771
|
+
"delightful",
|
|
772
|
+
"joyful",
|
|
773
|
+
"pleased",
|
|
774
|
+
"glad",
|
|
775
|
+
"terrific",
|
|
776
|
+
"fabulous",
|
|
777
|
+
"exceptional",
|
|
778
|
+
"impressive",
|
|
779
|
+
"magnificent",
|
|
780
|
+
"marvelous",
|
|
781
|
+
"splendid",
|
|
782
|
+
"positive",
|
|
783
|
+
"enjoy",
|
|
784
|
+
"enjoyed",
|
|
785
|
+
"like",
|
|
786
|
+
"liked",
|
|
787
|
+
"beautiful",
|
|
788
|
+
"innovative",
|
|
789
|
+
"inspiring",
|
|
790
|
+
"effective",
|
|
791
|
+
"useful",
|
|
792
|
+
"valuable",
|
|
793
|
+
];
|
|
794
|
+
const negativeWords = [
|
|
795
|
+
"bad",
|
|
796
|
+
"terrible",
|
|
797
|
+
"awful",
|
|
798
|
+
"horrible",
|
|
799
|
+
"worst",
|
|
800
|
+
"hate",
|
|
801
|
+
"poor",
|
|
802
|
+
"disappointing",
|
|
803
|
+
"sad",
|
|
804
|
+
"useless",
|
|
805
|
+
"dreadful",
|
|
806
|
+
"miserable",
|
|
807
|
+
"angry",
|
|
808
|
+
"frustrated",
|
|
809
|
+
"broken",
|
|
810
|
+
"failed",
|
|
811
|
+
"pathetic",
|
|
812
|
+
"stupid",
|
|
813
|
+
"disgusting",
|
|
814
|
+
"unacceptable",
|
|
815
|
+
"wrong",
|
|
816
|
+
"error",
|
|
817
|
+
"fail",
|
|
818
|
+
"problem",
|
|
819
|
+
"negative",
|
|
820
|
+
"dislike",
|
|
821
|
+
"annoying",
|
|
822
|
+
"irritating",
|
|
823
|
+
"offensive",
|
|
824
|
+
"regret",
|
|
825
|
+
"disappointment",
|
|
826
|
+
"inadequate",
|
|
827
|
+
"mediocre",
|
|
828
|
+
"flawed",
|
|
829
|
+
"unreliable",
|
|
830
|
+
];
|
|
831
|
+
const positiveCount = positiveWords.filter((w) => lower.includes(w)).length;
|
|
832
|
+
const negativeCount = negativeWords.filter((w) => lower.includes(w)).length;
|
|
833
|
+
const total = positiveCount + negativeCount;
|
|
834
|
+
let sentiment;
|
|
835
|
+
let confidence;
|
|
836
|
+
if (positiveCount > negativeCount) {
|
|
837
|
+
sentiment = "positive";
|
|
838
|
+
confidence = total > 0 ? (positiveCount - negativeCount) / total : 0;
|
|
839
|
+
}
|
|
840
|
+
else if (negativeCount > positiveCount) {
|
|
841
|
+
sentiment = "negative";
|
|
842
|
+
confidence = total > 0 ? (negativeCount - positiveCount) / total : 0;
|
|
843
|
+
}
|
|
844
|
+
else {
|
|
845
|
+
sentiment = "neutral";
|
|
846
|
+
confidence = total === 0 ? 1 : 0; // high confidence neutral when no words found
|
|
847
|
+
}
|
|
848
|
+
return { sentiment, confidence, matches: sentiment === expected };
|
|
849
|
+
}
|
|
693
850
|
function similarTo(text1, text2, threshold = 0.8) {
|
|
694
851
|
// Simple similarity check - in a real app, you'd use a proper string similarity algorithm
|
|
695
852
|
const words1 = new Set(text1.toLowerCase().split(/\s+/));
|
|
@@ -698,6 +855,74 @@ function similarTo(text1, text2, threshold = 0.8) {
|
|
|
698
855
|
const union = new Set([...words1, ...words2]);
|
|
699
856
|
return intersection.size / union.size >= threshold;
|
|
700
857
|
}
|
|
858
|
+
/**
|
|
859
|
+
* Compute pairwise Jaccard similarity between word sets.
|
|
860
|
+
* Returns the mean of all C(n,2) pair similarities.
|
|
861
|
+
*/
|
|
862
|
+
function meanPairwiseJaccard(texts) {
|
|
863
|
+
if (texts.length < 2)
|
|
864
|
+
return 1;
|
|
865
|
+
const wordSets = texts.map((t) => new Set(t.toLowerCase().split(/\s+/).filter(Boolean)));
|
|
866
|
+
let sum = 0;
|
|
867
|
+
let count = 0;
|
|
868
|
+
for (let i = 0; i < wordSets.length; i++) {
|
|
869
|
+
for (let j = i + 1; j < wordSets.length; j++) {
|
|
870
|
+
const a = wordSets[i];
|
|
871
|
+
const b = wordSets[j];
|
|
872
|
+
const intersection = new Set([...a].filter((w) => b.has(w)));
|
|
873
|
+
const union = new Set([...a, ...b]);
|
|
874
|
+
sum += union.size > 0 ? intersection.size / union.size : 1;
|
|
875
|
+
count++;
|
|
876
|
+
}
|
|
877
|
+
}
|
|
878
|
+
return count > 0 ? sum / count : 1;
|
|
879
|
+
}
|
|
880
|
+
/**
|
|
881
|
+
* Measure consistency across multiple outputs for the same input.
|
|
882
|
+
* **Fast and approximate** — uses word-overlap (Jaccard) across all pairs.
|
|
883
|
+
* Returns a score from 0 (completely inconsistent) to 1 (identical).
|
|
884
|
+
*
|
|
885
|
+
* @param outputs - Array of LLM outputs to compare (minimum 2)
|
|
886
|
+
* @param threshold - Optional minimum consistency score to return true (default 0.7)
|
|
887
|
+
* @returns `{ score, consistent }` where `consistent` is `score >= threshold`
|
|
888
|
+
*
|
|
889
|
+
* @example
|
|
890
|
+
* ```ts
|
|
891
|
+
* const { score, consistent } = hasConsistency([
|
|
892
|
+
* "The capital of France is Paris.",
|
|
893
|
+
* "Paris is the capital of France.",
|
|
894
|
+
* "France's capital city is Paris.",
|
|
895
|
+
* ]);
|
|
896
|
+
* // score ≈ 0.6-0.8, consistent = true at default threshold
|
|
897
|
+
* ```
|
|
898
|
+
*/
|
|
899
|
+
function hasConsistency(outputs, threshold = 0.7) {
|
|
900
|
+
if (outputs.length < 2) {
|
|
901
|
+
return { score: 1, consistent: true };
|
|
902
|
+
}
|
|
903
|
+
const score = meanPairwiseJaccard(outputs);
|
|
904
|
+
return { score, consistent: score >= threshold };
|
|
905
|
+
}
|
|
906
|
+
/**
|
|
907
|
+
* LLM-backed consistency check. **Slow and accurate** — asks the LLM to
|
|
908
|
+
* judge whether multiple outputs convey the same meaning, catching
|
|
909
|
+
* paraphrased contradictions that word-overlap misses.
|
|
910
|
+
*
|
|
911
|
+
* @returns A score from 0 to 1 where 1 = perfectly consistent.
|
|
912
|
+
*/
|
|
913
|
+
async function hasConsistencyAsync(outputs, config) {
|
|
914
|
+
if (outputs.length < 2) {
|
|
915
|
+
return { score: 1, consistent: true };
|
|
916
|
+
}
|
|
917
|
+
const numbered = outputs.map((o, i) => `Output ${i + 1}: "${o}"`).join("\n");
|
|
918
|
+
const prompt = `Rate the semantic consistency of the following ${outputs.length} outputs on a scale from 0 to 100, where 100 means they all convey exactly the same meaning and 0 means they completely contradict each other. Reply with ONLY a number.\n\n${numbered}`;
|
|
919
|
+
const result = await callAssertionLLM(prompt, config);
|
|
920
|
+
const parsed = parseInt(result.replace(/[^0-9]/g, ""), 10);
|
|
921
|
+
const score = Number.isNaN(parsed)
|
|
922
|
+
? 0
|
|
923
|
+
: Math.min(100, Math.max(0, parsed)) / 100;
|
|
924
|
+
return { score, consistent: score >= 0.7 };
|
|
925
|
+
}
|
|
701
926
|
function withinRange(value, min, max) {
|
|
702
927
|
return value >= min && value <= max;
|
|
703
928
|
}
|
|
@@ -719,7 +944,7 @@ function isValidURL(url) {
|
|
|
719
944
|
* facts but cannot detect paraphrased fabrications. Use
|
|
720
945
|
* {@link hasNoHallucinationsAsync} for semantic accuracy.
|
|
721
946
|
*/
|
|
722
|
-
function hasNoHallucinations(text, groundTruth) {
|
|
947
|
+
function hasNoHallucinations(text, groundTruth = []) {
|
|
723
948
|
const lower = text.toLowerCase();
|
|
724
949
|
return groundTruth.every((truth) => lower.includes(truth.toLowerCase()));
|
|
725
950
|
}
|
|
@@ -739,12 +964,14 @@ function matchesSchema(value, schema) {
|
|
|
739
964
|
return Object.keys(schema).every((key) => key in obj);
|
|
740
965
|
}
|
|
741
966
|
function hasReadabilityScore(text, minScore) {
|
|
967
|
+
const threshold = typeof minScore === "number" ? minScore : (minScore.min ?? 0);
|
|
968
|
+
const maxThreshold = typeof minScore === "object" ? minScore.max : undefined;
|
|
742
969
|
const wordList = text.trim().split(/\s+/).filter(Boolean);
|
|
743
970
|
const words = wordList.length || 1;
|
|
744
971
|
const sentences = text.split(/[.!?]+/).filter((s) => s.trim().length > 0).length || 1;
|
|
745
972
|
const totalSyllables = wordList.reduce((sum, w) => sum + syllables(w), 0);
|
|
746
973
|
const score = 206.835 - 1.015 * (words / sentences) - 84.6 * (totalSyllables / words);
|
|
747
|
-
return score >=
|
|
974
|
+
return (score >= threshold && (maxThreshold === undefined || score <= maxThreshold));
|
|
748
975
|
}
|
|
749
976
|
function syllables(word) {
|
|
750
977
|
// Simple syllable counter
|
|
@@ -985,9 +1212,30 @@ function hasFactualAccuracy(text, facts) {
|
|
|
985
1212
|
const lower = text.toLowerCase();
|
|
986
1213
|
return facts.every((fact) => lower.includes(fact.toLowerCase()));
|
|
987
1214
|
}
|
|
988
|
-
|
|
1215
|
+
/**
|
|
1216
|
+
* Check if a measured duration is within the allowed limit.
|
|
1217
|
+
* @param durationMs - The actual elapsed time in milliseconds
|
|
1218
|
+
* @param maxMs - Maximum allowed duration in milliseconds
|
|
1219
|
+
*/
|
|
1220
|
+
function respondedWithinDuration(durationMs, maxMs) {
|
|
1221
|
+
return durationMs <= maxMs;
|
|
1222
|
+
}
|
|
1223
|
+
/**
|
|
1224
|
+
* Check if elapsed time since a start timestamp is within the allowed limit.
|
|
1225
|
+
* @param startTime - Timestamp from Date.now() captured before the operation
|
|
1226
|
+
* @param maxMs - Maximum allowed duration in milliseconds
|
|
1227
|
+
*/
|
|
1228
|
+
function respondedWithinTimeSince(startTime, maxMs) {
|
|
989
1229
|
return Date.now() - startTime <= maxMs;
|
|
990
1230
|
}
|
|
1231
|
+
/**
|
|
1232
|
+
* @deprecated Use {@link respondedWithinDuration} (takes measured duration)
|
|
1233
|
+
* or {@link respondedWithinTimeSince} (takes start timestamp) instead.
|
|
1234
|
+
* This function takes a start timestamp, not a duration — the name is misleading.
|
|
1235
|
+
*/
|
|
1236
|
+
function respondedWithinTime(startTime, maxMs) {
|
|
1237
|
+
return respondedWithinTimeSince(startTime, maxMs);
|
|
1238
|
+
}
|
|
991
1239
|
/**
|
|
992
1240
|
* Blocklist-based toxicity check (~80 terms across 9 categories).
|
|
993
1241
|
* **Fast and approximate** — catches explicit harmful language but has
|
|
@@ -1151,10 +1399,13 @@ function hasNoToxicity(text) {
|
|
|
1151
1399
|
"motherfucker",
|
|
1152
1400
|
"fucktard",
|
|
1153
1401
|
];
|
|
1154
|
-
return !toxicTerms.some((term) => lower
|
|
1402
|
+
return !toxicTerms.some((term) => textContainsTerm(lower, term));
|
|
1155
1403
|
}
|
|
1156
1404
|
function followsInstructions(text, instructions) {
|
|
1157
|
-
|
|
1405
|
+
const instructionList = Array.isArray(instructions)
|
|
1406
|
+
? instructions
|
|
1407
|
+
: [instructions];
|
|
1408
|
+
return instructionList.every((instruction) => {
|
|
1158
1409
|
if (instruction.startsWith("!")) {
|
|
1159
1410
|
return !text.includes(instruction.slice(1));
|
|
1160
1411
|
}
|
|
@@ -1164,6 +1415,7 @@ function followsInstructions(text, instructions) {
|
|
|
1164
1415
|
function containsAllRequiredFields(obj, requiredFields) {
|
|
1165
1416
|
return requiredFields.every((field) => obj && typeof obj === "object" && field in obj);
|
|
1166
1417
|
}
|
|
1418
|
+
const DEFAULT_ASSERTION_TIMEOUT_MS = 30000;
|
|
1167
1419
|
let _assertionLLMConfig = null;
|
|
1168
1420
|
function configureAssertions(config) {
|
|
1169
1421
|
_assertionLLMConfig = config;
|
|
@@ -1176,65 +1428,163 @@ async function callAssertionLLM(prompt, config) {
|
|
|
1176
1428
|
if (!cfg) {
|
|
1177
1429
|
throw new Error("No LLM config set. Call configureAssertions({ provider, apiKey }) first, or pass a config as the last argument.");
|
|
1178
1430
|
}
|
|
1179
|
-
|
|
1180
|
-
|
|
1181
|
-
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
1431
|
+
const timeoutMs = cfg.timeoutMs ?? DEFAULT_ASSERTION_TIMEOUT_MS;
|
|
1432
|
+
const ac = typeof AbortController !== "undefined" ? new AbortController() : null;
|
|
1433
|
+
const fetchWithSignal = (url, init) => fetch(url, ac ? { ...init, signal: ac.signal } : init);
|
|
1434
|
+
const llmCall = async () => {
|
|
1435
|
+
if (cfg.provider === "openai") {
|
|
1436
|
+
const baseUrl = cfg.baseUrl ?? "https://api.openai.com";
|
|
1437
|
+
const model = cfg.model ?? "gpt-4o-mini";
|
|
1438
|
+
const res = await fetchWithSignal(`${baseUrl}/v1/chat/completions`, {
|
|
1439
|
+
method: "POST",
|
|
1440
|
+
headers: {
|
|
1441
|
+
"Content-Type": "application/json",
|
|
1442
|
+
Authorization: `Bearer ${cfg.apiKey}`,
|
|
1443
|
+
},
|
|
1444
|
+
body: JSON.stringify({
|
|
1445
|
+
model,
|
|
1446
|
+
messages: [{ role: "user", content: prompt }],
|
|
1447
|
+
max_tokens: 60,
|
|
1448
|
+
temperature: 0,
|
|
1449
|
+
}),
|
|
1450
|
+
});
|
|
1451
|
+
if (!res.ok) {
|
|
1452
|
+
throw new Error(`OpenAI API error ${res.status}: ${await res.text()}`);
|
|
1453
|
+
}
|
|
1454
|
+
const data = (await res.json());
|
|
1455
|
+
return data.choices[0]?.message?.content?.trim().toLowerCase() ?? "";
|
|
1197
1456
|
}
|
|
1198
|
-
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
|
|
1204
|
-
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
|
|
1215
|
-
|
|
1216
|
-
|
|
1217
|
-
|
|
1218
|
-
|
|
1457
|
+
if (cfg.provider === "anthropic") {
|
|
1458
|
+
const baseUrl = cfg.baseUrl ?? "https://api.anthropic.com";
|
|
1459
|
+
const model = cfg.model ?? "claude-3-haiku-20240307";
|
|
1460
|
+
const res = await fetchWithSignal(`${baseUrl}/v1/messages`, {
|
|
1461
|
+
method: "POST",
|
|
1462
|
+
headers: {
|
|
1463
|
+
"Content-Type": "application/json",
|
|
1464
|
+
"x-api-key": cfg.apiKey,
|
|
1465
|
+
"anthropic-version": "2023-06-01",
|
|
1466
|
+
},
|
|
1467
|
+
body: JSON.stringify({
|
|
1468
|
+
model,
|
|
1469
|
+
max_tokens: 60,
|
|
1470
|
+
messages: [{ role: "user", content: prompt }],
|
|
1471
|
+
}),
|
|
1472
|
+
});
|
|
1473
|
+
if (!res.ok) {
|
|
1474
|
+
throw new Error(`Anthropic API error ${res.status}: ${await res.text()}`);
|
|
1475
|
+
}
|
|
1476
|
+
const data = (await res.json());
|
|
1477
|
+
return data.content[0]?.text?.trim().toLowerCase() ?? "";
|
|
1219
1478
|
}
|
|
1220
|
-
|
|
1221
|
-
|
|
1479
|
+
throw new Error(`Unsupported provider: "${cfg.provider}". Use "openai" or "anthropic".`);
|
|
1480
|
+
};
|
|
1481
|
+
let timer;
|
|
1482
|
+
const timeoutPromise = new Promise((_, reject) => {
|
|
1483
|
+
timer = setTimeout(() => {
|
|
1484
|
+
ac?.abort();
|
|
1485
|
+
reject(new Error(`Assertion LLM call timed out after ${timeoutMs}ms`));
|
|
1486
|
+
}, timeoutMs);
|
|
1487
|
+
});
|
|
1488
|
+
try {
|
|
1489
|
+
return await Promise.race([llmCall(), timeoutPromise]);
|
|
1222
1490
|
}
|
|
1223
|
-
|
|
1491
|
+
finally {
|
|
1492
|
+
clearTimeout(timer);
|
|
1493
|
+
}
|
|
1494
|
+
}
|
|
1495
|
+
let _hasSentimentAsyncDeprecationWarned = false;
|
|
1496
|
+
/** @internal Reset the one-time deprecation flag. For testing only. */
|
|
1497
|
+
function resetSentimentDeprecationWarning() {
|
|
1498
|
+
_hasSentimentAsyncDeprecationWarned = false;
|
|
1499
|
+
}
|
|
1500
|
+
function makeSentimentResult(sentiment, confidence, matches) {
|
|
1501
|
+
return {
|
|
1502
|
+
sentiment,
|
|
1503
|
+
confidence,
|
|
1504
|
+
matches,
|
|
1505
|
+
[Symbol.toPrimitive](hint) {
|
|
1506
|
+
if (!_hasSentimentAsyncDeprecationWarned) {
|
|
1507
|
+
_hasSentimentAsyncDeprecationWarned = true;
|
|
1508
|
+
console.warn("[evalgate] DEPRECATION: hasSentimentAsync() now returns { sentiment, confidence, matches }. " +
|
|
1509
|
+
"Using it as a boolean (e.g. `if (await hasSentimentAsync(...))`) is deprecated and will be " +
|
|
1510
|
+
"removed in the next major version. Migrate to: `const { matches } = await hasSentimentAsync(...)`");
|
|
1511
|
+
}
|
|
1512
|
+
if (hint === "number")
|
|
1513
|
+
return matches ? 1 : 0;
|
|
1514
|
+
if (hint === "string")
|
|
1515
|
+
return `SentimentAsyncResult(${sentiment}, matches=${matches})`;
|
|
1516
|
+
return matches;
|
|
1517
|
+
},
|
|
1518
|
+
};
|
|
1224
1519
|
}
|
|
1225
|
-
// ============================================================================
|
|
1226
|
-
// LLM-BACKED ASYNC ASSERTION FUNCTIONS
|
|
1227
|
-
// ============================================================================
|
|
1228
1520
|
/**
|
|
1229
1521
|
* LLM-backed sentiment check. **Slow and accurate** — uses an LLM to
|
|
1230
|
-
* classify sentiment with full context awareness.
|
|
1231
|
-
* {@link configureAssertions} or an inline `config` argument.
|
|
1522
|
+
* classify sentiment with full context awareness and return a confidence score.
|
|
1523
|
+
* Requires {@link configureAssertions} or an inline `config` argument.
|
|
1232
1524
|
* Falls back gracefully with a clear error if no API key is configured.
|
|
1525
|
+
*
|
|
1526
|
+
* Returns `{ sentiment, confidence, matches }` — the async layer now provides
|
|
1527
|
+
* the same rich return shape as {@link hasSentimentWithScore}, but powered by
|
|
1528
|
+
* an LLM instead of keyword counting. The `confidence` field is the LLM's
|
|
1529
|
+
* self-reported confidence (0–1), not a lexical heuristic.
|
|
1530
|
+
*
|
|
1531
|
+
* The returned object implements `Symbol.toPrimitive` so that legacy code
|
|
1532
|
+
* using `if (await hasSentimentAsync(...))` still works correctly (coerces
|
|
1533
|
+
* to `matches`), but a deprecation warning is emitted. Migrate to
|
|
1534
|
+
* destructuring: `const { matches } = await hasSentimentAsync(...)`.
|
|
1535
|
+
*
|
|
1536
|
+
* @example
|
|
1537
|
+
* ```ts
|
|
1538
|
+
* const { sentiment, confidence, matches } = await hasSentimentAsync(
|
|
1539
|
+
* "This product is revolutionary but overpriced",
|
|
1540
|
+
* "negative",
|
|
1541
|
+
* );
|
|
1542
|
+
* // sentiment: "negative", confidence: 0.7, matches: true
|
|
1543
|
+
* ```
|
|
1233
1544
|
*/
|
|
1234
1545
|
async function hasSentimentAsync(text, expected, config) {
|
|
1235
|
-
const prompt = `Classify the sentiment of the following text. Reply with
|
|
1236
|
-
const
|
|
1237
|
-
|
|
1546
|
+
const prompt = `Classify the sentiment of the following text as positive, negative, or neutral. Also rate your confidence from 0.0 to 1.0. Reply with ONLY a JSON object like {"sentiment":"positive","confidence":0.85} and nothing else.\n\nText: "${text}"`;
|
|
1547
|
+
const raw = await callAssertionLLM(prompt, config);
|
|
1548
|
+
// Parse structured response; fall back to keyword extraction if LLM doesn't return valid JSON
|
|
1549
|
+
let sentiment = "neutral";
|
|
1550
|
+
let confidence = 0.5;
|
|
1551
|
+
try {
|
|
1552
|
+
// Extract JSON from response (LLM may wrap in markdown code fences)
|
|
1553
|
+
const jsonMatch = raw.match(/\{[^}]+\}/);
|
|
1554
|
+
if (jsonMatch) {
|
|
1555
|
+
const parsed = JSON.parse(jsonMatch[0]);
|
|
1556
|
+
const s = String(parsed.sentiment ?? "").replace(/[^a-z]/g, "");
|
|
1557
|
+
if (s === "positive" || s === "negative" || s === "neutral") {
|
|
1558
|
+
sentiment = s;
|
|
1559
|
+
}
|
|
1560
|
+
const c = Number(parsed.confidence);
|
|
1561
|
+
if (!Number.isNaN(c) && c >= 0 && c <= 1) {
|
|
1562
|
+
confidence = c;
|
|
1563
|
+
}
|
|
1564
|
+
}
|
|
1565
|
+
else {
|
|
1566
|
+
// Fallback: old-style single-word response
|
|
1567
|
+
const cleaned = raw.replace(/[^a-z]/g, "");
|
|
1568
|
+
if (cleaned === "positive" ||
|
|
1569
|
+
cleaned === "negative" ||
|
|
1570
|
+
cleaned === "neutral") {
|
|
1571
|
+
sentiment = cleaned;
|
|
1572
|
+
confidence = 0.5; // no confidence info from single-word response
|
|
1573
|
+
}
|
|
1574
|
+
}
|
|
1575
|
+
}
|
|
1576
|
+
catch {
|
|
1577
|
+
// JSON parse failed — try plain text extraction
|
|
1578
|
+
const cleaned = raw.replace(/[^a-z]/g, "");
|
|
1579
|
+
if (cleaned.includes("positive"))
|
|
1580
|
+
sentiment = "positive";
|
|
1581
|
+
else if (cleaned.includes("negative"))
|
|
1582
|
+
sentiment = "negative";
|
|
1583
|
+
else
|
|
1584
|
+
sentiment = "neutral";
|
|
1585
|
+
confidence = 0.5;
|
|
1586
|
+
}
|
|
1587
|
+
return makeSentimentResult(sentiment, confidence, sentiment === expected);
|
|
1238
1588
|
}
|
|
1239
1589
|
/**
|
|
1240
1590
|
* LLM-backed toxicity check. **Slow and accurate** — context-aware, handles
|
|
@@ -1272,6 +1622,108 @@ async function hasNoHallucinationsAsync(text, groundTruth, config) {
|
|
|
1272
1622
|
const result = await callAssertionLLM(prompt, config);
|
|
1273
1623
|
return result.replace(/[^a-z]/g, "") === "yes";
|
|
1274
1624
|
}
|
|
1625
|
+
/**
|
|
1626
|
+
* Compute cosine similarity between two vectors.
|
|
1627
|
+
*/
|
|
1628
|
+
function cosineSimilarity(a, b) {
|
|
1629
|
+
if (a.length !== b.length || a.length === 0)
|
|
1630
|
+
return 0;
|
|
1631
|
+
let dot = 0;
|
|
1632
|
+
let normA = 0;
|
|
1633
|
+
let normB = 0;
|
|
1634
|
+
for (let i = 0; i < a.length; i++) {
|
|
1635
|
+
dot += a[i] * b[i];
|
|
1636
|
+
normA += a[i] * a[i];
|
|
1637
|
+
normB += b[i] * b[i];
|
|
1638
|
+
}
|
|
1639
|
+
const denom = Math.sqrt(normA) * Math.sqrt(normB);
|
|
1640
|
+
return denom === 0 ? 0 : dot / denom;
|
|
1641
|
+
}
|
|
1642
|
+
/**
 * Fetch embedding vectors for each input string from OpenAI's embeddings API.
 *
 * Only the "openai" provider is supported; any other provider throws
 * immediately (before any network call). Returns one embedding vector per
 * input, restored to input order regardless of the order the API responds in.
 *
 * @param texts - Strings to embed
 * @param config - LLM config; honors `baseUrl`, `embeddingModel`, `apiKey`
 * @returns Array of embedding vectors, index-aligned with `texts`
 * @throws If the provider is not "openai" or the API responds with a non-2xx status
 */
async function fetchEmbeddings(texts, config) {
    if (config.provider !== "openai") {
        throw new Error(`Embedding-based semantic containment requires provider "openai" (got "${config.provider}"). ` +
            `Set provider to "openai" or use toSemanticallyContainLLM() for LLM-prompt fallback.`);
    }
    const baseUrl = config.baseUrl ?? "https://api.openai.com";
    const response = await fetch(`${baseUrl}/v1/embeddings`, {
        method: "POST",
        headers: {
            "Content-Type": "application/json",
            Authorization: `Bearer ${config.apiKey}`,
        },
        body: JSON.stringify({
            model: config.embeddingModel ?? "text-embedding-3-small",
            input: texts,
        }),
    });
    if (!response.ok) {
        const detail = await response.text();
        throw new Error(`OpenAI Embeddings API error ${response.status}: ${detail}`);
    }
    const payload = (await response.json());
    // The API does not guarantee response order; re-sort by input index.
    const ordered = [...payload.data].sort((left, right) => left.index - right.index);
    return ordered.map((entry) => entry.embedding);
}
|
|
1668
|
+
/**
 * Embedding-based semantic containment check. Embeds both strings with
 * OpenAI's embedding API and compares their cosine similarity against a
 * threshold — no LLM prompt involved, so this is real vector-space
 * containment: "The city of lights" scores high against "Paris" because
 * their embeddings are close.
 *
 * Requires `provider: "openai"` in the config. For Anthropic or other
 * providers without an embedding API, use {@link toSemanticallyContainLLM}.
 *
 * @param text - The text to check
 * @param phrase - The semantic concept to look for
 * @param config - LLM config (must be OpenAI with embedding support);
 *   falls back to the module-level config set via configureAssertions()
 * @param threshold - Cosine similarity threshold (default: 0.4). Lower values
 *   are more permissive. Typical ranges: 0.3–0.5 for concept containment,
 *   0.6–0.8 for paraphrase detection, 0.9+ for near-duplicates.
 * @returns `{ contains, similarity }` — whether the threshold was met and the raw score
 * @throws If no config is provided and none has been set globally
 *
 * @example
 * ```ts
 * const { contains, similarity } = await toSemanticallyContain(
 *   "The city of lights is beautiful in spring",
 *   "Paris",
 *   { provider: "openai", apiKey: process.env.OPENAI_API_KEY },
 * );
 * // contains: true, similarity: ~0.52
 * ```
 */
async function toSemanticallyContain(text, phrase, config, threshold = 0.4) {
    const resolvedConfig = config ?? _assertionLLMConfig;
    if (!resolvedConfig) {
        throw new Error("No LLM config set. Call configureAssertions({ provider, apiKey }) first, or pass a config argument.");
    }
    // Embed both strings in a single API call, then score them.
    const [textVector, phraseVector] = await fetchEmbeddings([text, phrase], resolvedConfig);
    const similarity = cosineSimilarity(textVector, phraseVector);
    const contains = similarity >= threshold;
    return { contains, similarity };
}
|
|
1708
|
+
/**
 * LLM-prompt-based semantic containment check: asks an LLM to judge whether
 * the text conveys the given concept. Intended as a **fallback** for
 * providers without an embedding API (e.g., Anthropic).
 *
 * Note: this judges containment via a yes/no prompt rather than computing
 * vector similarity, so it behaves more like `followsInstructions`. For
 * real embedding-based containment, use {@link toSemanticallyContain}.
 *
 * @param text - The text to check
 * @param phrase - The semantic concept to look for
 * @param config - Optional LLM config
 * @returns true if the LLM judges the text contains the concept
 */
async function toSemanticallyContainLLM(text, phrase, config) {
    const prompt = `Does the following text semantically contain or convey the concept "${phrase}"? The text does not need to use those exact words — paraphrases, synonyms, and implied references count. Reply with only "yes" or "no".\n\nText: "${text}"`;
    const verdict = await callAssertionLLM(prompt, config);
    // Strip punctuation/whitespace so replies like "Yes." still match.
    const normalized = verdict.replace(/[^a-z]/g, "");
    return normalized === "yes";
}
|
|
1275
1727
|
function hasValidCodeSyntax(code, language) {
|
|
1276
1728
|
const lang = language.toLowerCase();
|
|
1277
1729
|
if (lang === "json") {
|