@evalgate/sdk 2.2.1 → 2.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +70 -1
- package/README.md +36 -7
- package/dist/assertions.d.ts +67 -5
- package/dist/assertions.js +733 -45
- package/dist/cache.d.ts +1 -1
- package/dist/cache.js +1 -1
- package/dist/cli/upgrade.js +5 -0
- package/dist/client.js +1 -1
- package/dist/errors.js +7 -0
- package/dist/export.d.ts +1 -1
- package/dist/export.js +3 -3
- package/dist/index.d.ts +4 -4
- package/dist/index.js +14 -3
- package/dist/integrations/anthropic.js +6 -6
- package/dist/integrations/openai.js +6 -6
- package/dist/pagination.d.ts +13 -2
- package/dist/pagination.js +28 -2
- package/dist/runtime/adapters/testsuite-to-dsl.js +1 -6
- package/dist/runtime/executor.d.ts +3 -2
- package/dist/runtime/executor.js +3 -2
- package/dist/runtime/registry.d.ts +4 -1
- package/dist/runtime/registry.js +4 -1
- package/dist/snapshot.d.ts +14 -2
- package/dist/snapshot.js +30 -4
- package/dist/types.d.ts +7 -2
- package/dist/types.js +7 -2
- package/dist/version.d.ts +2 -2
- package/dist/version.js +2 -2
- package/dist/workflows.js +6 -1
- package/package.json +2 -2
package/dist/assertions.js
CHANGED
|
@@ -39,6 +39,14 @@ exports.respondedWithinTime = respondedWithinTime;
|
|
|
39
39
|
exports.hasNoToxicity = hasNoToxicity;
|
|
40
40
|
exports.followsInstructions = followsInstructions;
|
|
41
41
|
exports.containsAllRequiredFields = containsAllRequiredFields;
|
|
42
|
+
exports.configureAssertions = configureAssertions;
|
|
43
|
+
exports.getAssertionConfig = getAssertionConfig;
|
|
44
|
+
exports.hasSentimentAsync = hasSentimentAsync;
|
|
45
|
+
exports.hasNoToxicityAsync = hasNoToxicityAsync;
|
|
46
|
+
exports.containsLanguageAsync = containsLanguageAsync;
|
|
47
|
+
exports.hasValidCodeSyntaxAsync = hasValidCodeSyntaxAsync;
|
|
48
|
+
exports.hasFactualAccuracyAsync = hasFactualAccuracyAsync;
|
|
49
|
+
exports.hasNoHallucinationsAsync = hasNoHallucinationsAsync;
|
|
42
50
|
exports.hasValidCodeSyntax = hasValidCodeSyntax;
|
|
43
51
|
class AssertionError extends Error {
|
|
44
52
|
constructor(message, expected, actual) {
|
|
@@ -226,9 +234,10 @@ class Expectation {
|
|
|
226
234
|
let parsedJson = null;
|
|
227
235
|
try {
|
|
228
236
|
parsedJson = JSON.parse(String(this.value));
|
|
229
|
-
const
|
|
230
|
-
|
|
231
|
-
|
|
237
|
+
const entries = Object.entries(schema);
|
|
238
|
+
passed = entries.every(([key, expectedValue]) => parsedJson !== null &&
|
|
239
|
+
key in parsedJson &&
|
|
240
|
+
JSON.stringify(parsedJson[key]) === JSON.stringify(expectedValue));
|
|
232
241
|
}
|
|
233
242
|
catch (_e) {
|
|
234
243
|
passed = false;
|
|
@@ -428,19 +437,30 @@ class Expectation {
|
|
|
428
437
|
};
|
|
429
438
|
}
|
|
430
439
|
/**
|
|
431
|
-
* Assert value contains code block
|
|
440
|
+
* Assert value contains code block or raw code
|
|
432
441
|
* @example expect(output).toContainCode()
|
|
442
|
+
* @example expect(output).toContainCode('typescript')
|
|
433
443
|
*/
|
|
434
|
-
toContainCode(message) {
|
|
444
|
+
toContainCode(language, message) {
|
|
435
445
|
const text = String(this.value);
|
|
436
|
-
const
|
|
446
|
+
const hasMarkdownBlock = language
|
|
447
|
+
? new RegExp(`\`\`\`${language}[\\s\\S]*?\`\`\``).test(text)
|
|
448
|
+
: /```[\s\S]*?```/.test(text);
|
|
449
|
+
const hasHtmlBlock = /<code>[\s\S]*?<\/code>/.test(text);
|
|
450
|
+
const hasRawCode = /\bfunction\s+\w+\s*\(/.test(text) ||
|
|
451
|
+
/\b(?:const|let|var)\s+\w+\s*=/.test(text) ||
|
|
452
|
+
/\bclass\s+\w+/.test(text) ||
|
|
453
|
+
/=>\s*[{(]/.test(text) ||
|
|
454
|
+
/\bimport\s+.*\bfrom\b/.test(text) ||
|
|
455
|
+
/\bexport\s+(?:default\s+)?(?:function|class|const)/.test(text) ||
|
|
456
|
+
/\breturn\s+.+;/.test(text);
|
|
457
|
+
const hasCodeBlock = hasMarkdownBlock || hasHtmlBlock || hasRawCode;
|
|
437
458
|
return {
|
|
438
459
|
name: "toContainCode",
|
|
439
460
|
passed: hasCodeBlock,
|
|
440
|
-
expected: "code block",
|
|
461
|
+
expected: language ? `code block (${language})` : "code block",
|
|
441
462
|
actual: text,
|
|
442
|
-
message: message ||
|
|
443
|
-
(hasCodeBlock ? "Contains code block" : "No code block found"),
|
|
463
|
+
message: message || (hasCodeBlock ? "Contains code" : "No code found"),
|
|
444
464
|
};
|
|
445
465
|
}
|
|
446
466
|
/**
|
|
@@ -591,13 +611,91 @@ function notContainsPII(text) {
|
|
|
591
611
|
/**
 * Inverse of `notContainsPII`: reports whether that check rejected the text.
 *
 * @param {string} text - Text to inspect.
 * @returns {boolean} `true` when `notContainsPII(text)` is falsy.
 */
function hasPII(text) {
    const isClean = notContainsPII(text);
    return !isClean;
}
|
|
614
|
+
/**
|
|
615
|
+
* Lexicon-based sentiment check. **Fast and approximate** — suitable for
|
|
616
|
+
* low-stakes filtering or CI smoke tests. For production safety gates use
|
|
617
|
+
* {@link hasSentimentAsync} with an LLM provider for context-aware accuracy.
|
|
618
|
+
*/
|
|
594
619
|
function hasSentiment(text, expected) {
|
|
595
|
-
|
|
596
|
-
const positiveWords = [
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
620
|
+
const lower = text.toLowerCase();
|
|
621
|
+
const positiveWords = [
|
|
622
|
+
"good",
|
|
623
|
+
"great",
|
|
624
|
+
"excellent",
|
|
625
|
+
"amazing",
|
|
626
|
+
"wonderful",
|
|
627
|
+
"fantastic",
|
|
628
|
+
"love",
|
|
629
|
+
"best",
|
|
630
|
+
"happy",
|
|
631
|
+
"helpful",
|
|
632
|
+
"awesome",
|
|
633
|
+
"superb",
|
|
634
|
+
"outstanding",
|
|
635
|
+
"brilliant",
|
|
636
|
+
"perfect",
|
|
637
|
+
"delightful",
|
|
638
|
+
"joyful",
|
|
639
|
+
"pleased",
|
|
640
|
+
"glad",
|
|
641
|
+
"terrific",
|
|
642
|
+
"fabulous",
|
|
643
|
+
"exceptional",
|
|
644
|
+
"impressive",
|
|
645
|
+
"magnificent",
|
|
646
|
+
"marvelous",
|
|
647
|
+
"splendid",
|
|
648
|
+
"positive",
|
|
649
|
+
"enjoy",
|
|
650
|
+
"enjoyed",
|
|
651
|
+
"like",
|
|
652
|
+
"liked",
|
|
653
|
+
"beautiful",
|
|
654
|
+
"innovative",
|
|
655
|
+
"inspiring",
|
|
656
|
+
"effective",
|
|
657
|
+
"useful",
|
|
658
|
+
"valuable",
|
|
659
|
+
];
|
|
660
|
+
const negativeWords = [
|
|
661
|
+
"bad",
|
|
662
|
+
"terrible",
|
|
663
|
+
"awful",
|
|
664
|
+
"horrible",
|
|
665
|
+
"worst",
|
|
666
|
+
"hate",
|
|
667
|
+
"poor",
|
|
668
|
+
"disappointing",
|
|
669
|
+
"sad",
|
|
670
|
+
"useless",
|
|
671
|
+
"dreadful",
|
|
672
|
+
"miserable",
|
|
673
|
+
"angry",
|
|
674
|
+
"frustrated",
|
|
675
|
+
"broken",
|
|
676
|
+
"failed",
|
|
677
|
+
"pathetic",
|
|
678
|
+
"stupid",
|
|
679
|
+
"disgusting",
|
|
680
|
+
"unacceptable",
|
|
681
|
+
"wrong",
|
|
682
|
+
"error",
|
|
683
|
+
"fail",
|
|
684
|
+
"problem",
|
|
685
|
+
"negative",
|
|
686
|
+
"dislike",
|
|
687
|
+
"annoying",
|
|
688
|
+
"irritating",
|
|
689
|
+
"offensive",
|
|
690
|
+
"regret",
|
|
691
|
+
"disappointment",
|
|
692
|
+
"inadequate",
|
|
693
|
+
"mediocre",
|
|
694
|
+
"flawed",
|
|
695
|
+
"unreliable",
|
|
696
|
+
];
|
|
697
|
+
const positiveCount = positiveWords.filter((w) => lower.includes(w)).length;
|
|
698
|
+
const negativeCount = negativeWords.filter((w) => lower.includes(w)).length;
|
|
601
699
|
if (expected === "positive")
|
|
602
700
|
return positiveCount > negativeCount;
|
|
603
701
|
if (expected === "negative")
|
|
@@ -627,22 +725,40 @@ function isValidURL(url) {
|
|
|
627
725
|
return false;
|
|
628
726
|
}
|
|
629
727
|
}
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
728
|
+
/**
 * Substring-based hallucination check: verifies that every ground-truth fact
 * appears (case-insensitively) in the text. **Fast and approximate**: it
 * catches missing facts but cannot detect paraphrased fabrications. Use
 * {@link hasNoHallucinationsAsync} for semantic accuracy.
 *
 * @param {string} text - Text to inspect.
 * @param {string[]} [groundTruth=[]] - Facts that must appear in the text.
 *   `null` and `undefined` are both treated as "no facts".
 * @returns {boolean} `true` when every fact is a substring of the text
 *   (vacuously `true` for an empty list).
 */
function hasNoHallucinations(text, groundTruth = []) {
    const haystack = String(text ?? "").toLowerCase();
    // The default parameter only covers `undefined`; guard `null` explicitly.
    const facts = groundTruth ?? [];
    return facts.every((truth) => haystack.includes(String(truth).toLowerCase()));
}
|
|
634
738
|
/**
 * Shallow key-presence check of an object against a schema.
 *
 * Three schema shapes are recognised, in this order:
 *  1. `{ required: ['name', 'age'] }` - every listed key must exist.
 *  2. `{ properties: { name: {}, age: {} } }` - every property key must exist.
 *  3. Plain template `{ name: '', value: '' }` - every schema key must exist.
 *
 * Only key presence is checked; value types are not validated.
 *
 * @param {unknown} value - Candidate object.
 * @param {object} schema - Schema in one of the shapes above.
 * @returns {boolean} `false` when `value` or `schema` is not a non-null
 *   object, otherwise whether all required keys are present.
 */
function matchesSchema(value, schema) {
    if (typeof value !== "object" || value === null)
        return false;
    // A missing schema cannot be matched; previously this threw a TypeError.
    if (typeof schema !== "object" || schema === null)
        return false;
    // Own keys always count. Inherited keys count only when they do not come
    // from Object.prototype, so "toString"/"constructor" cannot satisfy a
    // schema by accident while class getters keep working.
    const hasField = (key) => Object.prototype.hasOwnProperty.call(value, key) ||
        (key in value && !(key in Object.prototype));
    if (Array.isArray(schema.required)) {
        return schema.required.every((key) => hasField(key));
    }
    if (schema.properties && typeof schema.properties === "object") {
        return Object.keys(schema.properties).every((key) => hasField(key));
    }
    return Object.keys(schema).every((key) => hasField(key));
}
|
|
640
753
|
/**
 * Checks a Flesch Reading Ease style score against a threshold or range.
 *
 * score = 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)
 *
 * @param {string} text - Text to score.
 * @param {number|{min?: number, max?: number}} [minScore] - Either a minimum
 *   score, or a range object. A missing `min` defaults to 0; a missing `max`
 *   means no upper bound. `null`/`undefined` behave like `{}`.
 * @returns {boolean} `true` when the score is within the requested bounds.
 */
function hasReadabilityScore(text, minScore) {
    // `typeof null === "object"`, so exclude null explicitly before reading
    // range fields.
    const range = typeof minScore === "object" && minScore !== null ? minScore : {};
    const threshold = typeof minScore === "number" ? minScore : (range.min ?? 0);
    const maxThreshold = range.max;
    const wordList = text.trim().split(/\s+/).filter(Boolean);
    // Fall back to 1 so empty input does not divide by zero.
    const words = wordList.length || 1;
    const sentences = text.split(/[.!?]+/).filter((s) => s.trim().length > 0).length || 1;
    const totalSyllables = wordList.reduce((sum, w) => sum + syllables(w), 0);
    const score = 206.835 - 1.015 * (words / sentences) - 84.6 * (totalSyllables / words);
    if (score < threshold)
        return false;
    return maxThreshold === undefined || score <= maxThreshold;
}
|
|
647
763
|
function syllables(word) {
|
|
648
764
|
// Simple syllable counter
|
|
@@ -654,31 +770,408 @@ function syllables(word) {
|
|
|
654
770
|
.trim()
|
|
655
771
|
.split(/\s+/).length;
|
|
656
772
|
}
|
|
773
|
+
/**
 * Keyword-presence language detector supporting 12 languages
 * (en, es, fr, de, it, pt, nl, ru, zh, ja, ko, ar).
 * **Fast and approximate** - it reports `true` as soon as one common
 * function word of the language is found, so it may struggle with short
 * texts or closely related languages.
 * Use {@link containsLanguageAsync} for reliable detection of any language.
 *
 * For space-delimited scripts the keywords are matched as whole words, so
 * "y" or "el" no longer match inside "you" or "hello". Chinese, Japanese and
 * Korean keywords are matched as substrings because those scripts do not
 * separate particles with spaces.
 *
 * @param {string} text - Text to inspect.
 * @param {string} language - Language code such as "en", "es-MX" or "pt_BR"
 *   (case-insensitive; a region suffix falls back to the base code).
 * @returns {boolean} `true` when a keyword for the language is found;
 *   `false` for unsupported language codes.
 */
function containsLanguage(text, language) {
    const languageKeywords = {
        en: [
            "the", "and", "you", "that", "was", "for", "are", "with",
            "have", "this", "from", "they", "will", "would", "been", "their",
        ],
        es: [
            "el", "la", "los", "las", "de", "que", "y", "en",
            "es", "por", "para", "con", "una", "como", "pero", "también",
        ],
        fr: [
            "le", "la", "les", "de", "et", "à", "un", "une",
            "du", "des", "est", "que", "dans", "pour", "sur", "avec",
        ],
        de: [
            "der", "die", "das", "und", "ist", "ich", "nicht", "mit",
            "sie", "ein", "eine", "von", "zu", "auf", "auch", "dem",
        ],
        it: [
            "il", "di", "che", "non", "si", "per", "del", "un",
            "una", "con", "sono", "nel", "questo", "come",
        ],
        pt: [
            "de", "que", "do", "da", "em", "um", "para", "com",
            "uma", "os", "as", "não", "mas", "por", "mais",
        ],
        nl: [
            "de", "het", "een", "van", "en", "in", "is", "dat",
            "op", "te", "zijn", "niet", "ook", "met", "voor",
        ],
        ru: [
            "и", "в", "не", "на", "я", "что", "с", "по",
            "это", "как", "но", "он", "она", "мы", "они",
        ],
        zh: [
            "的", "了", "是", "在", "我", "有", "和", "就",
            "不", "都", "也", "很", "会", "这", "他",
        ],
        ja: [
            "は", "が", "の", "に", "を", "で", "と", "た",
            "し", "て", "も", "な", "か", "から", "まで",
        ],
        ko: [
            "이", "은", "는", "을", "를", "의", "에", "가",
            "로", "도", "와", "과", "하", "있", "합",
        ],
        ar: [
            "في", "من", "على", "إلى", "هذا", "مع", "أن", "هو",
            "كان", "كل", "التي", "الذي", "عن", "لا",
        ],
    };
    // Scripts written without spaces between words and particles.
    const unsegmented = new Set(["zh", "ja", "ko"]);
    const requested = String(language ?? "").toLowerCase();
    // Own-property lookup so codes like "constructor" cannot resolve to
    // members inherited from Object.prototype.
    const code = [requested, requested.split(/[-_]/)[0]].find((candidate) => Object.prototype.hasOwnProperty.call(languageKeywords, candidate));
    if (code === undefined)
        return false;
    const keywords = languageKeywords[code];
    // Normalise once: NFC keeps accented letters comparable to the keywords.
    const lower = String(text ?? "").normalize("NFC").toLowerCase();
    if (unsegmented.has(code)) {
        return keywords.some((keyword) => lower.includes(keyword));
    }
    const words = new Set(lower.match(/[\p{L}\p{M}]+/gu) ?? []);
    return keywords.some((keyword) => words.has(keyword));
}
|
|
993
|
+
/**
 * Substring-based factual accuracy check. **Fast and approximate** - verifies
 * that each fact string appears (case-insensitively) in the text but cannot
 * reason about meaning or paraphrasing. Use {@link hasFactualAccuracyAsync}
 * for semantic accuracy.
 *
 * @param {string} text - Text to inspect.
 * @param {string[]} [facts=[]] - Facts that must appear in the text.
 *   `null` and `undefined` are both treated as "no facts".
 * @returns {boolean} `true` when every fact is a substring of the text
 *   (vacuously `true` for an empty list).
 */
function hasFactualAccuracy(text, facts = []) {
    const haystack = String(text ?? "").toLowerCase();
    // The default parameter only covers `undefined`; guard `null` explicitly.
    const required = facts ?? [];
    return required.every((fact) => haystack.includes(String(fact).toLowerCase()));
}
|
|
672
1002
|
/**
 * Reports whether no more than `maxMs` milliseconds have elapsed since
 * `startTime`, measured against `Date.now()`.
 *
 * @param {number} startTime - Start timestamp in epoch milliseconds.
 * @param {number} maxMs - Maximum allowed elapsed time in milliseconds.
 * @returns {boolean} `true` when the elapsed time is at most `maxMs`.
 */
function respondedWithinTime(startTime, maxMs) {
    const elapsedMs = Date.now() - startTime;
    return elapsedMs <= maxMs;
}
|
|
1005
|
+
/**
 * Blocklist-based toxicity check (140 terms and phrases in 12 groups).
 * **Fast and approximate** - catches explicit harmful language but has
 * inherent gaps and context-blind false positives. Do NOT rely on this
 * alone for production content safety gates; use {@link hasNoToxicityAsync}
 * with an LLM for context-aware moderation.
 *
 * Matching rules:
 *  - case-insensitive;
 *  - typographic apostrophes are folded to "'" and whitespace runs are
 *    collapsed, so "You’re worthless" and phrases split across lines match;
 *  - a term must start at a word boundary, so "oxymoron", "Scunthorpe",
 *    "skyscraper" or "defend yourself" are not flagged. Suffixes are still
 *    allowed ("idiots", "kill your ...").
 *
 * @param {string} text - Text to inspect.
 * @returns {boolean} `true` when no blocklisted term is found.
 */
function hasNoToxicity(text) {
    const toxicTerms = [
        // Insults and derogatory attacks
        "idiot", "moron", "imbecile", "dumbass", "dumb ass", "jackass",
        "dimwit", "nitwit", "halfwit", "numbskull", "bonehead", "blockhead",
        "scumbag", "dirtbag", "slimeball", "lowlife",
        "piece of shit", "piece of crap", "piece of garbage", "piece of trash",
        "waste of space", "waste of oxygen",
        // Degradation directed at people
        "you're worthless", "you are worthless",
        "you're useless", "you are useless",
        "you're pathetic", "you are pathetic",
        "you're disgusting", "you are disgusting",
        "you're nothing", "you are nothing",
        "you're scum", "you're garbage", "you're trash", "you're a loser",
        "complete loser", "total loser", "absolute loser",
        // Violence and threats
        "kill you", "kill yourself", "i'll kill", "i will kill",
        "gonna kill", "going to kill",
        "murder you", "i'll murder", "i will murder",
        "want you dead", "want them dead", "you should die",
        "hope you die", "hope you rot", "hope you suffer",
        "shoot you", "stab you", "beat you up", "hurt you", "harm you",
        "i'll hurt", "i will hurt",
        "i'll destroy you", "i will destroy you", "i'll end you",
        "you're dead", "you're going to die", "i know where you live",
        "i'll find you", "i will find you",
        // Self-harm directed at others
        "go kill yourself", "kys", "end yourself", "off yourself", "just die",
        "better off dead", "world is better without you",
        "no one would miss you", "nobody would miss you",
        "do us a favor and die",
        // Dehumanization
        "not even human", "barely human", "subhuman", "less than human",
        "you're vermin", "you're a parasite",
        "deserve to die", "deserve to suffer", "deserve pain", "deserve misery",
        // Hate and rejection
        "i hate you", "everyone hates you", "they all hate you",
        "no one likes you", "nobody likes you", "nobody wants you",
        "nobody cares about you", "no one cares about you",
        "go to hell", "burn in hell", "rot in hell", "drop dead", "go die",
        "i hope you suffer",
        // Harassment and threats
        "i'll ruin you", "ruin your life", "make your life hell",
        "i'll expose you", "i'll dox you", "i will dox you", "doxxing",
        "i'll come for you", "you'll pay for this",
        // Profanity as direct attacks
        "fuck you", "fuck off", "go fuck yourself", "screw you",
        "shut the fuck up", "to hell with you",
        // Bullying
        "you're a joke", "you're a laughingstock",
        "everyone is laughing at you", "pathetic loser",
        // Appearance attacks
        "fat pig", "fat slob", "hideous freak",
        // Mental health weaponized
        "you're crazy", "you're insane", "you're a psycho",
        "you're delusional", "you're mental", "you belong in an asylum",
        "you're a lunatic",
        // Explicit profanity used as insults
        "bastard", "bitch", "cunt", "asshole", "dipshit", "douchebag",
        "motherfucker", "fucktard",
    ];
    const normalized = String(text ?? "")
        .toLowerCase()
        .replace(/[\u2018\u2019\u02BC]/g, "'")
        .replace(/\s+/g, " ");
    const wordChar = /[\p{L}\p{N}]/u;
    // A hit only counts when it is not glued to a preceding letter or digit.
    const startsAtWordBoundary = (index) => index === 0 || !wordChar.test(normalized[index - 1]);
    const containsTerm = (term) => {
        let index = normalized.indexOf(term);
        while (index !== -1) {
            if (startsAtWordBoundary(index))
                return true;
            index = normalized.indexOf(term, index + 1);
        }
        return false;
    };
    return !toxicTerms.some(containsTerm);
}
|
|
680
1170
|
function followsInstructions(text, instructions) {
|
|
681
|
-
|
|
1171
|
+
const instructionList = Array.isArray(instructions)
|
|
1172
|
+
? instructions
|
|
1173
|
+
: [instructions];
|
|
1174
|
+
return instructionList.every((instruction) => {
|
|
682
1175
|
if (instruction.startsWith("!")) {
|
|
683
1176
|
return !text.includes(instruction.slice(1));
|
|
684
1177
|
}
|
|
@@ -688,16 +1181,211 @@ function followsInstructions(text, instructions) {
|
|
|
688
1181
|
/**
 * Checks that every listed field name is present (via `in`) on `obj`.
 *
 * @param {unknown} obj - Candidate value; non-objects never contain fields.
 * @param {string[]} requiredFields - Field names that must be present.
 * @returns {boolean} `true` when all fields are present. An empty list is
 *   always `true`, whatever `obj` is.
 */
function containsAllRequiredFields(obj, requiredFields) {
    const isObjectLike = Boolean(obj) && typeof obj === "object";
    return requiredFields.every((field) => isObjectLike && field in obj);
}
|
|
1184
|
+
/** Module-wide default LLM config for the async assertions; `null` = unset. */
let _assertionLLMConfig = null;
/**
 * Sets the default LLM config used by the `*Async` assertions when no inline
 * config is passed.
 *
 * @param {object|null} [config] - Provider settings (the code reads
 *   `provider`, `apiKey`, and optionally `model` / `baseUrl`). Passing
 *   `null` or nothing clears the default.
 */
function configureAssertions(config) {
    // Normalise `undefined` to `null` so "unset" has a single representation.
    _assertionLLMConfig = config ?? null;
}
/**
 * Returns the config last passed to {@link configureAssertions}.
 *
 * @returns {object|null} The stored config, or `null` when none is set.
 */
function getAssertionConfig() {
    return _assertionLLMConfig;
}
|
|
1191
|
+
/**
 * Sends a single-turn prompt to the configured LLM provider and returns the
 * reply trimmed and lower-cased.
 *
 * @param {string} prompt - User message to send.
 * @param {object} [config] - Inline config; falls back to the module-level
 *   config set via `configureAssertions`. Fields read: `provider`
 *   ("openai" | "anthropic"), `apiKey`, optional `model`, optional `baseUrl`.
 * @returns {Promise<string>} Normalised reply text, or `""` when the
 *   response carries no text in the expected place.
 * @throws {Error} When no config is available, the provider is unsupported,
 *   or the HTTP response is not OK.
 */
async function callAssertionLLM(prompt, config) {
    const cfg = config ?? _assertionLLMConfig;
    if (!cfg) {
        throw new Error("No LLM config set. Call configureAssertions({ provider, apiKey }) first, or pass a config as the last argument.");
    }
    if (cfg.provider !== "openai" && cfg.provider !== "anthropic") {
        throw new Error(`Unsupported provider: "${cfg.provider}". Use "openai" or "anthropic".`);
    }
    const isOpenAI = cfg.provider === "openai";
    const defaultBaseUrl = isOpenAI ? "https://api.openai.com" : "https://api.anthropic.com";
    // Drop trailing slashes so "https://host/" does not produce "//v1/...".
    const baseUrl = String(cfg.baseUrl ?? defaultBaseUrl).replace(/\/+$/, "");
    const messages = [{ role: "user", content: prompt }];
    const request = isOpenAI
        ? {
            url: `${baseUrl}/v1/chat/completions`,
            headers: {
                "Content-Type": "application/json",
                Authorization: `Bearer ${cfg.apiKey}`,
            },
            body: {
                model: cfg.model ?? "gpt-4o-mini",
                messages,
                max_tokens: 10,
                temperature: 0,
            },
        }
        : {
            url: `${baseUrl}/v1/messages`,
            headers: {
                "Content-Type": "application/json",
                "x-api-key": cfg.apiKey,
                "anthropic-version": "2023-06-01",
            },
            body: {
                model: cfg.model ?? "claude-3-haiku-20240307",
                max_tokens: 10,
                messages,
            },
        };
    const res = await fetch(request.url, {
        method: "POST",
        headers: request.headers,
        body: JSON.stringify(request.body),
    });
    if (!res.ok) {
        const label = isOpenAI ? "OpenAI" : "Anthropic";
        throw new Error(`${label} API error ${res.status}: ${await res.text()}`);
    }
    const data = await res.json();
    // Optional-chain the whole path: a payload without `choices` / `content`
    // yields "" instead of a TypeError.
    const reply = isOpenAI
        ? data?.choices?.[0]?.message?.content
        : data?.content?.[0]?.text;
    return typeof reply === "string" ? reply.trim().toLowerCase() : "";
}
|
|
1242
|
+
// ============================================================================
|
|
1243
|
+
// LLM-BACKED ASYNC ASSERTION FUNCTIONS
|
|
1244
|
+
// ============================================================================
|
|
1245
|
+
/**
 * LLM-backed sentiment check. **Slow and accurate** - asks an LLM to classify
 * the text as positive, negative or neutral and compares the answer with
 * `expected`. Requires {@link configureAssertions} or an inline `config`;
 * the underlying call rejects with an error when neither is available.
 *
 * @param {string} text - Text to classify.
 * @param {"positive"|"negative"|"neutral"} expected - Expected label.
 * @param {object} [config] - Optional inline LLM config.
 * @returns {Promise<boolean>} `true` when the model's label equals `expected`.
 */
async function hasSentimentAsync(text, expected, config) {
    const prompt = `Classify the sentiment of the following text. Reply with exactly one word — positive, negative, or neutral — and nothing else.\n\nText: "${text}"`;
    const result = await callAssertionLLM(prompt, config);
    // Read only the first word so a reply such as "positive - upbeat tone"
    // still counts; concatenating every letter would not equal the label.
    const label = /[a-z]+/.exec(result)?.[0] ?? "";
    return label === expected;
}
|
|
1256
|
+
/**
 * LLM-backed toxicity check. **Slow and accurate** - context-aware, intended
 * to handle sarcasm, implicit threats and culturally specific harmful content
 * that blocklists miss. Recommended for production content safety gates.
 *
 * @param {string} text - Text to inspect.
 * @param {object} [config] - Optional inline LLM config.
 * @returns {Promise<boolean>} `true` only when the model answers "no"
 *   (not toxic); any other or empty answer yields `false`.
 */
async function hasNoToxicityAsync(text, config) {
    const prompt = `Does the following text contain toxic, harmful, hateful, threatening, or abusive content? Reply with only "yes" or "no".\n\nText: "${text}"`;
    const result = await callAssertionLLM(prompt, config);
    // Read only the first word so "no, the text is fine" is not misread as
    // a failure.
    const verdict = /[a-z]+/.exec(result)?.[0] ?? "";
    return verdict === "no";
}
|
|
1266
|
+
/**
 * LLM-backed language check: asks the model whether the text is primarily
 * written in `language`.
 *
 * @param {string} text - Text to inspect.
 * @param {string} language - Language name or code, inserted into the prompt
 *   verbatim.
 * @param {object} [config] - Optional inline LLM config.
 * @returns {Promise<boolean>} `true` only when the model answers "yes".
 */
async function containsLanguageAsync(text, language, config) {
    const prompt = `Is the following text primarily written in ${language}? Reply with only "yes" or "no".\n\nText: "${text}"`;
    const result = await callAssertionLLM(prompt, config);
    // Read only the first word so "yes, it is" still counts as "yes".
    const verdict = /[a-z]+/.exec(result)?.[0] ?? "";
    return verdict === "yes";
}
|
|
1271
|
+
/**
 * LLM-backed syntax check: asks the model whether the given code is free of
 * syntax errors in `language`.
 *
 * @param {string} code - Source code to check.
 * @param {string} language - Language name, inserted into the prompt and the
 *   code-fence tag verbatim.
 * @param {object} [config] - Optional inline LLM config.
 * @returns {Promise<boolean>} `true` only when the model answers "yes".
 */
async function hasValidCodeSyntaxAsync(code, language, config) {
    const prompt = `Is the following ${language} code free of syntax errors? Reply with only "yes" or "no".\n\nCode:\n\`\`\`${language}\n${code}\n\`\`\``;
    const result = await callAssertionLLM(prompt, config);
    // Read only the first word so "yes, it compiles" still counts as "yes".
    const verdict = /[a-z]+/.exec(result)?.[0] ?? "";
    return verdict === "yes";
}
|
|
1276
|
+
/**
 * LLM-backed factual accuracy check: asks the model whether the text conveys
 * all listed facts without contradicting or omitting any.
 *
 * @param {string} text - Text to inspect.
 * @param {string[]} facts - Facts the text must convey; sent to the model as
 *   a numbered list.
 * @param {object} [config] - Optional inline LLM config.
 * @returns {Promise<boolean>} `true` only when the model answers "yes".
 */
async function hasFactualAccuracyAsync(text, facts, config) {
    const factList = facts.map((f, i) => `${i + 1}. ${f}`).join("\n");
    const prompt = `Does the following text accurately convey all of these facts without contradicting or omitting any?\n\nFacts:\n${factList}\n\nText: "${text}"\n\nReply with only "yes" or "no".`;
    const result = await callAssertionLLM(prompt, config);
    // Read only the first word so "yes, all facts are present" still counts.
    const verdict = /[a-z]+/.exec(result)?.[0] ?? "";
    return verdict === "yes";
}
|
|
1282
|
+
/**
 * LLM-backed hallucination check. **Slow and accurate** - asks the model
 * whether the text stays consistent with the ground-truth facts, aiming to
 * catch fabricated claims even when they are paraphrased or contradict the
 * facts indirectly.
 *
 * @param {string} text - Text to inspect.
 * @param {string[]} groundTruth - Ground-truth facts; sent to the model as a
 *   numbered list.
 * @param {object} [config] - Optional inline LLM config.
 * @returns {Promise<boolean>} `true` only when the model answers "yes".
 */
async function hasNoHallucinationsAsync(text, groundTruth, config) {
    const truthList = groundTruth.map((f, i) => `${i + 1}. ${f}`).join("\n");
    const prompt = `Does the following text stay consistent with the ground truth facts below, without introducing fabricated or hallucinated claims?\n\nGround truth:\n${truthList}\n\nText: "${text}"\n\nReply with only "yes" or "no".`;
    const result = await callAssertionLLM(prompt, config);
    // Read only the first word so "yes, it is consistent" still counts.
    const verdict = /[a-z]+/.exec(result)?.[0] ?? "";
    return verdict === "yes";
}
|
|
691
1292
|
/**
 * Lightweight structural syntax check.
 *
 * - `json`: the code must parse with `JSON.parse`.
 * - Any other language: parentheses, brackets and braces must be balanced
 *   and correctly nested, ignoring delimiters inside string literals and
 *   comments.
 *
 * Scanner rules:
 * - `'...'` and `"..."` are strings in every language; backtick template
 *   literals are recognised for JavaScript/TypeScript only and are treated
 *   as opaque (braces inside `${...}` are skipped on purpose).
 * - A backslash inside a string escapes the next character, so `"\\"` closes
 *   correctly.
 * - `//` and `/* ... *\/` comments apply to all languages except Python and
 *   Ruby, where `//` is an operator (floor division) rather than a comment.
 *   Those two use `#` line comments instead.
 *
 * This is a heuristic, not a parser. Use `hasValidCodeSyntaxAsync` for
 * deeper analysis.
 *
 * @param {string} code - Source code to check.
 * @param {string} language - Language name or short alias (case-insensitive).
 * @returns {boolean} `true` when the code passes the check for the language.
 */
function hasValidCodeSyntax(code, language) {
    const lang = String(language ?? "").toLowerCase();
    if (lang === "json") {
        try {
            JSON.parse(code);
            return true;
        }
        catch {
            return false;
        }
    }
    const source = String(code ?? "");
    const openerFor = new Map([
        [")", "("],
        ["]", "["],
        ["}", "{"],
    ]);
    const openers = new Set(openerFor.values());
    const isPythonLike = lang === "python" || lang === "py" || lang === "ruby" || lang === "rb";
    const isJSLike = lang === "javascript" ||
        lang === "js" ||
        lang === "typescript" ||
        lang === "ts";
    const stack = [];
    // Delimiter of the string literal currently being skipped, or null.
    let quote = null;
    let inLineComment = false;
    let inBlockComment = false;
    for (let i = 0; i < source.length; i++) {
        const ch = source[i];
        const next = source[i + 1] ?? "";
        if (inLineComment) {
            if (ch === "\n")
                inLineComment = false;
            continue;
        }
        if (inBlockComment) {
            if (ch === "*" && next === "/") {
                inBlockComment = false;
                i++;
            }
            continue;
        }
        if (quote !== null) {
            if (ch === "\\") {
                // Skip the escaped character, whatever it is. This also
                // handles an escaped backslash directly before the closing
                // quote.
                i++;
            }
            else if (ch === quote) {
                quote = null;
            }
            continue;
        }
        if (!isPythonLike && ch === "/" && next === "/") {
            inLineComment = true;
            i++;
            continue;
        }
        if (!isPythonLike && ch === "/" && next === "*") {
            inBlockComment = true;
            i++;
            continue;
        }
        if (isPythonLike && ch === "#") {
            inLineComment = true;
            continue;
        }
        if (ch === "'" || ch === '"' || (isJSLike && ch === "`")) {
            quote = ch;
            continue;
        }
        if (openers.has(ch)) {
            stack.push(ch);
            continue;
        }
        const expectedOpener = openerFor.get(ch);
        // pop() on an empty stack yields undefined, which never matches.
        if (expectedOpener !== undefined && stack.pop() !== expectedOpener) {
            return false;
        }
    }
    return stack.length === 0;
}
|