@evalgate/sdk 2.2.3 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. package/CHANGELOG.md +31 -0
  2. package/README.md +39 -2
  3. package/dist/assertions.d.ts +186 -6
  4. package/dist/assertions.js +515 -61
  5. package/dist/batch.js +4 -4
  6. package/dist/cache.d.ts +4 -0
  7. package/dist/cache.js +4 -0
  8. package/dist/cli/baseline.d.ts +14 -0
  9. package/dist/cli/baseline.js +43 -3
  10. package/dist/cli/check.d.ts +5 -2
  11. package/dist/cli/check.js +20 -12
  12. package/dist/cli/compare.d.ts +80 -0
  13. package/dist/cli/compare.js +266 -0
  14. package/dist/cli/index.js +244 -101
  15. package/dist/cli/regression-gate.js +23 -0
  16. package/dist/cli/run.js +22 -0
  17. package/dist/cli/start.d.ts +26 -0
  18. package/dist/cli/start.js +130 -0
  19. package/dist/cli/templates.d.ts +24 -0
  20. package/dist/cli/templates.js +314 -0
  21. package/dist/cli/traces.d.ts +109 -0
  22. package/dist/cli/traces.js +152 -0
  23. package/dist/cli/validate.d.ts +37 -0
  24. package/dist/cli/validate.js +252 -0
  25. package/dist/cli/watch.d.ts +19 -0
  26. package/dist/cli/watch.js +175 -0
  27. package/dist/client.js +6 -13
  28. package/dist/constants.d.ts +2 -0
  29. package/dist/constants.js +5 -0
  30. package/dist/index.d.ts +8 -6
  31. package/dist/index.js +26 -6
  32. package/dist/integrations/openai.js +83 -60
  33. package/dist/logger.d.ts +3 -1
  34. package/dist/logger.js +2 -1
  35. package/dist/otel.d.ts +130 -0
  36. package/dist/otel.js +309 -0
  37. package/dist/runtime/eval.d.ts +14 -4
  38. package/dist/runtime/eval.js +127 -2
  39. package/dist/runtime/registry.d.ts +4 -2
  40. package/dist/runtime/registry.js +11 -3
  41. package/dist/runtime/run-report.d.ts +1 -1
  42. package/dist/runtime/run-report.js +7 -4
  43. package/dist/runtime/types.d.ts +38 -0
  44. package/dist/testing.d.ts +8 -0
  45. package/dist/testing.js +45 -10
  46. package/dist/version.d.ts +2 -2
  47. package/dist/version.js +2 -2
  48. package/dist/workflows.d.ts +2 -0
  49. package/dist/workflows.js +184 -102
  50. package/package.json +124 -117
@@ -26,7 +26,10 @@ exports.containsJSON = containsJSON;
26
26
  exports.notContainsPII = notContainsPII;
27
27
  exports.hasPII = hasPII;
28
28
  exports.hasSentiment = hasSentiment;
29
+ exports.hasSentimentWithScore = hasSentimentWithScore;
29
30
  exports.similarTo = similarTo;
31
+ exports.hasConsistency = hasConsistency;
32
+ exports.hasConsistencyAsync = hasConsistencyAsync;
30
33
  exports.withinRange = withinRange;
31
34
  exports.isValidEmail = isValidEmail;
32
35
  exports.isValidURL = isValidURL;
@@ -35,19 +38,37 @@ exports.matchesSchema = matchesSchema;
35
38
  exports.hasReadabilityScore = hasReadabilityScore;
36
39
  exports.containsLanguage = containsLanguage;
37
40
  exports.hasFactualAccuracy = hasFactualAccuracy;
41
+ exports.respondedWithinDuration = respondedWithinDuration;
42
+ exports.respondedWithinTimeSince = respondedWithinTimeSince;
38
43
  exports.respondedWithinTime = respondedWithinTime;
39
44
  exports.hasNoToxicity = hasNoToxicity;
40
45
  exports.followsInstructions = followsInstructions;
41
46
  exports.containsAllRequiredFields = containsAllRequiredFields;
42
47
  exports.configureAssertions = configureAssertions;
43
48
  exports.getAssertionConfig = getAssertionConfig;
49
+ exports.resetSentimentDeprecationWarning = resetSentimentDeprecationWarning;
44
50
  exports.hasSentimentAsync = hasSentimentAsync;
45
51
  exports.hasNoToxicityAsync = hasNoToxicityAsync;
46
52
  exports.containsLanguageAsync = containsLanguageAsync;
47
53
  exports.hasValidCodeSyntaxAsync = hasValidCodeSyntaxAsync;
48
54
  exports.hasFactualAccuracyAsync = hasFactualAccuracyAsync;
49
55
  exports.hasNoHallucinationsAsync = hasNoHallucinationsAsync;
56
+ exports.toSemanticallyContain = toSemanticallyContain;
57
+ exports.toSemanticallyContainLLM = toSemanticallyContainLLM;
50
58
  exports.hasValidCodeSyntax = hasValidCodeSyntax;
59
+ /**
60
+ * Test if a term appears in text as a whole word (word-boundary match)
61
+ * or as a phrase (for multi-word terms). Single words use \b regex to
62
+ * avoid false positives like "hell" matching "hello".
63
+ */
64
+ function textContainsTerm(lowerText, term) {
65
+ if (term.includes(" ")) {
66
+ // Multi-word phrases: substring match is correct
67
+ return lowerText.includes(term);
68
+ }
69
+ // Single words: word-boundary match
70
+ return new RegExp(`\\b${term.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}\\b`, "i").test(lowerText);
71
+ }
51
72
  class AssertionError extends Error {
52
73
  constructor(message, expected, actual) {
53
74
  super(message);
@@ -464,27 +485,37 @@ class Expectation {
464
485
  };
465
486
  }
466
487
  /**
467
- * Assert value is professional tone (no profanity)
468
- * @example expect(output).toBeProfessional()
488
+ * Blocklist check for 7 common profane words. Does NOT analyze tone,
489
+ * formality, or professional communication quality. For actual tone
490
+ * analysis, use an LLM-backed assertion.
491
+ * @see hasSentimentAsync for LLM-based tone checking
492
+ * @example expect(output).toHaveNoProfanity()
469
493
  */
470
- toBeProfessional(message) {
494
+ toHaveNoProfanity(message) {
471
495
  const text = String(this.value).toLowerCase();
472
496
  const profanity = ["damn", "hell", "shit", "fuck", "ass", "bitch", "crap"];
473
- const foundProfanity = profanity.filter((word) => text.includes(word));
497
+ const foundProfanity = profanity.filter((word) => textContainsTerm(text, word));
474
498
  const passed = foundProfanity.length === 0;
475
499
  return {
476
- name: "toBeProfessional",
500
+ name: "toHaveNoProfanity",
477
501
  passed,
478
- expected: "professional tone",
502
+ expected: "no profanity",
479
503
  actual: foundProfanity.length > 0
480
504
  ? `Found: ${foundProfanity.join(", ")}`
481
- : "professional",
505
+ : "clean",
482
506
  message: message ||
483
507
  (passed
484
- ? "Professional tone"
485
- : `Unprofessional language: ${foundProfanity.join(", ")}`),
508
+ ? "No profanity found"
509
+ : `Profanity detected: ${foundProfanity.join(", ")}`),
486
510
  };
487
511
  }
512
+ /**
513
+ * @deprecated Use {@link toHaveNoProfanity} instead. This method only
514
+ * checks for 7 profane words — it does not analyze professional tone.
515
+ */
516
+ toBeProfessional(message) {
517
+ return this.toHaveNoProfanity(message);
518
+ }
488
519
  /**
489
520
  * Assert value has proper grammar (basic checks)
490
521
  * @example expect(output).toHaveProperGrammar()
@@ -702,6 +733,120 @@ function hasSentiment(text, expected) {
702
733
  return negativeCount > positiveCount;
703
734
  return positiveCount === negativeCount; // neutral
704
735
  }
736
+ /**
737
+ * Lexicon-based sentiment check with confidence score.
738
+ * Returns the detected sentiment, a confidence score (0–1), and whether
739
+ * it matches the expected sentiment.
740
+ *
741
+ * Confidence is derived from the magnitude of the word-count difference
742
+ * relative to the total sentiment-bearing words found.
743
+ *
744
+ * @example
745
+ * ```ts
746
+ * const { sentiment, confidence, matches } = hasSentimentWithScore(
747
+ * "This product is absolutely amazing and wonderful!",
748
+ * "positive",
749
+ * );
750
+ * // sentiment: "positive", confidence: ~0.9, matches: true
751
+ * ```
752
+ */
753
+ function hasSentimentWithScore(text, expected) {
754
+ const lower = text.toLowerCase();
755
+ const positiveWords = [
756
+ "good",
757
+ "great",
758
+ "excellent",
759
+ "amazing",
760
+ "wonderful",
761
+ "fantastic",
762
+ "love",
763
+ "best",
764
+ "happy",
765
+ "helpful",
766
+ "awesome",
767
+ "superb",
768
+ "outstanding",
769
+ "brilliant",
770
+ "perfect",
771
+ "delightful",
772
+ "joyful",
773
+ "pleased",
774
+ "glad",
775
+ "terrific",
776
+ "fabulous",
777
+ "exceptional",
778
+ "impressive",
779
+ "magnificent",
780
+ "marvelous",
781
+ "splendid",
782
+ "positive",
783
+ "enjoy",
784
+ "enjoyed",
785
+ "like",
786
+ "liked",
787
+ "beautiful",
788
+ "innovative",
789
+ "inspiring",
790
+ "effective",
791
+ "useful",
792
+ "valuable",
793
+ ];
794
+ const negativeWords = [
795
+ "bad",
796
+ "terrible",
797
+ "awful",
798
+ "horrible",
799
+ "worst",
800
+ "hate",
801
+ "poor",
802
+ "disappointing",
803
+ "sad",
804
+ "useless",
805
+ "dreadful",
806
+ "miserable",
807
+ "angry",
808
+ "frustrated",
809
+ "broken",
810
+ "failed",
811
+ "pathetic",
812
+ "stupid",
813
+ "disgusting",
814
+ "unacceptable",
815
+ "wrong",
816
+ "error",
817
+ "fail",
818
+ "problem",
819
+ "negative",
820
+ "dislike",
821
+ "annoying",
822
+ "irritating",
823
+ "offensive",
824
+ "regret",
825
+ "disappointment",
826
+ "inadequate",
827
+ "mediocre",
828
+ "flawed",
829
+ "unreliable",
830
+ ];
831
+ const positiveCount = positiveWords.filter((w) => lower.includes(w)).length;
832
+ const negativeCount = negativeWords.filter((w) => lower.includes(w)).length;
833
+ const total = positiveCount + negativeCount;
834
+ let sentiment;
835
+ let confidence;
836
+ if (positiveCount > negativeCount) {
837
+ sentiment = "positive";
838
+ confidence = total > 0 ? (positiveCount - negativeCount) / total : 0;
839
+ }
840
+ else if (negativeCount > positiveCount) {
841
+ sentiment = "negative";
842
+ confidence = total > 0 ? (negativeCount - positiveCount) / total : 0;
843
+ }
844
+ else {
845
+ sentiment = "neutral";
846
+ confidence = total === 0 ? 1 : 0; // high confidence neutral when no words found
847
+ }
848
+ return { sentiment, confidence, matches: sentiment === expected };
849
+ }
705
850
  function similarTo(text1, text2, threshold = 0.8) {
706
851
  // Simple similarity check - in a real app, you'd use a proper string similarity algorithm
707
852
  const words1 = new Set(text1.toLowerCase().split(/\s+/));
@@ -710,6 +855,74 @@ function similarTo(text1, text2, threshold = 0.8) {
710
855
  const union = new Set([...words1, ...words2]);
711
856
  return intersection.size / union.size >= threshold;
712
857
  }
858
+ /**
859
+ * Compute pairwise Jaccard similarity between word sets.
860
+ * Returns the mean of all C(n,2) pair similarities.
861
+ */
862
+ function meanPairwiseJaccard(texts) {
863
+ if (texts.length < 2)
864
+ return 1;
865
+ const wordSets = texts.map((t) => new Set(t.toLowerCase().split(/\s+/).filter(Boolean)));
866
+ let sum = 0;
867
+ let count = 0;
868
+ for (let i = 0; i < wordSets.length; i++) {
869
+ for (let j = i + 1; j < wordSets.length; j++) {
870
+ const a = wordSets[i];
871
+ const b = wordSets[j];
872
+ const intersection = new Set([...a].filter((w) => b.has(w)));
873
+ const union = new Set([...a, ...b]);
874
+ sum += union.size > 0 ? intersection.size / union.size : 1;
875
+ count++;
876
+ }
877
+ }
878
+ return count > 0 ? sum / count : 1;
879
+ }
880
+ /**
881
+ * Measure consistency across multiple outputs for the same input.
882
+ * **Fast and approximate** — uses word-overlap (Jaccard) across all pairs.
883
+ * Returns a score from 0 (completely inconsistent) to 1 (identical).
884
+ *
885
+ * @param outputs - Array of LLM outputs to compare (minimum 2)
886
+ * @param threshold - Optional minimum consistency score to return true (default 0.7)
887
+ * @returns `{ score, passed }` where `passed` is `score >= threshold`
888
+ *
889
+ * @example
890
+ * ```ts
891
+ * const { score, passed } = hasConsistency([
892
+ * "The capital of France is Paris.",
893
+ * "Paris is the capital of France.",
894
+ * "France's capital city is Paris.",
895
+ * ]);
896
+ * // score ≈ 0.6-0.8, passed = true at default threshold
897
+ * ```
898
+ */
899
+ function hasConsistency(outputs, threshold = 0.7) {
900
+ if (outputs.length < 2) {
901
+ return { score: 1, passed: true };
902
+ }
903
+ const score = meanPairwiseJaccard(outputs);
904
+ return { score, passed: score >= threshold };
905
+ }
906
+ /**
907
+ * LLM-backed consistency check. **Slow and accurate** — asks the LLM to
908
+ * judge whether multiple outputs convey the same meaning, catching
909
+ * paraphrased contradictions that word-overlap misses.
910
+ *
911
+ * @returns A score from 0 to 1 where 1 = perfectly consistent.
912
+ */
913
+ async function hasConsistencyAsync(outputs, config) {
914
+ if (outputs.length < 2) {
915
+ return { score: 1, passed: true };
916
+ }
917
+ const numbered = outputs.map((o, i) => `Output ${i + 1}: "${o}"`).join("\n");
918
+ const prompt = `Rate the semantic consistency of the following ${outputs.length} outputs on a scale from 0 to 100, where 100 means they all convey exactly the same meaning and 0 means they completely contradict each other. Reply with ONLY a number.\n\n${numbered}`;
919
+ const result = await callAssertionLLM(prompt, config);
920
+ const parsed = parseInt(result.replace(/[^0-9]/g, ""), 10);
921
+ const score = Number.isNaN(parsed)
922
+ ? 0
923
+ : Math.min(100, Math.max(0, parsed)) / 100;
924
+ return { score, passed: score >= 0.7 };
925
+ }
713
926
  function withinRange(value, min, max) {
714
927
  return value >= min && value <= max;
715
928
  }
@@ -999,8 +1212,48 @@ function hasFactualAccuracy(text, facts) {
999
1212
  const lower = text.toLowerCase();
1000
1213
  return facts.every((fact) => lower.includes(fact.toLowerCase()));
1001
1214
  }
1215
+ /**
1216
+ * Check if a measured duration is within the allowed limit.
1217
+ * @param durationMs - The actual elapsed time in milliseconds
1218
+ * @param maxMs - Maximum allowed duration in milliseconds
1219
+ */
1220
+ function respondedWithinDuration(durationMs, maxMs) {
1221
+ const passed = durationMs <= maxMs;
1222
+ return {
1223
+ name: "respondedWithinDuration",
1224
+ passed,
1225
+ expected: `<= ${maxMs}ms`,
1226
+ actual: `${durationMs}ms`,
1227
+ message: passed
1228
+ ? `Response time ${durationMs}ms is within ${maxMs}ms limit`
1229
+ : `Response time ${durationMs}ms exceeded ${maxMs}ms limit`,
1230
+ };
1231
+ }
1232
+ /**
1233
+ * Check if elapsed time since a start timestamp is within the allowed limit.
1234
+ * @param startTime - Timestamp from Date.now() captured before the operation
1235
+ * @param maxMs - Maximum allowed duration in milliseconds
1236
+ */
1237
+ function respondedWithinTimeSince(startTime, maxMs) {
1238
+ const elapsed = Date.now() - startTime;
1239
+ const passed = elapsed <= maxMs;
1240
+ return {
1241
+ name: "respondedWithinTimeSince",
1242
+ passed,
1243
+ expected: `<= ${maxMs}ms`,
1244
+ actual: `${elapsed}ms`,
1245
+ message: passed
1246
+ ? `Elapsed time ${elapsed}ms is within ${maxMs}ms limit`
1247
+ : `Elapsed time ${elapsed}ms exceeded ${maxMs}ms limit`,
1248
+ };
1249
+ }
1250
+ /**
1251
+ * @deprecated Use {@link respondedWithinDuration} (takes measured duration)
1252
+ * or {@link respondedWithinTimeSince} (takes start timestamp) instead.
1253
+ * This function takes a start timestamp, not a duration — the name is misleading.
1254
+ */
1002
1255
  function respondedWithinTime(startTime, maxMs) {
1003
- return Date.now() - startTime <= maxMs;
1256
+ return respondedWithinTimeSince(startTime, maxMs);
1004
1257
  }
1005
1258
  /**
1006
1259
  * Blocklist-based toxicity check (~80 terms across 9 categories).
@@ -1165,7 +1418,7 @@ function hasNoToxicity(text) {
1165
1418
  "motherfucker",
1166
1419
  "fucktard",
1167
1420
  ];
1168
- return !toxicTerms.some((term) => lower.includes(term));
1421
+ return !toxicTerms.some((term) => textContainsTerm(lower, term));
1169
1422
  }
1170
1423
  function followsInstructions(text, instructions) {
1171
1424
  const instructionList = Array.isArray(instructions)
@@ -1181,6 +1434,7 @@ function followsInstructions(text, instructions) {
1181
1434
  function containsAllRequiredFields(obj, requiredFields) {
1182
1435
  return requiredFields.every((field) => obj && typeof obj === "object" && field in obj);
1183
1436
  }
1437
+ const DEFAULT_ASSERTION_TIMEOUT_MS = 30000;
1184
1438
  let _assertionLLMConfig = null;
1185
1439
  function configureAssertions(config) {
1186
1440
  _assertionLLMConfig = config;
@@ -1193,65 +1447,163 @@ async function callAssertionLLM(prompt, config) {
1193
1447
  if (!cfg) {
1194
1448
  throw new Error("No LLM config set. Call configureAssertions({ provider, apiKey }) first, or pass a config as the last argument.");
1195
1449
  }
1196
- if (cfg.provider === "openai") {
1197
- const baseUrl = cfg.baseUrl ?? "https://api.openai.com";
1198
- const model = cfg.model ?? "gpt-4o-mini";
1199
- const res = await fetch(`${baseUrl}/v1/chat/completions`, {
1200
- method: "POST",
1201
- headers: {
1202
- "Content-Type": "application/json",
1203
- Authorization: `Bearer ${cfg.apiKey}`,
1204
- },
1205
- body: JSON.stringify({
1206
- model,
1207
- messages: [{ role: "user", content: prompt }],
1208
- max_tokens: 10,
1209
- temperature: 0,
1210
- }),
1211
- });
1212
- if (!res.ok) {
1213
- throw new Error(`OpenAI API error ${res.status}: ${await res.text()}`);
1450
+ const timeoutMs = cfg.timeoutMs ?? DEFAULT_ASSERTION_TIMEOUT_MS;
1451
+ const ac = typeof AbortController !== "undefined" ? new AbortController() : null;
1452
+ const fetchWithSignal = (url, init) => fetch(url, ac ? { ...init, signal: ac.signal } : init);
1453
+ const llmCall = async () => {
1454
+ if (cfg.provider === "openai") {
1455
+ const baseUrl = cfg.baseUrl ?? "https://api.openai.com";
1456
+ const model = cfg.model ?? "gpt-4o-mini";
1457
+ const res = await fetchWithSignal(`${baseUrl}/v1/chat/completions`, {
1458
+ method: "POST",
1459
+ headers: {
1460
+ "Content-Type": "application/json",
1461
+ Authorization: `Bearer ${cfg.apiKey}`,
1462
+ },
1463
+ body: JSON.stringify({
1464
+ model,
1465
+ messages: [{ role: "user", content: prompt }],
1466
+ max_tokens: 60,
1467
+ temperature: 0,
1468
+ }),
1469
+ });
1470
+ if (!res.ok) {
1471
+ throw new Error(`OpenAI API error ${res.status}: ${await res.text()}`);
1472
+ }
1473
+ const data = (await res.json());
1474
+ return data.choices[0]?.message?.content?.trim().toLowerCase() ?? "";
1214
1475
  }
1215
- const data = (await res.json());
1216
- return data.choices[0]?.message?.content?.trim().toLowerCase() ?? "";
1217
- }
1218
- if (cfg.provider === "anthropic") {
1219
- const baseUrl = cfg.baseUrl ?? "https://api.anthropic.com";
1220
- const model = cfg.model ?? "claude-3-haiku-20240307";
1221
- const res = await fetch(`${baseUrl}/v1/messages`, {
1222
- method: "POST",
1223
- headers: {
1224
- "Content-Type": "application/json",
1225
- "x-api-key": cfg.apiKey,
1226
- "anthropic-version": "2023-06-01",
1227
- },
1228
- body: JSON.stringify({
1229
- model,
1230
- max_tokens: 10,
1231
- messages: [{ role: "user", content: prompt }],
1232
- }),
1233
- });
1234
- if (!res.ok) {
1235
- throw new Error(`Anthropic API error ${res.status}: ${await res.text()}`);
1476
+ if (cfg.provider === "anthropic") {
1477
+ const baseUrl = cfg.baseUrl ?? "https://api.anthropic.com";
1478
+ const model = cfg.model ?? "claude-3-haiku-20240307";
1479
+ const res = await fetchWithSignal(`${baseUrl}/v1/messages`, {
1480
+ method: "POST",
1481
+ headers: {
1482
+ "Content-Type": "application/json",
1483
+ "x-api-key": cfg.apiKey,
1484
+ "anthropic-version": "2023-06-01",
1485
+ },
1486
+ body: JSON.stringify({
1487
+ model,
1488
+ max_tokens: 60,
1489
+ messages: [{ role: "user", content: prompt }],
1490
+ }),
1491
+ });
1492
+ if (!res.ok) {
1493
+ throw new Error(`Anthropic API error ${res.status}: ${await res.text()}`);
1494
+ }
1495
+ const data = (await res.json());
1496
+ return data.content[0]?.text?.trim().toLowerCase() ?? "";
1236
1497
  }
1237
- const data = (await res.json());
1238
- return data.content[0]?.text?.trim().toLowerCase() ?? "";
1498
+ throw new Error(`Unsupported provider: "${cfg.provider}". Use "openai" or "anthropic".`);
1499
+ };
1500
+ let timer;
1501
+ const timeoutPromise = new Promise((_, reject) => {
1502
+ timer = setTimeout(() => {
1503
+ ac?.abort();
1504
+ reject(new Error(`Assertion LLM call timed out after ${timeoutMs}ms`));
1505
+ }, timeoutMs);
1506
+ });
1507
+ try {
1508
+ return await Promise.race([llmCall(), timeoutPromise]);
1509
+ }
1510
+ finally {
1511
+ clearTimeout(timer);
1239
1512
  }
1240
- throw new Error(`Unsupported provider: "${cfg.provider}". Use "openai" or "anthropic".`);
1241
1513
  }
1242
- // ============================================================================
1243
- // LLM-BACKED ASYNC ASSERTION FUNCTIONS
1244
- // ============================================================================
1514
+ let _hasSentimentAsyncDeprecationWarned = false;
1515
+ /** @internal Reset the one-time deprecation flag. For testing only. */
1516
+ function resetSentimentDeprecationWarning() {
1517
+ _hasSentimentAsyncDeprecationWarned = false;
1518
+ }
1519
+ function makeSentimentResult(sentiment, confidence, matches) {
1520
+ return {
1521
+ sentiment,
1522
+ confidence,
1523
+ matches,
1524
+ [Symbol.toPrimitive](hint) {
1525
+ if (!_hasSentimentAsyncDeprecationWarned) {
1526
+ _hasSentimentAsyncDeprecationWarned = true;
1527
+ console.warn("[evalgate] DEPRECATION: hasSentimentAsync() now returns { sentiment, confidence, matches }. " +
1528
+ "Using it as a boolean (e.g. `if (await hasSentimentAsync(...))`) is deprecated and will be " +
1529
+ "removed in the next major version. Migrate to: `const { matches } = await hasSentimentAsync(...)`");
1530
+ }
1531
+ if (hint === "number")
1532
+ return matches ? 1 : 0;
1533
+ if (hint === "string")
1534
+ return `SentimentAsyncResult(${sentiment}, matches=${matches})`;
1535
+ return matches;
1536
+ },
1537
+ };
1538
+ }
1245
1539
  /**
1246
1540
  * LLM-backed sentiment check. **Slow and accurate** — uses an LLM to
1247
- * classify sentiment with full context awareness. Requires
1248
- * {@link configureAssertions} or an inline `config` argument.
1541
+ * classify sentiment with full context awareness and return a confidence score.
1542
+ * Requires {@link configureAssertions} or an inline `config` argument.
1249
1543
  * Falls back gracefully with a clear error if no API key is configured.
1544
+ *
1545
+ * Returns `{ sentiment, confidence, matches }` — the async layer now provides
1546
+ * the same rich return shape as {@link hasSentimentWithScore}, but powered by
1547
+ * an LLM instead of keyword counting. The `confidence` field is the LLM's
1548
+ * self-reported confidence (0–1), not a lexical heuristic.
1549
+ *
1550
+ * The returned object implements `Symbol.toPrimitive` so that legacy code
1551
+ * using `if (await hasSentimentAsync(...))` still works correctly (coerces
1552
+ * to `matches`), but a deprecation warning is emitted. Migrate to
1553
+ * destructuring: `const { matches } = await hasSentimentAsync(...)`.
1554
+ *
1555
+ * @example
1556
+ * ```ts
1557
+ * const { sentiment, confidence, matches } = await hasSentimentAsync(
1558
+ * "This product is revolutionary but overpriced",
1559
+ * "negative",
1560
+ * );
1561
+ * // sentiment: "negative", confidence: 0.7, matches: true
1562
+ * ```
1250
1563
  */
1251
1564
  async function hasSentimentAsync(text, expected, config) {
1252
- const prompt = `Classify the sentiment of the following text. Reply with exactly one word positive, negative, or neutral — and nothing else.\n\nText: "${text}"`;
1253
- const result = await callAssertionLLM(prompt, config);
1254
- return result.replace(/[^a-z]/g, "") === expected;
1565
+ const prompt = `Classify the sentiment of the following text as positive, negative, or neutral. Also rate your confidence from 0.0 to 1.0. Reply with ONLY a JSON object like {"sentiment":"positive","confidence":0.85} and nothing else.\n\nText: "${text}"`;
1566
+ const raw = await callAssertionLLM(prompt, config);
1567
+ // Parse structured response; fall back to keyword extraction if LLM doesn't return valid JSON
1568
+ let sentiment = "neutral";
1569
+ let confidence = 0.5;
1570
+ try {
1571
+ // Extract JSON from response (LLM may wrap in markdown code fences)
1572
+ const jsonMatch = raw.match(/\{[^}]+\}/);
1573
+ if (jsonMatch) {
1574
+ const parsed = JSON.parse(jsonMatch[0]);
1575
+ const s = String(parsed.sentiment ?? "").replace(/[^a-z]/g, "");
1576
+ if (s === "positive" || s === "negative" || s === "neutral") {
1577
+ sentiment = s;
1578
+ }
1579
+ const c = Number(parsed.confidence);
1580
+ if (!Number.isNaN(c) && c >= 0 && c <= 1) {
1581
+ confidence = c;
1582
+ }
1583
+ }
1584
+ else {
1585
+ // Fallback: old-style single-word response
1586
+ const cleaned = raw.replace(/[^a-z]/g, "");
1587
+ if (cleaned === "positive" ||
1588
+ cleaned === "negative" ||
1589
+ cleaned === "neutral") {
1590
+ sentiment = cleaned;
1591
+ confidence = 0.5; // no confidence info from single-word response
1592
+ }
1593
+ }
1594
+ }
1595
+ catch {
1596
+ // JSON parse failed — try plain text extraction
1597
+ const cleaned = raw.replace(/[^a-z]/g, "");
1598
+ if (cleaned.includes("positive"))
1599
+ sentiment = "positive";
1600
+ else if (cleaned.includes("negative"))
1601
+ sentiment = "negative";
1602
+ else
1603
+ sentiment = "neutral";
1604
+ confidence = 0.5;
1605
+ }
1606
+ return makeSentimentResult(sentiment, confidence, sentiment === expected);
1255
1607
  }
1256
1608
  /**
1257
1609
  * LLM-backed toxicity check. **Slow and accurate** — context-aware, handles
@@ -1289,6 +1641,108 @@ async function hasNoHallucinationsAsync(text, groundTruth, config) {
1289
1641
  const result = await callAssertionLLM(prompt, config);
1290
1642
  return result.replace(/[^a-z]/g, "") === "yes";
1291
1643
  }
1644
+ /**
1645
+ * Compute cosine similarity between two vectors.
1646
+ */
1647
+ function cosineSimilarity(a, b) {
1648
+ if (a.length !== b.length || a.length === 0)
1649
+ return 0;
1650
+ let dot = 0;
1651
+ let normA = 0;
1652
+ let normB = 0;
1653
+ for (let i = 0; i < a.length; i++) {
1654
+ dot += a[i] * b[i];
1655
+ normA += a[i] * a[i];
1656
+ normB += b[i] * b[i];
1657
+ }
1658
+ const denom = Math.sqrt(normA) * Math.sqrt(normB);
1659
+ return denom === 0 ? 0 : dot / denom;
1660
+ }
1661
+ /**
1662
+ * Fetch embeddings from OpenAI's embedding API.
1663
+ * Returns an array of embedding vectors, one per input string.
1664
+ */
1665
+ async function fetchEmbeddings(texts, config) {
1666
+ if (config.provider !== "openai") {
1667
+ throw new Error(`Embedding-based semantic containment requires provider "openai" (got "${config.provider}"). ` +
1668
+ `Set provider to "openai" or use toSemanticallyContainLLM() for LLM-prompt fallback.`);
1669
+ }
1670
+ const baseUrl = config.baseUrl ?? "https://api.openai.com";
1671
+ const model = config.embeddingModel ?? "text-embedding-3-small";
1672
+ const res = await fetch(`${baseUrl}/v1/embeddings`, {
1673
+ method: "POST",
1674
+ headers: {
1675
+ "Content-Type": "application/json",
1676
+ Authorization: `Bearer ${config.apiKey}`,
1677
+ },
1678
+ body: JSON.stringify({ model, input: texts }),
1679
+ });
1680
+ if (!res.ok) {
1681
+ throw new Error(`OpenAI Embeddings API error ${res.status}: ${await res.text()}`);
1682
+ }
1683
+ const data = (await res.json());
1684
+ // Return embeddings sorted by input index
1685
+ return data.data.sort((a, b) => a.index - b.index).map((d) => d.embedding);
1686
+ }
1687
+ /**
1688
+ * Embedding-based semantic containment check. Uses OpenAI embeddings and
1689
+ * cosine similarity to determine whether the text semantically contains
1690
+ * the given concept — no LLM prompt, no "does this text contain X" trick.
1691
+ *
1692
+ * This is **real semantic containment**: embed both strings, compute cosine
1693
+ * similarity, and compare against a threshold. "The city of lights" will
1694
+ * have high similarity to "Paris" because their embeddings are close in
1695
+ * vector space.
1696
+ *
1697
+ * Requires `provider: "openai"` in the config. For Anthropic or other
1698
+ * providers without an embedding API, use {@link toSemanticallyContainLLM}.
1699
+ *
1700
+ * @param text - The text to check
1701
+ * @param phrase - The semantic concept to look for
1702
+ * @param config - LLM config (must be OpenAI with embedding support)
1703
+ * @param threshold - Cosine similarity threshold (default: 0.4). Lower values
1704
+ * are more permissive. Typical ranges: 0.3–0.5 for concept containment,
1705
+ * 0.6–0.8 for paraphrase detection, 0.9+ for near-duplicates.
1706
+ * @returns `{ contains, similarity }` — whether the threshold was met and the raw score
1707
+ *
1708
+ * @example
1709
+ * ```ts
1710
+ * const { contains, similarity } = await toSemanticallyContain(
1711
+ * "The city of lights is beautiful in spring",
1712
+ * "Paris",
1713
+ * { provider: "openai", apiKey: process.env.OPENAI_API_KEY },
1714
+ * );
1715
+ * // contains: true, similarity: ~0.52
1716
+ * ```
1717
+ */
1718
+ async function toSemanticallyContain(text, phrase, config, threshold = 0.4) {
1719
+ const cfg = config ?? _assertionLLMConfig;
1720
+ if (!cfg) {
1721
+ throw new Error("No LLM config set. Call configureAssertions({ provider, apiKey }) first, or pass a config argument.");
1722
+ }
1723
+ const [textEmbedding, phraseEmbedding] = await fetchEmbeddings([text, phrase], cfg);
1724
+ const similarity = cosineSimilarity(textEmbedding, phraseEmbedding);
1725
+ return { contains: similarity >= threshold, similarity };
1726
+ }
1727
+ /**
1728
+ * LLM-prompt-based semantic containment check. Uses an LLM prompt to ask
1729
+ * whether the text conveys a concept. This is a **fallback** for providers
1730
+ * that don't offer an embedding API (e.g., Anthropic).
1731
+ *
1732
+ * Note: This is functionally similar to `followsInstructions` — the LLM is
1733
+ * being asked to judge containment, not compute vector similarity. For
1734
+ * real embedding-based semantic containment, use {@link toSemanticallyContain}.
1735
+ *
1736
+ * @param text - The text to check
1737
+ * @param phrase - The semantic concept to look for
1738
+ * @param config - Optional LLM config
1739
+ * @returns true if the LLM judges the text contains the concept
1740
+ */
1741
+ async function toSemanticallyContainLLM(text, phrase, config) {
1742
+ const prompt = `Does the following text semantically contain or convey the concept "${phrase}"? The text does not need to use those exact words — paraphrases, synonyms, and implied references count. Reply with only "yes" or "no".\n\nText: "${text}"`;
1743
+ const result = await callAssertionLLM(prompt, config);
1744
+ return result.replace(/[^a-z]/g, "") === "yes";
1745
+ }
1292
1746
  function hasValidCodeSyntax(code, language) {
1293
1747
  const lang = language.toLowerCase();
1294
1748
  if (lang === "json") {