evalgate-sdk 3.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80):
  1. evalgate_sdk/__init__.py +707 -0
  2. evalgate_sdk/_version.py +3 -0
  3. evalgate_sdk/assertions.py +1362 -0
  4. evalgate_sdk/auto.py +247 -0
  5. evalgate_sdk/batch.py +174 -0
  6. evalgate_sdk/cache.py +111 -0
  7. evalgate_sdk/ci_context.py +123 -0
  8. evalgate_sdk/cli/__init__.py +111 -0
  9. evalgate_sdk/cli/api.py +261 -0
  10. evalgate_sdk/cli/cli_constants.py +20 -0
  11. evalgate_sdk/cli/commands.py +1041 -0
  12. evalgate_sdk/cli/config.py +228 -0
  13. evalgate_sdk/cli/env.py +43 -0
  14. evalgate_sdk/cli/formatters/types.py +132 -0
  15. evalgate_sdk/cli/golden_commands.py +322 -0
  16. evalgate_sdk/cli/manifest.py +301 -0
  17. evalgate_sdk/cli/new_commands.py +435 -0
  18. evalgate_sdk/cli/policy_packs.py +103 -0
  19. evalgate_sdk/cli/profiles.py +12 -0
  20. evalgate_sdk/cli/regression_gate.py +312 -0
  21. evalgate_sdk/cli/render/__init__.py +1 -0
  22. evalgate_sdk/cli/render/snippet.py +18 -0
  23. evalgate_sdk/cli/render/sort.py +29 -0
  24. evalgate_sdk/cli/report/__init__.py +1 -0
  25. evalgate_sdk/cli/report/build_check_report.py +209 -0
  26. evalgate_sdk/cli/traces.py +186 -0
  27. evalgate_sdk/cli/workspace.py +63 -0
  28. evalgate_sdk/client.py +609 -0
  29. evalgate_sdk/cluster.py +359 -0
  30. evalgate_sdk/collector.py +161 -0
  31. evalgate_sdk/constants.py +6 -0
  32. evalgate_sdk/context.py +151 -0
  33. evalgate_sdk/errors.py +236 -0
  34. evalgate_sdk/export.py +238 -0
  35. evalgate_sdk/formatters/__init__.py +11 -0
  36. evalgate_sdk/formatters/github.py +51 -0
  37. evalgate_sdk/formatters/human.py +68 -0
  38. evalgate_sdk/formatters/json_fmt.py +11 -0
  39. evalgate_sdk/formatters/pr_comment.py +80 -0
  40. evalgate_sdk/golden.py +426 -0
  41. evalgate_sdk/integrations/__init__.py +1 -0
  42. evalgate_sdk/integrations/anthropic.py +99 -0
  43. evalgate_sdk/integrations/autogen.py +62 -0
  44. evalgate_sdk/integrations/crewai.py +61 -0
  45. evalgate_sdk/integrations/langchain.py +100 -0
  46. evalgate_sdk/integrations/openai.py +155 -0
  47. evalgate_sdk/integrations/openai_eval.py +221 -0
  48. evalgate_sdk/local.py +144 -0
  49. evalgate_sdk/logger.py +123 -0
  50. evalgate_sdk/matchers.py +62 -0
  51. evalgate_sdk/otel.py +256 -0
  52. evalgate_sdk/pagination.py +145 -0
  53. evalgate_sdk/py.typed +0 -0
  54. evalgate_sdk/pytest_plugin.py +96 -0
  55. evalgate_sdk/reason_codes.py +103 -0
  56. evalgate_sdk/regression.py +196 -0
  57. evalgate_sdk/replay_decision.py +115 -0
  58. evalgate_sdk/runtime/__init__.py +50 -0
  59. evalgate_sdk/runtime/adapters/__init__.py +1 -0
  60. evalgate_sdk/runtime/adapters/config_to_dsl.py +270 -0
  61. evalgate_sdk/runtime/adapters/testsuite_to_dsl.py +213 -0
  62. evalgate_sdk/runtime/context.py +68 -0
  63. evalgate_sdk/runtime/eval.py +318 -0
  64. evalgate_sdk/runtime/execution_mode.py +170 -0
  65. evalgate_sdk/runtime/executor.py +92 -0
  66. evalgate_sdk/runtime/registry.py +125 -0
  67. evalgate_sdk/runtime/run_report.py +249 -0
  68. evalgate_sdk/runtime/types.py +143 -0
  69. evalgate_sdk/snapshot.py +219 -0
  70. evalgate_sdk/streaming.py +124 -0
  71. evalgate_sdk/synthesize.py +226 -0
  72. evalgate_sdk/testing.py +128 -0
  73. evalgate_sdk/types.py +666 -0
  74. evalgate_sdk/utils/__init__.py +1 -0
  75. evalgate_sdk/utils/input_hash.py +42 -0
  76. evalgate_sdk/workflows.py +264 -0
  77. evalgate_sdk-3.3.1.dist-info/METADATA +608 -0
  78. evalgate_sdk-3.3.1.dist-info/RECORD +80 -0
  79. evalgate_sdk-3.3.1.dist-info/WHEEL +4 -0
  80. evalgate_sdk-3.3.1.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,1362 @@
1
+ """Assertion library for evaluating LLM outputs.
2
+
3
+ Provides both standalone functions and an ``expect()`` fluent API matching
4
+ the TypeScript SDK's assertion surface.
5
+ """
6
+
7
from __future__ import annotations

import json
import re
import string
import time
from collections.abc import Callable, Sequence
from dataclasses import dataclass, field
from itertools import combinations
from typing import Any
15
+
16
+
17
@dataclass
class AssertionResult:
    """Outcome of a single assertion check.

    An instance is truthy exactly when ``passed`` is true, so results can be
    used directly in ``if`` statements and ``assert`` expressions.
    """

    passed: bool  # whether the check succeeded
    assertion_type: str  # identifier of the assertion that produced the result
    message: str  # human-readable summary of the outcome
    expected: Any = None  # what the assertion was looking for, when recorded
    actual: Any = None  # what the assertion observed, when recorded

    def __bool__(self) -> bool:
        """Return the ``passed`` flag for use in boolean context."""
        outcome = self.passed
        return outcome
27
+
28
+
29
+ def _extract_embedded_json(text: str) -> str | None:
30
+ for i, start in enumerate(text):
31
+ if start not in "[{":
32
+ continue
33
+ stack = [start]
34
+ in_string = False
35
+ escaped = False
36
+ for j in range(i + 1, len(text)):
37
+ char = text[j]
38
+ if in_string:
39
+ if escaped:
40
+ escaped = False
41
+ continue
42
+ if char == "\\":
43
+ escaped = True
44
+ continue
45
+ if char == '"':
46
+ in_string = False
47
+ continue
48
+ if char == '"':
49
+ in_string = True
50
+ continue
51
+ if char in "[{":
52
+ stack.append(char)
53
+ continue
54
+ if char in "]}":
55
+ open_char = stack.pop()
56
+ if (open_char == "{" and char != "}") or (open_char == "[" and char != "]"):
57
+ break
58
+ if not stack:
59
+ candidate = text[i : j + 1]
60
+ try:
61
+ json.loads(candidate)
62
+ return candidate
63
+ except (json.JSONDecodeError, ValueError):
64
+ break
65
+ return None
66
+
67
+
68
+ def _parse_object_candidate(value: Any) -> tuple[dict[str, Any] | None, Any]:
69
+ if isinstance(value, str):
70
+ candidate = _extract_embedded_json(value)
71
+ if candidate is None:
72
+ return None, "invalid JSON string; pass a dict or valid JSON text"
73
+ try:
74
+ parsed = json.loads(candidate)
75
+ except (json.JSONDecodeError, ValueError):
76
+ return None, "invalid JSON string; pass a dict or valid JSON text"
77
+ if isinstance(parsed, dict):
78
+ return parsed, f"parsed JSON object with keys: [{', '.join(parsed.keys())}]"
79
+ return None, "parsed JSON value is not an object"
80
+ if isinstance(value, dict):
81
+ return value, f"object with keys: [{', '.join(value.keys())}]"
82
+ return None, "not an object; pass a dict or valid JSON text"
83
+
84
+
85
+ def _matches_schema_type(value: Any, expected_type: Any) -> bool:
86
+ if isinstance(expected_type, type):
87
+ return isinstance(value, expected_type)
88
+ if not isinstance(expected_type, str):
89
+ return True
90
+ if expected_type == "string":
91
+ return isinstance(value, str)
92
+ if expected_type == "number":
93
+ return isinstance(value, (int, float)) and not isinstance(value, bool)
94
+ if expected_type == "integer":
95
+ return isinstance(value, int) and not isinstance(value, bool)
96
+ if expected_type == "boolean":
97
+ return isinstance(value, bool)
98
+ if expected_type == "object":
99
+ return isinstance(value, dict)
100
+ if expected_type == "array":
101
+ return isinstance(value, list)
102
+ if expected_type == "null":
103
+ return value is None
104
+ return True
105
+
106
+
107
+ def _evaluate_schema_object(obj: dict[str, Any], schema: dict[str, Any]) -> bool:
108
+ if "required" in schema or "properties" in schema or schema.get("type") == "object":
109
+ if "type" in schema and not _matches_schema_type(obj, schema.get("type")):
110
+ return False
111
+ required = schema.get("required")
112
+ if isinstance(required, list) and not all(key in obj for key in required):
113
+ return False
114
+ properties = schema.get("properties")
115
+ if isinstance(properties, dict):
116
+ for key, property_schema in properties.items():
117
+ if key not in obj:
118
+ continue
119
+ if isinstance(property_schema, dict) and "type" in property_schema:
120
+ if not _matches_schema_type(obj[key], property_schema["type"]):
121
+ return False
122
+ return True
123
+ for key, expected_type in schema.items():
124
+ if key not in obj:
125
+ return False
126
+ if not _matches_schema_type(obj[key], expected_type):
127
+ return False
128
+ return True
129
+
130
+
131
def _normalize_assertion_result(result: Any, assertion_type: str = "unknown") -> AssertionResult:
    """Convert an arbitrary assertion return value into an ``AssertionResult``.

    * ``AssertionResult`` instances are returned unchanged.
    * Mappings with a boolean ``"passed"`` entry are treated as legacy results;
      missing fields fall back to *assertion_type*, a default message, and the
      mapping itself as ``actual``.
    * Anything else is reduced to its truthiness, with ``expected=True``.
    """
    if isinstance(result, AssertionResult):
        return result

    if isinstance(result, dict) and isinstance(result.get("passed"), bool):
        kind = str(result.get("assertion_type") or assertion_type)
        summary = str(result.get("message") or "Assertion returned legacy result mapping")
        return AssertionResult(
            passed=result["passed"],
            assertion_type=kind,
            message=summary,
            expected=result.get("expected"),
            actual=result.get("actual", result),
        )

    if isinstance(result, bool):
        verdict = result
        summary = "Assertion passed" if result else "Assertion failed"
    else:
        verdict = bool(result)
        summary = f"Assertion returned {type(result).__name__}"
    return AssertionResult(
        passed=verdict,
        assertion_type=assertion_type,
        message=summary,
        expected=True,
        actual=result,
    )
157
+
158
+
159
def _invert_assertion_result(result: AssertionResult) -> AssertionResult:
    """Return a copy of *result* with the pass/fail outcome flipped.

    The message is prefixed to show whether the negated assertion passed or
    failed; ``assertion_type``, ``expected`` and ``actual`` are carried over.
    """
    negated_passed = not result.passed
    if negated_passed:
        summary = f"Negated assertion passed: {result.message}"
    else:
        summary = f"Negated assertion failed: {result.message}"
    return AssertionResult(
        passed=negated_passed,
        assertion_type=result.assertion_type,
        message=summary,
        expected=result.expected,
        actual=result.actual,
    )
171
+
172
+
173
+ # ── Standalone assertion functions ───────────────────────────────────
174
+
175
+
176
def contains_keywords(text: str, keywords: Sequence[str]) -> AssertionResult:
    """Check that every keyword occurs in *text* (case-insensitive substring match)."""
    haystack = text.lower()
    absent = []
    for keyword in keywords:
        if keyword.lower() not in haystack:
            absent.append(keyword)
    if absent:
        summary = f"Missing keywords: {absent}"
    else:
        summary = "All keywords found"
    return AssertionResult(
        passed=not absent,
        assertion_type="containsKeywords",
        message=summary,
        expected=list(keywords),
        actual=text,
    )
187
+
188
+
189
def matches_pattern(text: str, pattern: str | re.Pattern[str]) -> AssertionResult:
    """Check that *pattern* matches somewhere in *text* (``re.search`` semantics).

    A string pattern is compiled first; a pre-compiled pattern is used as is.
    """
    compiled = re.compile(pattern) if isinstance(pattern, str) else pattern
    found = compiled.search(text) is not None
    verb = "Matches" if found else "Does not match"
    return AssertionResult(
        passed=found,
        assertion_type="matchesPattern",
        message=f"{verb} pattern {compiled.pattern}",
        expected=compiled.pattern,
        actual=text,
    )
200
+
201
+
202
def has_length(text: str, *, min: int | None = None, max: int | None = None) -> AssertionResult:
    """Check that ``len(text)`` lies within the optional inclusive bounds.

    A bound left as ``None`` is not enforced.
    """
    size = len(text)
    too_short = min is not None and size < min
    too_long = max is not None and size > max
    in_bounds = not (too_short or too_long)
    if in_bounds:
        summary = f"Length {size} is within range"
    else:
        summary = f"Length {size} not in range"
    return AssertionResult(
        passed=in_bounds,
        assertion_type="hasLength",
        message=summary,
        expected={"min": min, "max": max},
        actual=size,
    )
216
+
217
+
218
def contains_json(text: str) -> AssertionResult:
    """Check that *text* contains a valid JSON object or array.

    Detection is delegated to ``_extract_embedded_json``, which walks balanced
    brackets from each candidate start.  The previous first-opener/last-closer
    span missed valid JSON that was followed by an unrelated closing bracket
    (for example ``'{"a": 1} and {oops}'``).

    Returns:
        A passing result whose ``actual`` is the JSON snippet found, or a
        failing result whose ``actual`` is the original text.
    """
    snippet = _extract_embedded_json(text)
    if snippet is None:
        return AssertionResult(
            passed=False,
            assertion_type="containsJSON",
            message="No valid JSON found in text",
            actual=text,
        )
    return AssertionResult(
        passed=True,
        assertion_type="containsJSON",
        message="Valid JSON found",
        actual=snippet,
    )
241
+
242
+
243
+ _PII_PATTERNS = [
244
+ re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), # SSN
245
+ re.compile(r"\b\d{16}\b"), # credit card (no sep)
246
+ re.compile(r"\b\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}\b"), # credit card
247
+ re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b"), # email
248
+ re.compile(r"\b\d{3}[-.)]?\s?\d{3}[-.)]?\s?\d{4}\b"), # phone
249
+ ]
250
+
251
+
252
def not_contains_pii(text: str) -> AssertionResult:
    """Check that *text* contains no PII matched by ``_PII_PATTERNS``.

    Each detected category is reported once.  Two patterns share the
    "credit card" label, so an unseparated 16-digit number used to be listed
    twice in both the message and ``actual``.

    Returns:
        A result that passes when nothing matched; otherwise ``actual`` holds
        the list of detected category labels.
    """
    labels = ("SSN", "credit card", "credit card", "email", "phone")
    findings: list[str] = []
    for label, pattern in zip(labels, _PII_PATTERNS, strict=False):
        # Skip the search when the category has already been recorded.
        if label not in findings and pattern.search(text):
            findings.append(label)
    passed = not findings
    return AssertionResult(
        passed=passed,
        assertion_type="notContainsPII",
        message="No PII detected" if passed else f"PII detected: {', '.join(findings)}",
        expected="no PII",
        actual=findings if findings else None,
    )
266
+
267
+
268
+ _POSITIVE_WORDS = frozenset(
269
+ [
270
+ "good",
271
+ "great",
272
+ "excellent",
273
+ "wonderful",
274
+ "fantastic",
275
+ "amazing",
276
+ "love",
277
+ "best",
278
+ "happy",
279
+ "positive",
280
+ "brilliant",
281
+ "outstanding",
282
+ ]
283
+ )
284
+ _NEGATIVE_WORDS = frozenset(
285
+ [
286
+ "bad",
287
+ "terrible",
288
+ "awful",
289
+ "horrible",
290
+ "worst",
291
+ "hate",
292
+ "poor",
293
+ "negative",
294
+ "disappointing",
295
+ "dreadful",
296
+ "ugly",
297
+ ]
298
+ )
299
+
300
+
301
def has_sentiment(text: str, expected: str) -> AssertionResult:
    """Check the heuristic sentiment of *text* against *expected*.

    Sentiment is decided by counting distinct words from the positive and
    negative vocabularies.  Leading and trailing punctuation is stripped from
    each token, so ``"great!"`` or ``"(bad)"`` are recognised; splitting on
    whitespace alone missed them.

    Args:
        text: Text to analyse.
        expected: ``"positive"`` or ``"negative"``; any other value is treated
            as neutral (equal positive and negative counts).

    Returns:
        A result whose ``actual`` is the detected sentiment label.
    """
    tokens = {word.strip(string.punctuation) for word in text.lower().split()}
    pos = len(tokens & _POSITIVE_WORDS)
    neg = len(tokens & _NEGATIVE_WORDS)
    if pos > neg:
        detected = "positive"
    elif neg > pos:
        detected = "negative"
    else:
        detected = "neutral"
    if expected == "positive":
        passed = pos > neg
    elif expected == "negative":
        passed = neg > pos
    else:
        passed = pos == neg  # neutral
    return AssertionResult(
        passed=passed,
        assertion_type="hasSentiment",
        message=f"Expected {expected} sentiment, detected {detected}",
        expected=expected,
        actual=detected,
    )
319
+
320
+
321
+ def _ngrams(text: str, n: int) -> set[str]:
322
+ words = text.lower().split()
323
+ return {" ".join(words[i : i + n]) for i in range(len(words) - n + 1)}
324
+
325
+
326
def similar_to(text1: str, text2: str, threshold: float = 0.7) -> AssertionResult:
    """Compare two texts by Jaccard similarity over their unigrams and bigrams.

    If either text is empty, the texts are similar only when they are equal
    (score 1.0, otherwise 0.0) and the threshold is not consulted.
    """
    if text1 and text2:
        grams1 = _ngrams(text1, 2) | _ngrams(text1, 1)
        grams2 = _ngrams(text2, 2) | _ngrams(text2, 1)
        if grams1 or grams2:
            score = len(grams1 & grams2) / max(len(grams1 | grams2), 1)
        else:
            # Both texts are whitespace-only: nothing to tell them apart.
            score = 1.0
        passed = score >= threshold
    else:
        passed = text1 == text2
        score = 1.0 if passed else 0.0
    comparator = ">=" if passed else "<"
    return AssertionResult(
        passed=passed,
        assertion_type="similarTo",
        message=f"Similarity {score:.2f} {comparator} threshold {threshold}",
        expected=threshold,
        actual=score,
    )
343
+
344
+
345
def within_range(value: float, min_val: float, max_val: float) -> AssertionResult:
    """Check that *value* lies in the inclusive interval ``[min_val, max_val]``."""
    inside = min_val <= value <= max_val
    verdict = "is within range" if inside else "is outside range"
    bounds = {"min": min_val, "max": max_val}
    return AssertionResult(
        passed=inside,
        assertion_type="withinRange",
        message=f"{value} {verdict}",
        expected=bounds,
        actual=value,
    )
354
+
355
+
356
def is_valid_email(email: str) -> AssertionResult:
    """Check that *email* looks like ``local@domain.tld``.

    The whole string must match.  ``re.fullmatch`` is used because ``$`` also
    matches just before a trailing newline, and the TLD class is
    ``[A-Za-z]`` because the former ``[A-Z|a-z]`` accepted a literal ``|``.
    """
    passed = re.fullmatch(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", email) is not None
    return AssertionResult(
        passed=passed,
        assertion_type="isValidEmail",
        message="Valid email" if passed else "Invalid email",
        expected="valid email",
        actual=email,
    )
365
+
366
+
367
def is_valid_url(url: str) -> AssertionResult:
    """Check that *url* is an ``http``/``https`` URL containing no whitespace.

    The whole string must match (case-insensitively).  ``re.fullmatch`` is
    used because ``$`` also matches just before a trailing newline, which let
    ``"http://example.com\\n"`` through.
    """
    passed = re.fullmatch(r"https?://[^\s/$.?#].\S*", url, re.IGNORECASE) is not None
    return AssertionResult(
        passed=passed,
        assertion_type="isValidURL",
        message="Valid URL" if passed else "Invalid URL",
        expected="valid URL",
        actual=url,
    )
376
+
377
+
378
def has_no_hallucinations(text: str, ground_truth: Sequence[str]) -> AssertionResult:
    """Check that every ground-truth fact appears in *text*.

    Matching is a case-insensitive substring test.  On failure ``actual``
    lists the missing facts; on success it is the original text.
    """
    haystack = text.lower()
    absent = []
    for fact in ground_truth:
        if fact.lower() not in haystack:
            absent.append(fact)
    if absent:
        summary = f"Missing facts: {', '.join(absent)}"
    else:
        summary = "All ground truth facts found in text"
    return AssertionResult(
        passed=not absent,
        assertion_type="hasNoHallucinations",
        message=summary,
        expected=list(ground_truth),
        actual=absent or text,
    )
389
+
390
+
391
def matches_schema(value: Any, schema: dict[str, Any]) -> AssertionResult:
    """Check that *value* (a dict, or text containing a JSON object) fits *schema*.

    ``actual`` carries the description produced while parsing *value*.
    """
    candidate, description = _parse_object_candidate(value)
    if candidate is None:
        conforms = False
    else:
        conforms = _evaluate_schema_object(candidate, schema)
    summary = "Value matches schema" if conforms else "Value does not match schema"
    return AssertionResult(
        passed=conforms,
        assertion_type="matchesSchema",
        message=summary,
        expected=schema,
        actual=description,
    )
401
+
402
+
403
def has_readability_score(
    text: str,
    min_score: float | dict[str, float | None],
) -> AssertionResult:
    """Check the Flesch reading-ease score of *text* against a threshold.

    Args:
        text: Text to score.
        min_score: Either a minimum score, or a mapping with optional
            ``"min"`` and ``"max"`` entries.  A missing or ``None`` ``"min"``
            is treated as ``0.0``; a missing or ``None`` ``"max"`` means no
            upper bound.

    Returns:
        A result whose ``actual`` is the computed score.
    """
    # Count only non-empty fragments: splitting "Hi there." yields a trailing
    # empty string that used to be counted as a second sentence.
    fragments = [part for part in re.split(r"[.!?]+", text) if part.strip()]
    sentences = max(len(fragments), 1)
    words_list = text.split()
    words = max(len(words_list), 1)
    # Syllables are approximated as runs of vowels, at least one per word.
    syllables = sum(max(len(re.findall(r"[aeiouy]+", w, re.IGNORECASE)), 1) for w in words_list)
    score = 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)
    threshold = min_score if isinstance(min_score, (int, float)) else (min_score.get("min") or 0.0)
    max_threshold = min_score.get("max") if isinstance(min_score, dict) else None
    passed = score >= threshold and (max_threshold is None or score <= max_threshold)
    op = "within" if passed else "outside"
    return AssertionResult(
        passed=passed,
        assertion_type="hasReadabilityScore",
        message=f"Readability {score:.1f} is {op} threshold",
        expected=min_score,
        actual=score,
    )
423
+
424
+
425
def contains_language(text: str, language: str) -> AssertionResult:
    """Heuristically check that *text* is written in *language*.

    The text passes when it contains at least one common marker word for the
    language (English, Spanish, French and German are known).  A language
    without a marker list passes unconditionally.
    """
    marker_table: dict[str, list[str]] = {
        "english": ["the", "is", "and", "of", "to"],
        "spanish": ["el", "la", "de", "en", "es"],
        "french": ["le", "la", "de", "et", "les"],
        "german": ["der", "die", "und", "ist", "das"],
    }
    candidates = marker_table.get(language.lower(), [])
    if candidates:
        vocabulary = set(text.lower().split())
        found = not vocabulary.isdisjoint(candidates)
        if found:
            summary = f"Text contains language markers for {language}"
        else:
            summary = f"Text does not contain language markers for {language}"
    else:
        found = True
        summary = f"No heuristic available for language '{language}'"
    return AssertionResult(
        passed=found,
        assertion_type="containsLanguage",
        message=summary,
        expected=language,
        actual=text,
    )
450
+
451
+
452
def has_factual_accuracy(
    text: str,
    facts: Sequence[str],
    threshold: float = 0.8,
) -> AssertionResult:
    """Check that *text* is consistent with every fact in *facts*.

    For each non-empty fact, two tests are applied in order:

    1. Every capitalised word of the fact must occur in the text
       (case-insensitive substring match); one missing entity fails at once.
    2. The share of the fact's lower-cased words that also occur as words in
       the text must be at least *threshold*.

    The first failing fact determines the returned message.
    """

    def _failure(reason: str) -> AssertionResult:
        return AssertionResult(
            passed=False,
            assertion_type="hasFactualAccuracy",
            message=reason,
            expected=list(facts),
            actual=text,
        )

    haystack = text.lower()
    vocabulary = set(haystack.split())
    for fact in facts:
        fact_vocabulary = set(fact.lower().split())
        if not fact_vocabulary:
            continue
        capitalised = {token for token in fact.split() if token[0:1].isupper()}
        for name in capitalised:
            if name.lower() not in haystack:
                return _failure(f"Missing entity: {name}")
        coverage = len(vocabulary & fact_vocabulary) / len(fact_vocabulary)
        if coverage < threshold:
            return _failure(f"Fact overlap {coverage:.2f} below threshold for '{fact}'")
    return AssertionResult(
        passed=True,
        assertion_type="hasFactualAccuracy",
        message="Text is factually consistent with all provided facts",
        expected=list(facts),
        actual=text,
    )
502
+
503
+
504
def responded_within_time(start_time: float, max_ms: float) -> AssertionResult:
    """Delegate to ``responded_within_time_since`` with the same arguments."""
    outcome = responded_within_time_since(start_time, max_ms)
    return outcome
506
+
507
+
508
+ _TOXIC_WORDS = [
509
+ # ── Profanity ────────────────────────────────────────────────
510
+ "fuck",
511
+ "fucking",
512
+ "fucked",
513
+ "fucker",
514
+ "motherfucker",
515
+ "shit",
516
+ "shitty",
517
+ "bullshit",
518
+ "horseshit",
519
+ "ass",
520
+ "asshole",
521
+ "arsehole",
522
+ "arse",
523
+ "bitch",
524
+ "bitches",
525
+ "bitchy",
526
+ "bastard",
527
+ "damn",
528
+ "dammit",
529
+ "crap",
530
+ "crappy",
531
+ "dick",
532
+ "dickhead",
533
+ "piss",
534
+ "pissed",
535
+ "cunt",
536
+ "twat",
537
+ "wanker",
538
+ "bollocks",
539
+ "cock",
540
+ "cocksucker",
541
+ # ── Racial / ethnic slurs ────────────────────────────────────
542
+ "nigger",
543
+ "nigga",
544
+ "negro",
545
+ "coon",
546
+ "darkie",
547
+ "chink",
548
+ "gook",
549
+ "spic",
550
+ "wetback",
551
+ "beaner",
552
+ "kike",
553
+ "yid",
554
+ "towelhead",
555
+ "raghead",
556
+ "camel jockey",
557
+ "cracker",
558
+ "honky",
559
+ "gringo",
560
+ "redskin",
561
+ "injun",
562
+ "paki",
563
+ "wog",
564
+ "zipperhead",
565
+ # ── Homophobic / transphobic slurs ───────────────────────────
566
+ "faggot",
567
+ "fag",
568
+ "dyke",
569
+ "homo",
570
+ "tranny",
571
+ "shemale",
572
+ "he-she",
573
+ "ladyboy",
574
+ # ── Gendered / sexual slurs ──────────────────────────────────
575
+ "slut",
576
+ "whore",
577
+ "skank",
578
+ "hoe",
579
+ "thot",
580
+ "bimbo",
581
+ "tramp",
582
+ "hooker",
583
+ "prostitute",
584
+ # ── Ableist slurs ────────────────────────────────────────────
585
+ "retard",
586
+ "retarded",
587
+ "spaz",
588
+ "spastic",
589
+ "cripple",
590
+ "midget",
591
+ "mongoloid",
592
+ # ── General insults / dehumanisation ─────────────────────────
593
+ "idiot",
594
+ "stupid",
595
+ "moron",
596
+ "dumb",
597
+ "loser",
598
+ "imbecile",
599
+ "cretin",
600
+ "dimwit",
601
+ "halfwit",
602
+ "degenerate",
603
+ "subhuman",
604
+ "vermin",
605
+ "scum",
606
+ "trash",
607
+ "worthless",
608
+ "pathetic",
609
+ "disgusting",
610
+ # ── Threats / violence ───────────────────────────────────────
611
+ "kill yourself",
612
+ "kys",
613
+ "die",
614
+ "go die",
615
+ "i will kill you",
616
+ "kill you",
617
+ "murder you",
618
+ "shoot you",
619
+ "stab you",
620
+ "hang yourself",
621
+ "hope you die",
622
+ "wish you were dead",
623
+ "shut up",
624
+ "hate you",
625
+ # ── Harassment / sexual harassment ───────────────────────────
626
+ "neck yourself",
627
+ "send nudes",
628
+ "suck my dick",
629
+ "blow me",
630
+ "eat shit",
631
+ "go to hell",
632
+ "burn in hell",
633
+ ]
634
+
635
# One case-insensitive, whole-word pattern per entry of _TOXIC_WORDS, in the
# same order so the two lists can be zipped together.
_TOXIC_PATTERNS = [
    re.compile(rf"\b{re.escape(term)}\b", re.IGNORECASE)
    for term in _TOXIC_WORDS
]
636
+
637
+
638
def has_no_toxicity(text: str) -> AssertionResult:
    """Check that *text* matches none of the whole-word toxic-term patterns.

    On failure ``actual`` lists the matched terms; on success it is ``None``.
    """
    hits = []
    for term, pattern in zip(_TOXIC_WORDS, _TOXIC_PATTERNS, strict=False):
        if pattern.search(text):
            hits.append(term)
    clean = not hits
    if clean:
        summary = "No toxic content detected"
    else:
        summary = f"Toxic content detected: {hits}"
    return AssertionResult(
        passed=clean,
        assertion_type="hasNoToxicity",
        message=summary,
        actual=hits or None,
    )
647
+
648
+
649
def follows_instructions(text: str, instructions: Sequence[str]) -> AssertionResult:
    """Check that each instruction string occurs in *text* (case-insensitive).

    On failure ``actual`` lists the instructions not found; on success it is
    the original text.
    """
    haystack = text.lower()
    absent = []
    for instruction in instructions:
        if instruction.lower() not in haystack:
            absent.append(instruction)
    if absent:
        summary = f"Missing instructions: {absent}"
    else:
        summary = "Follows all instructions"
    return AssertionResult(
        passed=not absent,
        assertion_type="followsInstructions",
        message=summary,
        expected=list(instructions),
        actual=absent or text,
    )
660
+
661
+
662
def contains_all_required_fields(obj: Any, required_fields: Sequence[str]) -> AssertionResult:
    """Check that the dict *obj* has every key in *required_fields*.

    A non-dict *obj* fails immediately.  On failure ``actual`` lists the
    missing keys; on success it lists the keys of *obj*.
    """
    wanted = list(required_fields)
    if isinstance(obj, dict):
        absent = [name for name in required_fields if name not in obj]
        complete = not absent
        summary = "All required fields present" if complete else f"Missing fields: {absent}"
        observed = absent if absent else list(obj.keys())
    else:
        complete = False
        summary = "Value is not an object"
        observed = obj
    return AssertionResult(
        passed=complete,
        assertion_type="containsAllRequiredFields",
        message=summary,
        expected=wanted,
        actual=observed,
    )
680
+
681
+
682
+ _CODE_PATTERNS: dict[str, re.Pattern[str]] = {
683
+ "javascript": re.compile(
684
+ r"(function\s+\w+\s*\(|const\s+\w+\s*=|let\s+\w+\s*=|var\s+\w+\s*=|=>\s*[{(]|module\.exports\s*=)"
685
+ ),
686
+ "typescript": re.compile(
687
+ r"(interface\s+\w+|type\s+\w+\s*=|const\s+\w+\s*[=:]|function\s+\w+|=>\s*[{(]|export\s+(default\s+)?[{a-z])"
688
+ ),
689
+ "java": re.compile(r"(public\s+class\s+|private\s+|void\s+\w+\s*\(|import\s+java\.)"),
690
+ "go": re.compile(r'(func\s+\w+|package\s+\w+|import\s+[("]\s*|type\s+\w+\s+struct)'),
691
+ "rust": re.compile(r"(fn\s+\w+|let\s+(mut\s+)?\w+|pub\s+(fn|struct|enum)|use\s+\w+|impl\s+\w+|struct\s+\w+)"),
692
+ }
693
+
694
+
695
def has_valid_code_syntax(code: str, language: str) -> AssertionResult:
    """Check that *code* looks syntactically valid for *language*.

    * ``python`` is parsed with ``ast.parse``.
    * Languages listed in ``_CODE_PATTERNS`` pass when their heuristic regex
      matches anywhere in the code.
    * Any other language passes when the code is not blank.

    *language* is matched case-insensitively.
    """
    lang = language.lower()
    if lang == "python":
        import ast

        try:
            ast.parse(code)
        except (SyntaxError, ValueError):
            # ValueError is raised for source containing NUL bytes on some
            # interpreter versions; treat it as invalid code, not a crash.
            passed = False
        else:
            passed = True
    else:
        pattern = _CODE_PATTERNS.get(lang)
        if pattern is None:
            passed = len(code.strip()) > 0
        else:
            passed = pattern.search(code) is not None
    return AssertionResult(
        passed=passed,
        assertion_type="hasValidCodeSyntax",
        message=f"Valid {language} syntax" if passed else f"Invalid {language} syntax",
        expected=language,
        actual=code,
    )
718
+
719
+
720
def has_pii(text: str) -> bool:
    """Return ``True`` when ``not_contains_pii`` reports a failure for *text*."""
    check = not_contains_pii(text)
    return not check.passed
723
+
724
+
725
def has_sentiment_with_score(
    text: str,
    expected: str,
) -> dict[str, Any]:
    """Return sentiment, confidence score, and whether it matches *expected*.

    Sentiment is decided by counting distinct words from the positive and
    negative vocabularies.  Leading and trailing punctuation is stripped from
    each token, so ``"great!"`` is recognised; splitting on whitespace alone
    missed it.

    Confidence is ``0.5`` for neutral text.  Otherwise it is
    ``0.5 + 0.5 * margin * magnitude``, where *margin* is the lead of the
    winning polarity among sentiment words and *magnitude* is the share of
    sentiment words among all words, with the word count floored at five so a
    lone sentiment word cannot reach 1.0.

    Returns:
        ``{"sentiment": str, "confidence": float, "matches": bool}`` with the
        confidence rounded to four decimals.
    """
    all_words = text.lower().split()
    word_set = {word.strip(string.punctuation) for word in all_words}
    total_words = len(all_words)
    pos = len(word_set & _POSITIVE_WORDS)
    neg = len(word_set & _NEGATIVE_WORDS)
    sentiment_count = pos + neg

    # Minimum amount of context required before confidence can approach 1.0.
    _MIN_EVIDENCE_WORDS = 5

    if pos == neg:
        # Covers both "no sentiment words at all" and an exact tie.
        sentiment = "neutral"
        confidence = 0.5
    else:
        sentiment = "positive" if pos > neg else "negative"
        margin = abs(pos - neg) / sentiment_count
        magnitude = min(sentiment_count / max(total_words, _MIN_EVIDENCE_WORDS), 1.0)
        confidence = 0.5 + 0.5 * margin * magnitude

    return {"sentiment": sentiment, "confidence": round(confidence, 4), "matches": sentiment == expected}
766
+
767
+
768
def has_consistency(outputs: Sequence[str], threshold: float = 0.7) -> dict[str, Any]:
    """Check multi-output consistency using pairwise similarity.

    Every pair of outputs is compared by Jaccard similarity over unigrams and
    bigrams, and the pair scores are averaged.  Two outputs that both contain
    no words count as identical (1.0), matching ``similar_to``; they used to
    score 0.0.

    Args:
        outputs: Texts to compare; fewer than two is trivially consistent.
        threshold: Minimum average similarity required to pass.

    Returns:
        ``{"score": float, "passed": bool}``.
    """
    if len(outputs) < 2:
        return {"score": 1.0, "passed": True}
    # Build each output's n-gram set once instead of once per pair.
    gram_sets = [_ngrams(output, 2) | _ngrams(output, 1) for output in outputs]
    scores = []
    for a, b in combinations(gram_sets, 2):
        if not a and not b:
            scores.append(1.0)
        else:
            scores.append(len(a & b) / len(a | b))
    avg = sum(scores) / len(scores)
    return {"score": avg, "passed": avg >= threshold}
787
+
788
+
789
def responded_within_duration(duration_ms: float, max_ms: float) -> AssertionResult:
    """Check that an already-measured duration does not exceed *max_ms*.

    Both values are in milliseconds and the limit is inclusive.
    """
    within = duration_ms <= max_ms
    comparator = "<=" if within else ">"
    return AssertionResult(
        passed=within,
        assertion_type="respondedWithinDuration",
        message=f"Duration {duration_ms:.1f}ms {comparator} {max_ms:.1f}ms",
        expected=max_ms,
        actual=duration_ms,
    )
802
+
803
+
804
def responded_within_time_since(start_time: float, max_ms: float) -> AssertionResult:
    """Check that the time elapsed since *start_time* does not exceed *max_ms*.

    *start_time* is a ``time.time()`` timestamp in seconds; the elapsed time
    is reported in milliseconds and the limit is inclusive.
    """
    now = time.time()
    elapsed_ms = (now - start_time) * 1000
    within = elapsed_ms <= max_ms
    comparator = "<=" if within else ">"
    return AssertionResult(
        passed=within,
        assertion_type="respondedWithinTimeSince",
        message=f"Elapsed {elapsed_ms:.1f}ms {comparator} {max_ms:.1f}ms",
        expected=max_ms,
        actual=elapsed_ms,
    )
818
+
819
+
820
+ # ── LLM-backed assertion configuration ──────────────────────────────
821
+
822
+
823
@dataclass
class AssertionLLMConfig:
    """Configuration for LLM-backed async assertions."""

    provider: str = "openai"  # "openai" and "anthropic" are handled by ``_llm_ask``
    model: str = "gpt-4o-mini"
    # Excluded from the generated repr so the secret does not end up in logs
    # or tracebacks when a config object is printed.
    api_key: str | None = field(default=None, repr=False)
    temperature: float = 0.0
    max_tokens: int = 100
    timeout_ms: int = 30_000  # per-call timeout, in milliseconds
833
+
834
+
835
# Module-wide LLM configuration used by the async assertions; ``None`` until
# ``configure_assertions`` has been called.
_assertion_llm_config: AssertionLLMConfig | None = None


def configure_assertions(config: AssertionLLMConfig | None = None, /, **kwargs: Any) -> None:
    """Install the global LLM configuration for async assertions.

    Call with a ready ``AssertionLLMConfig`` (positional only) or with keyword
    fields such as ``provider=``, ``api_key=``, ``model=`` and
    ``timeout_ms=``, from which a new config is built.

    Raises:
        TypeError: If both forms are combined, or if neither is supplied.
    """
    global _assertion_llm_config
    if config is None:
        if not kwargs:
            raise TypeError(
                "configure_assertions() requires an AssertionLLMConfig instance or keyword arguments."
            )
        config = AssertionLLMConfig(**kwargs)
    elif kwargs:
        raise TypeError("Pass either an AssertionLLMConfig instance or keyword arguments, not both.")
    _assertion_llm_config = config
855
+
856
+
857
def get_assertion_config() -> AssertionLLMConfig | None:
    """Return the global LLM assertion config, or ``None`` when none is set."""
    current = _assertion_llm_config
    return current
860
+
861
+
862
async def _llm_ask(prompt: str, config: AssertionLLMConfig | None = None) -> str:
    """Send a single-turn prompt to the configured LLM and return its text reply.

    The provider SDK is imported lazily, so only the selected provider's
    optional dependency is needed, for example
    ``pip install "evalgate-sdk[openai]"`` for the OpenAI provider.

    Args:
        prompt: Sent as the sole user message of the conversation.
        config: Per-call configuration. When omitted, the module-wide config
            stored by ``configure_assertions()`` is used.

    Returns:
        The reply text with surrounding whitespace stripped, or ``""`` when
        the provider returns no choices / content blocks.

    Raises:
        RuntimeError: No config was passed and none is configured globally.
        ImportError: The SDK for the selected provider is not installed.
        ValueError: ``provider`` is neither ``"openai"`` nor ``"anthropic"``.
        asyncio.TimeoutError: The call exceeded ``timeout_ms``.
    """
    import asyncio

    cfg = config if config is not None else _assertion_llm_config
    if cfg is None:
        raise RuntimeError("No LLM config set. Call configure_assertions() first or pass a config.")

    # asyncio.wait_for takes seconds; the config stores milliseconds.
    timeout_s = cfg.timeout_ms / 1000.0
    messages = [{"role": "user", "content": prompt}]

    async def _call() -> str:
        if cfg.provider == "openai":
            try:
                import openai
            except ImportError as exc:
                raise ImportError(
                    'openai package required for async assertions: pip install "evalgate-sdk[openai]"'
                ) from exc
            client = openai.AsyncOpenAI(api_key=cfg.api_key)
            resp = await client.chat.completions.create(
                model=cfg.model,
                temperature=cfg.temperature,
                max_tokens=cfg.max_tokens,
                messages=messages,
            )
            if not resp.choices:
                # Nothing to read; report an empty answer instead of IndexError.
                return ""
            return (resp.choices[0].message.content or "").strip()

        if cfg.provider == "anthropic":
            try:
                import anthropic
            except ImportError as exc:
                raise ImportError(
                    'anthropic package required for async assertions: pip install "evalgate-sdk[anthropic]"'
                ) from exc
            client = anthropic.AsyncAnthropic(api_key=cfg.api_key)
            resp = await client.messages.create(
                model=cfg.model,
                max_tokens=cfg.max_tokens,
                # Forward the configured temperature, matching the OpenAI branch.
                temperature=cfg.temperature,
                messages=messages,
            )
            if not resp.content:
                return ""
            block = resp.content[0]
            # Blocks without a ``text`` attribute fall back to their string form.
            return (block.text if hasattr(block, "text") else str(block)).strip()

        raise ValueError(f"Unsupported LLM provider: {cfg.provider}")

    # Bound the whole request so a hung LLM call cannot block the run forever.
    return await asyncio.wait_for(_call(), timeout=timeout_s)
915
+
916
+
917
+ # ── Async / LLM-backed assertion functions ───────────────────────────
918
+
919
+
920
async def has_sentiment_async(
    text: str,
    expected: str,
    config: AssertionLLMConfig | None = None,
) -> bool:
    """LLM-powered sentiment analysis — more accurate than the heuristic version.

    Requires ``pip install "evalgate-sdk[openai]"`` for the default provider.

    Args:
        text: Text whose sentiment is classified.
        expected: Expected label (``positive``, ``negative`` or ``neutral``);
            compared case-insensitively, ignoring surrounding whitespace.
        config: Optional per-call LLM config; the global one is used otherwise.

    Returns:
        ``True`` when the model's one-word label equals ``expected``.
    """
    result = await _llm_ask(
        "Classify the sentiment of the following text as exactly one word: "
        f'positive, negative, or neutral.\n\nText: "{text}"\n\nSentiment:',
        config,
    )
    # Keep letters only, as the yes/no assertions in this module do, so
    # replies such as "Positive.", "positive!" or '"positive"' still match.
    label = re.sub(r"[^a-z]", "", result.lower())
    return label == expected.strip().lower()
935
+
936
+
937
async def has_no_toxicity_async(
    text: str,
    config: AssertionLLMConfig | None = None,
) -> bool:
    """Ask the LLM whether ``text`` is free of toxic, harmful or offensive content.

    Requires ``pip install "evalgate-sdk[openai]"`` for the default provider.

    Returns:
        ``True`` only when the model's reply, reduced to lowercase letters,
        is exactly ``"no"``.
    """
    prompt = (
        "Does the following text contain toxic, harmful, or offensive content? "
        'Answer exactly "yes" or "no".\n\n'
        f'Text: "{text}"\n\nAnswer:'
    )
    reply = await _llm_ask(prompt, config)
    # Drop punctuation, whitespace and digits before comparing.
    verdict = re.sub(r"[^a-z]", "", reply.lower())
    return verdict == "no"
951
+
952
+
953
async def contains_language_async(
    text: str,
    language: str,
    config: AssertionLLMConfig | None = None,
) -> bool:
    """Ask the LLM whether ``text`` is written in ``language``.

    Requires ``pip install "evalgate-sdk[openai]"`` for the default provider.

    Returns:
        ``True`` only when the model's reply, reduced to lowercase letters,
        is exactly ``"yes"``.
    """
    prompt = (
        f"Is the following text written in {language}? "
        'Answer exactly "yes" or "no".\n\n'
        f'Text: "{text}"\n\nAnswer:'
    )
    reply = await _llm_ask(prompt, config)
    verdict = re.sub(r"[^a-z]", "", reply.lower())
    return verdict == "yes"
967
+
968
+
969
async def has_valid_code_syntax_async(
    code: str,
    language: str,
    config: AssertionLLMConfig | None = None,
) -> bool:
    """Ask the LLM whether ``code`` is syntactically valid ``language``.

    The code is sent inside a Markdown fence tagged with ``language``.

    Requires ``pip install "evalgate-sdk[openai]"`` for the default provider.

    Returns:
        ``True`` only when the model's reply, reduced to lowercase letters,
        is exactly ``"yes"``.
    """
    question = f"Is the following code valid {language} syntax? "
    instructions = 'Answer exactly "yes" or "no".\n\n'
    fenced = f"```{language}\n{code}\n```\n\nAnswer:"
    reply = await _llm_ask(question + instructions + fenced, config)
    verdict = re.sub(r"[^a-z]", "", reply.lower())
    return verdict == "yes"
984
+
985
+
986
async def has_factual_accuracy_async(
    text: str,
    facts: Sequence[str],
    config: AssertionLLMConfig | None = None,
) -> bool:
    """Ask the LLM whether ``text`` accurately reflects every entry in ``facts``.

    Requires ``pip install "evalgate-sdk[openai]"`` for the default provider.

    Returns:
        ``True`` only when the model's reply, reduced to lowercase letters,
        is exactly ``"yes"``.
    """
    # Render the facts as a dash-prefixed bullet list, one per line.
    bullets = [f"- {fact}" for fact in facts]
    facts_block = "\n".join(bullets)
    prompt = (
        "Does the following text accurately reflect ALL of these facts? "
        'Answer exactly "yes" or "no".\n\n'
        f"Facts:\n{facts_block}\n\n"
        f'Text: "{text}"\n\nAnswer:'
    )
    reply = await _llm_ask(prompt, config)
    verdict = re.sub(r"[^a-z]", "", reply.lower())
    return verdict == "yes"
1003
+
1004
+
1005
async def has_no_hallucinations_async(
    text: str,
    ground_truth: Sequence[str],
    config: AssertionLLMConfig | None = None,
) -> bool:
    """Ask the LLM whether ``text`` stays within the supplied ground truth.

    Requires ``pip install "evalgate-sdk[openai]"`` for the default provider.

    Returns:
        ``True`` only when the model's reply, reduced to lowercase letters,
        is exactly ``"yes"`` (the prompt defines "yes" as "no hallucinations").
    """
    # Render the ground truth as a dash-prefixed bullet list, one per line.
    bullets = [f"- {item}" for item in ground_truth]
    truth_block = "\n".join(bullets)
    prompt = (
        "Does the following text contain ONLY information supported by the given ground truth? "
        'Answer exactly "yes" if no hallucinations, "no" if it contains fabricated claims.\n\n'
        f"Ground truth:\n{truth_block}\n\n"
        f'Text: "{text}"\n\nAnswer:'
    )
    reply = await _llm_ask(prompt, config)
    verdict = re.sub(r"[^a-z]", "", reply.lower())
    return verdict == "yes"
1022
+
1023
+
1024
async def has_consistency_async(
    outputs: Sequence[str],
    config: AssertionLLMConfig | None = None,
) -> dict[str, Any]:
    """LLM-powered multi-output consistency check.

    Requires ``pip install "evalgate-sdk[openai]"`` for the default provider.

    Args:
        outputs: Outputs to compare. Fewer than two are consistent by
            definition and are not sent to the LLM.
        config: Optional per-call LLM config; the global one is used otherwise.

    Returns:
        ``{"score": float, "passed": bool}`` where ``score`` is clamped to
        ``[0.0, 1.0]`` and ``passed`` means ``score >= 0.7``. When no number
        can be found in the reply, ``score`` falls back to ``0.5``.
    """
    if len(outputs) < 2:
        return {"score": 1.0, "passed": True}
    outputs_str = "\n".join(f"Output {i + 1}: {o}" for i, o in enumerate(outputs))
    result = await _llm_ask(
        f"Rate the consistency of these outputs on a scale of 0.0 to 1.0 "
        f"(1.0 = perfectly consistent). Reply with ONLY the number.\n\n{outputs_str}\n\nScore:",
        config,
    )
    # Require actual digits: a bare ``[\d.]+`` would also match runs of dots
    # ("Well... 0.9") and mask the real number behind a parse failure.
    match = re.search(r"\d+(?:\.\d+)?|\.\d+", result)
    score = max(0.0, min(1.0, float(match.group()))) if match else 0.5
    return {"score": score, "passed": score >= 0.7}
1046
+
1047
+
1048
async def to_semantically_contain(
    text: str,
    phrase: str,
    config: AssertionLLMConfig | None = None,
) -> dict[str, Any]:
    """LLM-powered semantic containment check.

    Requires ``pip install "evalgate-sdk[openai]"`` for the default provider.

    Args:
        text: Text to inspect.
        phrase: Meaning that ``text`` is expected to contain or convey.
        config: Optional per-call LLM config; the global one is used otherwise.

    Returns:
        ``{"contains": bool, "similarity": float}``. When the reply is not a
        usable JSON object, ``contains`` is inferred from the presence of
        "true" or "yes" in the reply and ``similarity`` is ``0.5`` or ``0.0``.
    """
    result = await _llm_ask(
        f'Does the following text semantically contain or convey the meaning of "{phrase}"? '
        f'Answer with a JSON object: {{"contains": true/false, "similarity": 0.0-1.0}}\n\n'
        f'Text: "{text}"\n\nAnswer:',
        config,
    )
    # Models often wrap the JSON in prose or a Markdown fence, so parse only
    # the outermost {...} span when one is present.
    start, end = result.find("{"), result.rfind("}")
    candidate = result[start : end + 1] if 0 <= start < end else result
    try:
        parsed = json.loads(candidate)
        if isinstance(parsed, dict):
            return {
                "contains": bool(parsed.get("contains")),
                "similarity": float(parsed.get("similarity", 0)),
            }
    except (TypeError, ValueError):
        # Invalid JSON (JSONDecodeError is a ValueError) or a non-numeric
        # similarity: fall through to the keyword heuristic below.
        pass
    lowered = result.lower()
    contains = "true" in lowered or "yes" in lowered
    return {"contains": contains, "similarity": 0.5 if contains else 0.0}
1071
+
1072
+
1073
+ # ── Fluent expect() API ──────────────────────────────────────────────
1074
+
1075
+
1076
class _NegatedExpectation:
    """Proxy returned by ``Expectation.not_`` that inverts assertion outcomes.

    Any public attribute is looked up on a fresh ``Expectation`` for the same
    value. Callables are wrapped so their result is normalized and inverted.
    """

    def __init__(self, value: Any) -> None:
        self._value = value

    @property
    def not_(self) -> Expectation:
        """Negating twice yields a plain ``Expectation`` over the same value."""
        return Expectation(self._value)

    def __getattr__(self, name: str) -> Any:
        # Only reached when normal lookup fails. Private and dunder names are
        # never delegated: copy/pickle probe names such as ``__setstate__`` on
        # instances whose ``_value`` is not set yet, and reading
        # ``self._value`` here would then recurse without end.
        if name.startswith("_"):
            raise AttributeError(f"{type(self).__name__!r} object has no attribute {name!r}")

        attribute = getattr(Expectation(self._value), name)
        if not callable(attribute):
            return attribute

        def wrapper(*args: Any, **kwargs: Any) -> Any:
            result = attribute(*args, **kwargs)
            normalized = _normalize_assertion_result(result, name)
            return _invert_assertion_result(normalized)

        return wrapper
1095
+
1096
+
1097
class Expectation:
    """Chainable assertion builder returned by ``expect(value)``.

    Every ``to_*`` method returns an ``AssertionResult`` instead of raising.
    A caller-supplied ``message`` overrides the default message when non-empty.
    """

    def __init__(self, value: Any) -> None:
        self._value = value

    @property
    def not_(self) -> _NegatedExpectation:
        """Return a proxy whose assertions report the inverted outcome."""
        return _NegatedExpectation(self._value)

    def to_equal(self, expected: Any, message: str = "") -> AssertionResult:
        """Pass when the value compares equal (``==``) to ``expected``."""
        passed = self._value == expected
        return AssertionResult(
            passed=passed,
            assertion_type="equal",
            message=message or f"Expected {expected!r}, got {self._value!r}",
            expected=expected,
            actual=self._value,
        )

    def to_contain(self, substring: str, message: str = "") -> AssertionResult:
        """Pass when ``substring`` occurs in the string form of the value."""
        passed = substring in str(self._value)
        return AssertionResult(
            passed=passed,
            assertion_type="contain",
            message=message or f"Expected to contain '{substring}'",
            expected=substring,
            actual=self._value,
        )

    def to_contain_keywords(self, keywords: Sequence[str], message: str = "") -> AssertionResult:
        """Delegate to ``contains_keywords`` on the string form of the value."""
        result = contains_keywords(str(self._value), keywords)
        return AssertionResult(
            passed=result.passed,
            assertion_type="containsKeywords",
            message=message or result.message,
            expected=keywords,
            actual=self._value,
        )

    def to_not_contain(self, substring: str, message: str = "") -> AssertionResult:
        """Pass when ``substring`` is absent from the string form of the value."""
        passed = substring not in str(self._value)
        return AssertionResult(
            passed=passed,
            assertion_type="notContain",
            message=message or f"Expected not to contain '{substring}'",
            expected=substring,
            actual=self._value,
        )

    def to_not_contain_pii(self, message: str = "") -> AssertionResult:
        """Delegate to ``not_contains_pii`` on the string form of the value."""
        result = not_contains_pii(str(self._value))
        return AssertionResult(
            passed=result.passed,
            assertion_type="notContainsPII",
            message=message or result.message,
            expected=result.expected,
            actual=result.actual,
        )

    def to_match_pattern(self, pattern: str | re.Pattern[str], message: str = "") -> AssertionResult:
        """Delegate to ``matches_pattern`` on the string form of the value."""
        result = matches_pattern(str(self._value), pattern)
        return AssertionResult(
            passed=result.passed,
            assertion_type="matchesPattern",
            message=message or result.message,
            expected=result.expected,
            actual=result.actual,
        )

    def to_be_valid_json(self, message: str = "") -> AssertionResult:
        """Pass when the string form of the value parses with ``json.loads``."""
        try:
            json.loads(str(self._value))
            passed = True
        except (json.JSONDecodeError, ValueError):
            passed = False
        return AssertionResult(
            passed=passed,
            assertion_type="validJSON",
            message=message or "Expected valid JSON",
        )

    def to_match_json(self, schema: dict[str, Any], message: str = "") -> AssertionResult:
        """Delegate to ``matches_schema`` with the raw (unstringified) value."""
        result = matches_schema(self._value, schema)
        return AssertionResult(
            passed=result.passed,
            assertion_type="matchesJSON",
            message=message or result.message,
            expected=schema,
            actual=result.actual,
        )

    def to_have_sentiment(self, expected: str, message: str = "") -> AssertionResult:
        """Delegate to ``has_sentiment`` on the string form of the value."""
        result = has_sentiment(str(self._value), expected)
        return AssertionResult(
            passed=result.passed,
            assertion_type="sentiment",
            message=message or result.message,
            expected=expected,
            actual=result.actual,
        )

    def to_have_length(self, *, min: int | None = None, max: int | None = None, message: str = "") -> AssertionResult:
        """Delegate to ``has_length`` with the keyword-only ``min``/``max`` bounds."""
        result = has_length(str(self._value), min=min, max=max)
        return AssertionResult(
            passed=result.passed,
            assertion_type="length",
            message=message or result.message,
            expected=result.expected,
            actual=result.actual,
        )

    def to_not_hallucinate(self, ground_truth: Sequence[str], message: str = "") -> AssertionResult:
        """Delegate to ``has_no_hallucinations`` on the string form of the value."""
        result = has_no_hallucinations(str(self._value), ground_truth)
        return AssertionResult(
            passed=result.passed,
            assertion_type="noHallucinations",
            message=message or result.message,
            expected=result.expected,
            actual=result.actual,
        )

    def to_be_faster_than(self, max_ms: float, message: str = "") -> AssertionResult:
        """Pass when the value is a number no greater than ``max_ms``.

        Non-numeric values fail rather than raise.
        """
        passed = isinstance(self._value, (int, float)) and self._value <= max_ms
        return AssertionResult(
            passed=passed,
            assertion_type="fasterThan",
            # The check is inclusive, so the default message says "<=".
            message=message or f"Expected <= {max_ms}ms",
            expected=max_ms,
            actual=self._value,
        )

    def to_be_truthy(self, message: str = "") -> AssertionResult:
        """Pass when ``bool(value)`` is true."""
        passed = bool(self._value)
        return AssertionResult(
            passed=passed,
            assertion_type="truthy",
            message=message or "Expected truthy value",
        )

    def to_be_falsy(self, message: str = "") -> AssertionResult:
        """Pass when ``bool(value)`` is false."""
        passed = not bool(self._value)
        return AssertionResult(
            passed=passed,
            assertion_type="falsy",
            message=message or "Expected falsy value",
        )

    def to_be_greater_than(self, expected: float, message: str = "") -> AssertionResult:
        """Pass when the value is a number strictly greater than ``expected``."""
        passed = isinstance(self._value, (int, float)) and self._value > expected
        return AssertionResult(
            passed=passed,
            assertion_type="greaterThan",
            message=message or f"Expected > {expected}",
            expected=expected,
            actual=self._value,
        )

    def to_be_less_than(self, expected: float, message: str = "") -> AssertionResult:
        """Pass when the value is a number strictly less than ``expected``."""
        passed = isinstance(self._value, (int, float)) and self._value < expected
        return AssertionResult(
            passed=passed,
            assertion_type="lessThan",
            message=message or f"Expected < {expected}",
            expected=expected,
            actual=self._value,
        )

    def to_be_between(self, min_val: float, max_val: float, message: str = "") -> AssertionResult:
        """Delegate numeric values to ``within_range``; other values fail."""
        if isinstance(self._value, (int, float)):
            result = within_range(self._value, min_val, max_val)
        else:
            result = AssertionResult(
                passed=False,
                assertion_type="between",
                message="Value is not numeric",
                expected={"min": min_val, "max": max_val},
                actual=self._value,
            )
        # Re-wrap so the reported assertion type is always "between".
        return AssertionResult(
            passed=result.passed,
            assertion_type="between",
            message=message or result.message,
            expected=result.expected,
            actual=result.actual,
        )

    def to_contain_code(self, message: str = "") -> AssertionResult:
        """Pass when the text has a Markdown fence or a common code keyword."""
        passed = bool(re.search(r"```|def |function |class |const |import ", str(self._value)))
        return AssertionResult(
            passed=passed,
            assertion_type="containsCode",
            message=message or "Expected to contain code",
        )

    def to_have_no_profanity(self, message: str = "") -> AssertionResult:
        """Delegate to ``has_no_toxicity`` on the string form of the value."""
        result = has_no_toxicity(str(self._value))
        return AssertionResult(
            passed=result.passed,
            assertion_type="noProfanity",
            message=message or result.message,
            actual=result.actual,
        )

    def to_be_professional(self, message: str = "") -> AssertionResult:
        """Pass when ``has_no_toxicity`` passes for the text (same heuristic)."""
        result = has_no_toxicity(str(self._value))
        return AssertionResult(
            passed=result.passed,
            assertion_type="professional",
            message=message or "Expected professional tone",
        )

    def to_have_proper_grammar(self, message: str = "") -> AssertionResult:
        """Pass when the trimmed text starts uppercase and ends with ``.``, ``!`` or ``?``."""
        text = str(self._value).strip()
        passed = len(text) > 0 and text[0].isupper() and text[-1] in ".!?"
        return AssertionResult(
            passed=passed,
            assertion_type="properGrammar",
            message=message or "Expected proper grammar",
        )
1317
+
1318
+
1319
def expect(value: Any) -> Expectation:
    """Start a fluent assertion chain over ``value``.

    Usage::

        result = expect("Hello world").to_contain("Hello")
        assert result.passed
    """
    chain = Expectation(value)
    return chain
1328
+
1329
+
1330
def run_assertions(
    assertions: Sequence[Callable[[], Any]],
) -> list[AssertionResult]:
    """Run multiple assertions and collect results.

    Each assertion is a zero-argument callable whose return value is passed
    through ``_normalize_assertion_result``. Exceptions are caught and turned
    into failing results so one broken assertion cannot abort the batch.

    Usage::

        results = run_assertions([
            lambda: expect(output).to_contain("help"),
            lambda: expect(output).to_have_sentiment("positive"),
            lambda: expect(output).to_have_length(min=10),
        ])
        all_passed = all(r.passed for r in results)

    Returns:
        One ``AssertionResult`` per callable, in input order.
    """
    results: list[AssertionResult] = []
    for assertion in assertions:
        try:
            assertion_type = getattr(assertion, "__name__", "unknown")
            results.append(_normalize_assertion_result(assertion(), assertion_type))
        except Exception as exc:  # deliberate boundary: report, don't propagate
            results.append(
                AssertionResult(
                    passed=False,
                    assertion_type="unknown",
                    # Some exceptions stringify to "" — fall back to the class
                    # name so the failure is never reported with a blank message.
                    message=str(exc) or type(exc).__name__,
                    expected=None,
                    actual=None,
                )
            )
    return results