contextforge-eval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. context_forge/__init__.py +95 -0
  2. context_forge/core/__init__.py +55 -0
  3. context_forge/core/trace.py +369 -0
  4. context_forge/core/types.py +121 -0
  5. context_forge/evaluation.py +267 -0
  6. context_forge/exceptions.py +56 -0
  7. context_forge/graders/__init__.py +44 -0
  8. context_forge/graders/base.py +264 -0
  9. context_forge/graders/deterministic/__init__.py +11 -0
  10. context_forge/graders/deterministic/memory_corruption.py +130 -0
  11. context_forge/graders/hybrid.py +190 -0
  12. context_forge/graders/judges/__init__.py +11 -0
  13. context_forge/graders/judges/backends/__init__.py +9 -0
  14. context_forge/graders/judges/backends/ollama.py +173 -0
  15. context_forge/graders/judges/base.py +158 -0
  16. context_forge/graders/judges/memory_hygiene_judge.py +332 -0
  17. context_forge/graders/judges/models.py +113 -0
  18. context_forge/harness/__init__.py +43 -0
  19. context_forge/harness/user_simulator/__init__.py +70 -0
  20. context_forge/harness/user_simulator/adapters/__init__.py +13 -0
  21. context_forge/harness/user_simulator/adapters/base.py +67 -0
  22. context_forge/harness/user_simulator/adapters/crewai.py +100 -0
  23. context_forge/harness/user_simulator/adapters/langgraph.py +157 -0
  24. context_forge/harness/user_simulator/adapters/pydanticai.py +105 -0
  25. context_forge/harness/user_simulator/llm/__init__.py +5 -0
  26. context_forge/harness/user_simulator/llm/ollama.py +119 -0
  27. context_forge/harness/user_simulator/models.py +103 -0
  28. context_forge/harness/user_simulator/persona.py +154 -0
  29. context_forge/harness/user_simulator/runner.py +342 -0
  30. context_forge/harness/user_simulator/scenario.py +95 -0
  31. context_forge/harness/user_simulator/simulator.py +307 -0
  32. context_forge/instrumentation/__init__.py +23 -0
  33. context_forge/instrumentation/base.py +307 -0
  34. context_forge/instrumentation/instrumentors/__init__.py +17 -0
  35. context_forge/instrumentation/instrumentors/langchain.py +671 -0
  36. context_forge/instrumentation/instrumentors/langgraph.py +534 -0
  37. context_forge/instrumentation/tracer.py +588 -0
  38. context_forge/py.typed +0 -0
  39. contextforge_eval-0.1.0.dist-info/METADATA +420 -0
  40. contextforge_eval-0.1.0.dist-info/RECORD +43 -0
  41. contextforge_eval-0.1.0.dist-info/WHEEL +5 -0
  42. contextforge_eval-0.1.0.dist-info/licenses/LICENSE +201 -0
  43. contextforge_eval-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,534 @@
1
+ """LangGraph instrumentor for ContextForge.
2
+
3
+ Provides unified instrumentation for LangGraph agents, capturing:
4
+ - LLM calls (via LangChain callbacks)
5
+ - Tool calls (via LangChain callbacks)
6
+ - Memory operations (via BaseStore patching)
7
+
8
+ All events are captured in a single unified trace.
9
+ """
10
+
11
+ import functools
12
+ import logging
13
+ import time
14
+ from datetime import datetime, timezone
15
+ from pathlib import Path
16
+ from typing import Any, Callable, Optional
17
+ import uuid
18
+
19
+ from context_forge.core.trace import MemoryReadStep, MemoryWriteStep
20
+ from context_forge.core.types import FieldChange
21
+ from context_forge.instrumentation.base import RedactionConfig
22
+ from context_forge.instrumentation.instrumentors.langchain import LangChainInstrumentor
23
+
24
+
25
def compute_field_changes(
    old_value: dict[str, Any] | None,
    new_value: dict[str, Any],
    prefix: str = "$",
) -> list[FieldChange]:
    """Diff two dictionaries into a flat list of JSON-path field changes.

    Model-agnostic: works with any (possibly nested) dict structure.
    Nested dictionaries are compared recursively; every leaf whose value
    differs between *old_value* and *new_value* yields one FieldChange
    whose ``path`` is a dotted JSON path rooted at *prefix*.

    Args:
        old_value: Previous value, or None for a brand-new record.
        new_value: New value being written.
        prefix: JSON path prefix (default "$" for the document root).

    Returns:
        One FieldChange per changed field.
    """
    previous = old_value or {}
    diffs: list[FieldChange] = []

    # Union of both key sets so additions, removals, and edits all appear.
    for field in previous.keys() | new_value.keys():
        before = previous.get(field)
        after = new_value.get(field)
        child_path = f"{prefix}.{field}"

        if isinstance(before, dict) and isinstance(after, dict):
            # Both sides are mappings: descend and collect leaf-level diffs.
            diffs.extend(compute_field_changes(before, after, child_path))
        elif before != after:
            diffs.append(
                FieldChange(
                    path=child_path,
                    old_value=before,
                    new_value=after,
                )
            )

    return diffs
67
+
68
# Module-level logger; all tracing failures are reported here at DEBUG.
logger = logging.getLogger(__name__)

# Track whether LangGraph is available. BaseStore patching (memory-op
# tracing) is silently skipped when the import fails; LLM/tool tracing
# from the LangChain base class still works without it.
try:
    from langgraph.store.base import BaseStore

    _LANGGRAPH_AVAILABLE = True
except ImportError:
    BaseStore = None  # type: ignore
    _LANGGRAPH_AVAILABLE = False
78
+
79
+
80
+ class LangGraphInstrumentor(LangChainInstrumentor):
81
+ """Combined instrumentor for LangGraph agents.
82
+
83
+ Captures LLM calls, tool calls, AND memory operations in a single
84
+ unified trace. Extends LangChainInstrumentor with BaseStore patching.
85
+
86
+ Usage:
87
+ # One-liner instrumentation
88
+ instrumentor = LangGraphInstrumentor().instrument()
89
+ handler = instrumentor.get_callback_handler()
90
+
91
+ # Run your LangGraph agent
92
+ result = graph.invoke(input, config={"callbacks": [handler]})
93
+
94
+ # Get unified trace with everything
95
+ traces = instrumentor.get_traces()
96
+
97
+ Or with context manager:
98
+ with LangGraphInstrumentor(output_path="./traces") as inst:
99
+ handler = inst.get_callback_handler()
100
+ result = graph.invoke(input, config={"callbacks": [handler]})
101
+ # Trace saved with LLM calls + tool calls + memory ops
102
+ """
103
+
104
    def __init__(
        self,
        agent_name: str = "langgraph_agent",
        agent_version: Optional[str] = None,
        output_path: Optional[str | Path] = None,
        redaction_config: Optional[RedactionConfig] = None,
    ):
        """Initialize the instrumentor.

        Args:
            agent_name: Logical agent name recorded on emitted traces.
            agent_version: Optional agent version string.
            output_path: Directory traces are written to, if any.
            redaction_config: Optional redaction rules applied to captured data.
        """
        super().__init__(
            agent_name=agent_name,
            agent_version=agent_version,
            output_path=output_path,
            redaction_config=redaction_config,
        )
        # Store original methods for restoration; populated by
        # _patch_store_methods() and drained by _unpatch_store_methods().
        self._original_store_methods: dict[str, Callable] = {}
        # Lazily-resolved langgraph package version (see framework_version).
        self._langgraph_version: Optional[str] = None
120
+
121
    @property
    def framework(self) -> str:
        """Framework identifier recorded on emitted traces."""
        return "langgraph"
124
+
125
+ @property
126
+ def framework_version(self) -> Optional[str]:
127
+ if self._langgraph_version is None:
128
+ try:
129
+ from importlib.metadata import version
130
+
131
+ self._langgraph_version = version("langgraph")
132
+ except Exception:
133
+ pass
134
+ return self._langgraph_version
135
+
136
+ def _install_hooks(self) -> None:
137
+ """Install both LangChain callbacks and BaseStore patches."""
138
+ # Install LangChain callback hooks (LLM, tools, retriever)
139
+ super()._install_hooks()
140
+
141
+ # Install BaseStore patches (memory operations)
142
+ if _LANGGRAPH_AVAILABLE:
143
+ self._patch_store_methods()
144
+ logger.debug("LangGraph store methods patched")
145
+ else:
146
+ logger.warning(
147
+ "LangGraph not available, memory operations will not be traced. "
148
+ "Install with: pip install langgraph"
149
+ )
150
+
151
+ def _remove_hooks(self) -> None:
152
+ """Remove both LangChain callbacks and BaseStore patches."""
153
+ # Remove BaseStore patches first
154
+ if _LANGGRAPH_AVAILABLE:
155
+ self._unpatch_store_methods()
156
+ logger.debug("LangGraph store methods unpatched")
157
+
158
+ # Remove LangChain callback hooks
159
+ super()._remove_hooks()
160
+
161
    def _patch_store_methods(self) -> None:
        """Patch BaseStore methods to capture memory operations.

        Monkey-patches ``get``/``put``/``delete``/``search`` and their async
        counterparts on the BaseStore *class*, so every store instance in the
        process is traced while instrumentation is active.

        The originals are kept in ``self._original_store_methods`` for two
        reasons: restoration in ``_unpatch_store_methods``, and so the
        wrappers can call through without re-entering the tracing layer
        (e.g. the pre-write ``get`` inside ``traced_put`` must not record a
        memory read of its own).

        NOTE(review): because the patch is class-level and process-global,
        two concurrently active instrumentors would clobber each other's
        patches — confirm single-instrumentor usage.
        """
        if not _LANGGRAPH_AVAILABLE or BaseStore is None:
            return

        # Store originals (also the call-through targets for the wrappers).
        self._original_store_methods = {
            "get": BaseStore.get,
            "put": BaseStore.put,
            "delete": BaseStore.delete,
            "search": BaseStore.search,
            "aget": BaseStore.aget,
            "aput": BaseStore.aput,
            "adelete": BaseStore.adelete,
            "asearch": BaseStore.asearch,
        }

        # Create patched methods
        instrumentor = self  # Capture for closures

        # --- get: record a single-key read with latency ---
        @functools.wraps(self._original_store_methods["get"])
        def traced_get(
            store_self,
            namespace: tuple[str, ...],
            key: str,
            *,
            refresh_ttl: bool | None = None,
        ):
            start_time = time.perf_counter()
            result = instrumentor._original_store_methods["get"](
                store_self, namespace, key, refresh_ttl=refresh_ttl
            )
            latency_ms = int((time.perf_counter() - start_time) * 1000)

            instrumentor._record_memory_read(
                query={"namespace": namespace, "key": key},
                # A miss (falsy result) is recorded as an empty result list.
                results=[result.value] if result else [],
                latency_ms=latency_ms,
            )
            return result

        # --- put: read old value first so a field-level diff can be made ---
        @functools.wraps(self._original_store_methods["put"])
        def traced_put(
            store_self,
            namespace: tuple[str, ...],
            key: str,
            value: dict[str, Any],
            index=None,
            *,
            ttl=None,
        ):
            # Get current value BEFORE write (for diff computation)
            old_item = None
            try:
                old_item = instrumentor._original_store_methods["get"](
                    store_self, namespace, key
                )
            except Exception:
                pass  # No existing value

            old_value = old_item.value if old_item else None

            start_time = time.perf_counter()
            # Handle the NOT_PROVIDED sentinel: only forward ttl when the
            # caller supplied one, so the backend default stays in effect.
            kwargs = {}
            if ttl is not None:
                kwargs["ttl"] = ttl
            result = instrumentor._original_store_methods["put"](
                store_self, namespace, key, value, index, **kwargs
            )
            latency_ms = int((time.perf_counter() - start_time) * 1000)

            # Compute field-level changes
            changes = compute_field_changes(old_value, value)

            instrumentor._record_memory_write(
                namespace=list(namespace),
                key=key,
                operation="add",
                data=value,
                changes=changes,
                latency_ms=latency_ms,
            )
            return result

        # --- delete: diff against {} so every field shows as removed ---
        @functools.wraps(self._original_store_methods["delete"])
        def traced_delete(store_self, namespace: tuple[str, ...], key: str):
            # Get current value BEFORE delete (for diff computation)
            old_item = None
            try:
                old_item = instrumentor._original_store_methods["get"](
                    store_self, namespace, key
                )
            except Exception:
                pass

            old_value = old_item.value if old_item else None

            start_time = time.perf_counter()
            result = instrumentor._original_store_methods["delete"](
                store_self, namespace, key
            )
            latency_ms = int((time.perf_counter() - start_time) * 1000)

            # Compute changes (all fields become None)
            changes = []
            if old_value:
                changes = compute_field_changes(old_value, {})

            instrumentor._record_memory_write(
                namespace=list(namespace),
                key=key,
                operation="delete",
                data={},
                changes=changes,
                latency_ms=latency_ms,
            )
            return result

        # --- search: record query parameters and all matched values ---
        @functools.wraps(self._original_store_methods["search"])
        def traced_search(
            store_self,
            namespace_prefix: tuple[str, ...],
            /,
            *,
            query: str | None = None,
            filter: dict[str, Any] | None = None,
            limit: int = 10,
            offset: int = 0,
            refresh_ttl: bool | None = None,
        ):
            start_time = time.perf_counter()
            results = instrumentor._original_store_methods["search"](
                store_self,
                namespace_prefix,
                query=query,
                filter=filter,
                limit=limit,
                offset=offset,
                refresh_ttl=refresh_ttl,
            )
            latency_ms = int((time.perf_counter() - start_time) * 1000)

            instrumentor._record_memory_read(
                query={
                    "namespace_prefix": namespace_prefix,
                    "query": query,
                    "filter": filter,
                },
                results=[r.value for r in results],
                latency_ms=latency_ms,
            )
            return results

        # Async versions — same behavior as the sync wrappers above.
        @functools.wraps(self._original_store_methods["aget"])
        async def traced_aget(
            store_self,
            namespace: tuple[str, ...],
            key: str,
            *,
            refresh_ttl: bool | None = None,
        ):
            start_time = time.perf_counter()
            result = await instrumentor._original_store_methods["aget"](
                store_self, namespace, key, refresh_ttl=refresh_ttl
            )
            latency_ms = int((time.perf_counter() - start_time) * 1000)

            instrumentor._record_memory_read(
                query={"namespace": namespace, "key": key},
                results=[result.value] if result else [],
                latency_ms=latency_ms,
            )
            return result

        @functools.wraps(self._original_store_methods["aput"])
        async def traced_aput(
            store_self,
            namespace: tuple[str, ...],
            key: str,
            value: dict[str, Any],
            index=None,
            *,
            ttl=None,
        ):
            # Get current value BEFORE write (for diff computation)
            old_item = None
            try:
                old_item = await instrumentor._original_store_methods["aget"](
                    store_self, namespace, key
                )
            except Exception:
                pass  # No existing value

            old_value = old_item.value if old_item else None

            start_time = time.perf_counter()
            # Only forward ttl when supplied (NOT_PROVIDED sentinel handling).
            kwargs = {}
            if ttl is not None:
                kwargs["ttl"] = ttl
            result = await instrumentor._original_store_methods["aput"](
                store_self, namespace, key, value, index, **kwargs
            )
            latency_ms = int((time.perf_counter() - start_time) * 1000)

            # Compute field-level changes
            changes = compute_field_changes(old_value, value)

            instrumentor._record_memory_write(
                namespace=list(namespace),
                key=key,
                operation="add",
                data=value,
                changes=changes,
                latency_ms=latency_ms,
            )
            return result

        @functools.wraps(self._original_store_methods["adelete"])
        async def traced_adelete(store_self, namespace: tuple[str, ...], key: str):
            # Get current value BEFORE delete (for diff computation)
            old_item = None
            try:
                old_item = await instrumentor._original_store_methods["aget"](
                    store_self, namespace, key
                )
            except Exception:
                pass

            old_value = old_item.value if old_item else None

            start_time = time.perf_counter()
            result = await instrumentor._original_store_methods["adelete"](
                store_self, namespace, key
            )
            latency_ms = int((time.perf_counter() - start_time) * 1000)

            # Compute changes (all fields become None)
            changes = []
            if old_value:
                changes = compute_field_changes(old_value, {})

            instrumentor._record_memory_write(
                namespace=list(namespace),
                key=key,
                operation="delete",
                data={},
                changes=changes,
                latency_ms=latency_ms,
            )
            return result

        @functools.wraps(self._original_store_methods["asearch"])
        async def traced_asearch(
            store_self,
            namespace_prefix: tuple[str, ...],
            /,
            *,
            query: str | None = None,
            filter: dict[str, Any] | None = None,
            limit: int = 10,
            offset: int = 0,
            refresh_ttl: bool | None = None,
        ):
            start_time = time.perf_counter()
            results = await instrumentor._original_store_methods["asearch"](
                store_self,
                namespace_prefix,
                query=query,
                filter=filter,
                limit=limit,
                offset=offset,
                refresh_ttl=refresh_ttl,
            )
            latency_ms = int((time.perf_counter() - start_time) * 1000)

            instrumentor._record_memory_read(
                query={
                    "namespace_prefix": namespace_prefix,
                    "query": query,
                    "filter": filter,
                },
                results=[r.value for r in results],
                latency_ms=latency_ms,
            )
            return results

        # Apply patches (class-level: affects all BaseStore subclasses).
        BaseStore.get = traced_get
        BaseStore.put = traced_put
        BaseStore.delete = traced_delete
        BaseStore.search = traced_search
        BaseStore.aget = traced_aget
        BaseStore.aput = traced_aput
        BaseStore.adelete = traced_adelete
        BaseStore.asearch = traced_asearch
458
+
459
+ def _unpatch_store_methods(self) -> None:
460
+ """Restore original BaseStore methods."""
461
+ if not _LANGGRAPH_AVAILABLE or BaseStore is None:
462
+ return
463
+
464
+ for method_name, original_method in self._original_store_methods.items():
465
+ setattr(BaseStore, method_name, original_method)
466
+
467
+ self._original_store_methods.clear()
468
+
469
+ def _record_memory_read(
470
+ self,
471
+ query: str | dict[str, Any],
472
+ results: list[Any],
473
+ latency_ms: Optional[int] = None,
474
+ ) -> None:
475
+ """Record a memory read operation to the current trace."""
476
+ try:
477
+ trace = self._get_current_trace()
478
+ step = MemoryReadStep(
479
+ step_id=str(uuid.uuid4()),
480
+ timestamp=datetime.now(timezone.utc),
481
+ query=query,
482
+ results=results,
483
+ match_count=len(results),
484
+ latency_ms=latency_ms,
485
+ )
486
+ trace.add_step(step)
487
+ except Exception as e:
488
+ logger.debug(f"Failed to record memory read: {e}")
489
+
490
+ def _record_memory_write(
491
+ self,
492
+ namespace: list[str],
493
+ key: str,
494
+ operation: str,
495
+ data: dict[str, Any],
496
+ changes: list[FieldChange] | None = None,
497
+ latency_ms: Optional[int] = None,
498
+ ) -> None:
499
+ """Record a memory write operation to the current trace.
500
+
501
+ Args:
502
+ namespace: Storage namespace as list (e.g., ["profiles", "user_123"])
503
+ key: Storage key within the namespace
504
+ operation: Operation type ("add", "delete")
505
+ data: The complete data being written
506
+ changes: Field-level changes with JSON paths (model-agnostic)
507
+ latency_ms: Operation latency in milliseconds
508
+ """
509
+ try:
510
+ trace = self._get_current_trace()
511
+
512
+ # Find the most recent tool call to link as trigger
513
+ triggered_by = None
514
+ for step in reversed(trace.steps):
515
+ if step.step_type == "tool_call":
516
+ triggered_by = step.step_id
517
+ break
518
+
519
+ step = MemoryWriteStep(
520
+ step_id=str(uuid.uuid4()),
521
+ timestamp=datetime.now(timezone.utc),
522
+ namespace=namespace,
523
+ key=key,
524
+ operation=operation, # type: ignore[arg-type]
525
+ data=data,
526
+ changes=changes,
527
+ triggered_by_step_id=triggered_by,
528
+ # Legacy field for backward compatibility
529
+ entity_type="/".join(namespace),
530
+ entity_id=key,
531
+ )
532
+ trace.add_step(step)
533
+ except Exception as e:
534
+ logger.debug(f"Failed to record memory write: {e}")