contextforge-eval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. context_forge/__init__.py +95 -0
  2. context_forge/core/__init__.py +55 -0
  3. context_forge/core/trace.py +369 -0
  4. context_forge/core/types.py +121 -0
  5. context_forge/evaluation.py +267 -0
  6. context_forge/exceptions.py +56 -0
  7. context_forge/graders/__init__.py +44 -0
  8. context_forge/graders/base.py +264 -0
  9. context_forge/graders/deterministic/__init__.py +11 -0
  10. context_forge/graders/deterministic/memory_corruption.py +130 -0
  11. context_forge/graders/hybrid.py +190 -0
  12. context_forge/graders/judges/__init__.py +11 -0
  13. context_forge/graders/judges/backends/__init__.py +9 -0
  14. context_forge/graders/judges/backends/ollama.py +173 -0
  15. context_forge/graders/judges/base.py +158 -0
  16. context_forge/graders/judges/memory_hygiene_judge.py +332 -0
  17. context_forge/graders/judges/models.py +113 -0
  18. context_forge/harness/__init__.py +43 -0
  19. context_forge/harness/user_simulator/__init__.py +70 -0
  20. context_forge/harness/user_simulator/adapters/__init__.py +13 -0
  21. context_forge/harness/user_simulator/adapters/base.py +67 -0
  22. context_forge/harness/user_simulator/adapters/crewai.py +100 -0
  23. context_forge/harness/user_simulator/adapters/langgraph.py +157 -0
  24. context_forge/harness/user_simulator/adapters/pydanticai.py +105 -0
  25. context_forge/harness/user_simulator/llm/__init__.py +5 -0
  26. context_forge/harness/user_simulator/llm/ollama.py +119 -0
  27. context_forge/harness/user_simulator/models.py +103 -0
  28. context_forge/harness/user_simulator/persona.py +154 -0
  29. context_forge/harness/user_simulator/runner.py +342 -0
  30. context_forge/harness/user_simulator/scenario.py +95 -0
  31. context_forge/harness/user_simulator/simulator.py +307 -0
  32. context_forge/instrumentation/__init__.py +23 -0
  33. context_forge/instrumentation/base.py +307 -0
  34. context_forge/instrumentation/instrumentors/__init__.py +17 -0
  35. context_forge/instrumentation/instrumentors/langchain.py +671 -0
  36. context_forge/instrumentation/instrumentors/langgraph.py +534 -0
  37. context_forge/instrumentation/tracer.py +588 -0
  38. context_forge/py.typed +0 -0
  39. contextforge_eval-0.1.0.dist-info/METADATA +420 -0
  40. contextforge_eval-0.1.0.dist-info/RECORD +43 -0
  41. contextforge_eval-0.1.0.dist-info/WHEEL +5 -0
  42. contextforge_eval-0.1.0.dist-info/licenses/LICENSE +201 -0
  43. contextforge_eval-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,588 @@
1
+ """Explicit Tracer API for ContextForge.
2
+
3
+ This module implements:
4
+ - T080-T092: Tracer class with all step recording methods
5
+
6
+ Provides full manual control over trace capture for custom agents
7
+ that don't use standard frameworks.
8
+
9
+ Usage:
10
+ with Tracer.run(agent_info={"name": "my-agent"}) as t:
11
+ t.user_input(content="Hello")
12
+ t.llm_call(model="gpt-4", input="Hello", output="Hi!")
13
+ t.final_output(content="Hi!")
14
+
15
+ trace = t.get_trace()
16
+ t.save("./traces/my-trace.json")
17
+ """
18
+
19
+ import json
20
+ import uuid
21
+ from contextlib import asynccontextmanager, contextmanager
22
+ from datetime import datetime, timezone
23
+ from pathlib import Path
24
+ from typing import Any, Optional
25
+
26
+ from context_forge.core.trace import (
27
+ FinalOutputStep,
28
+ InterruptStep,
29
+ LLMCallStep,
30
+ MemoryReadStep,
31
+ MemoryWriteStep,
32
+ RetrievalStep,
33
+ StateChangeStep,
34
+ ToolCallStep,
35
+ TraceRun,
36
+ UserInputStep,
37
+ )
38
+ from context_forge.core.types import AgentInfo, ResourceImpact, RetrievalResult, TaskInfo
39
+ from context_forge.exceptions import TracerNotActiveError
40
+
41
+
42
class Tracer:
    """Manual trace recorder for custom agents.

    Provides an explicit API for recording trace steps one call at a time.
    Use ``Tracer.run()`` / ``Tracer.run_async()`` as a context manager so the
    tracer is marked active on entry and the trace is closed (``ended_at``
    stamped, tracer deactivated) on exit, even when the body raises.

    Every step-recording method returns the generated step ID
    (``step-0001``, ``step-0002``, ...), which can be passed to ``nested()``
    or to another call's ``parent_step_id`` to build a step hierarchy.

    Attributes:
        trace: The TraceRun being recorded
    """

    def __init__(
        self,
        agent_info: AgentInfo | dict[str, Any],
        task_info: Optional[TaskInfo | dict[str, Any]] = None,
        run_id: Optional[str] = None,
    ):
        """Initialize tracer.

        The tracer starts *inactive*; recording methods raise
        ``TracerNotActiveError`` until it is activated by ``run()`` or
        ``run_async()``.

        Args:
            agent_info: Agent metadata (AgentInfo or dict)
            task_info: Optional task metadata (TaskInfo or dict)
            run_id: Optional custom run ID (a random UUID4 string is
                generated if not provided or empty)
        """
        # Plain dicts are accepted for convenience and expanded as keyword
        # arguments into the corresponding model.
        if isinstance(agent_info, dict):
            agent_info = AgentInfo(**agent_info)
        if isinstance(task_info, dict):
            task_info = TaskInfo(**task_info)

        self._trace = TraceRun(
            run_id=run_id or str(uuid.uuid4()),
            started_at=datetime.now(timezone.utc),
            agent_info=agent_info,
            task_info=task_info,
        )
        self._is_active = False
        self._step_counter = 0
        # Default parent applied to steps recorded inside ``nested()``.
        self._current_parent_id: Optional[str] = None

    @property
    def trace(self) -> TraceRun:
        """Get the current trace."""
        return self._trace

    @property
    def is_active(self) -> bool:
        """Whether the tracer is currently active."""
        return self._is_active

    # Internal helpers

    def _generate_step_id(self) -> str:
        """Increment the per-tracer counter and return the next step ID.

        IDs are zero-padded to at least four digits: ``step-0001``.
        """
        self._step_counter += 1
        return f"step-{self._step_counter:04d}"

    def _ensure_active(self) -> None:
        """Ensure tracer is active.

        Raises:
            TracerNotActiveError: If the tracer has not been activated.
        """
        if not self._is_active:
            raise TracerNotActiveError("Tracer is not active. Use Tracer.run() context manager.")

    def _finish(self) -> None:
        """Stamp the trace end time (UTC) and deactivate the tracer."""
        self._trace.ended_at = datetime.now(timezone.utc)
        self._is_active = False

    def _record(
        self,
        step_cls: type,
        parent_step_id: Optional[str],
        metadata: Optional[dict[str, Any]],
        **fields: Any,
    ) -> str:
        """Build a step of ``step_cls``, append it to the trace, return its ID.

        Shared tail of every public recording method.

        Args:
            step_cls: Step class to instantiate
            parent_step_id: Explicit parent; when falsy, the parent set by
                an enclosing ``nested()`` block (if any) is used instead
            metadata: Optional additional metadata
            **fields: Step-specific keyword arguments forwarded to
                ``step_cls``

        Returns:
            The step ID of the created step

        Raises:
            TracerNotActiveError: If the tracer is not active.
        """
        self._ensure_active()
        step_id = self._generate_step_id()
        step = step_cls(
            step_id=step_id,
            timestamp=datetime.now(timezone.utc),
            # ``or`` keeps the original semantics: None *and* "" both fall
            # back to the parent established by ``nested()``.
            parent_step_id=parent_step_id or self._current_parent_id,
            metadata=metadata,
            **fields,
        )
        self._trace.add_step(step)
        return step_id

    # Lifecycle

    @classmethod
    @contextmanager
    def run(
        cls,
        agent_info: AgentInfo | dict[str, Any],
        task_info: Optional[TaskInfo | dict[str, Any]] = None,
        run_id: Optional[str] = None,
    ):
        """Create and run a tracer as a context manager.

        Args:
            agent_info: Agent metadata
            task_info: Optional task metadata
            run_id: Optional custom run ID

        Yields:
            Active Tracer instance

        Example:
            with Tracer.run(agent_info={"name": "my-agent"}) as t:
                t.llm_call(...)
        """
        tracer = cls(agent_info=agent_info, task_info=task_info, run_id=run_id)
        tracer._is_active = True
        try:
            yield tracer
        finally:
            # Runs on normal exit and on exceptions alike.
            tracer._finish()

    @classmethod
    @asynccontextmanager
    async def run_async(
        cls,
        agent_info: AgentInfo | dict[str, Any],
        task_info: Optional[TaskInfo | dict[str, Any]] = None,
        run_id: Optional[str] = None,
    ):
        """Create and run a tracer as an async context manager.

        The recording methods themselves are synchronous; this variant only
        allows the tracer to be used with ``async with``.

        Args:
            agent_info: Agent metadata
            task_info: Optional task metadata
            run_id: Optional custom run ID

        Yields:
            Active Tracer instance

        Example:
            async with Tracer.run_async(agent_info={"name": "my-agent"}) as t:
                await some_async_operation()
                t.llm_call(...)
        """
        tracer = cls(agent_info=agent_info, task_info=task_info, run_id=run_id)
        tracer._is_active = True
        try:
            yield tracer
        finally:
            # Runs on normal exit and on exceptions alike.
            tracer._finish()

    # Step Recording Methods

    def user_input(
        self,
        content: str,
        input_type: Optional[str] = None,
        parent_step_id: Optional[str] = None,
        metadata: Optional[dict[str, Any]] = None,
    ) -> str:
        """Record a user input step.

        Args:
            content: The user's input content
            input_type: Type of input (text, file, voice, etc.)
            parent_step_id: Optional parent step for nesting
            metadata: Optional additional metadata

        Returns:
            The step ID of the created step

        Raises:
            TracerNotActiveError: If the tracer is not active.
        """
        return self._record(
            UserInputStep,
            parent_step_id,
            metadata,
            content=content,
            input_type=input_type,
        )

    def llm_call(
        self,
        model: str,
        input: str | list[dict[str, Any]],
        output: str | dict[str, Any],
        tokens_in: Optional[int] = None,
        tokens_out: Optional[int] = None,
        tokens_total: Optional[int] = None,
        latency_ms: Optional[int] = None,
        cost_estimate: Optional[float] = None,
        provider: Optional[str] = None,
        parent_step_id: Optional[str] = None,
        metadata: Optional[dict[str, Any]] = None,
    ) -> str:
        """Record an LLM call step.

        Args:
            model: Model identifier (e.g., 'gpt-4', 'claude-3')
            input: Prompt text or list of messages
            output: Model response
            tokens_in: Input token count
            tokens_out: Output token count
            tokens_total: Total token count
            latency_ms: Response latency in milliseconds
            cost_estimate: Estimated cost in USD
            provider: LLM provider name
            parent_step_id: Optional parent step for nesting
            metadata: Optional additional metadata

        Returns:
            The step ID of the created step

        Raises:
            TracerNotActiveError: If the tracer is not active.
        """
        return self._record(
            LLMCallStep,
            parent_step_id,
            metadata,
            model=model,
            input=input,
            output=output,
            tokens_in=tokens_in,
            tokens_out=tokens_out,
            tokens_total=tokens_total,
            latency_ms=latency_ms,
            cost_estimate=cost_estimate,
            provider=provider,
        )

    def tool_call(
        self,
        tool_name: str,
        arguments: dict[str, Any],
        result: Any,
        latency_ms: Optional[int] = None,
        success: Optional[bool] = None,
        error: Optional[str] = None,
        resource_impact: Optional[ResourceImpact | dict[str, Any]] = None,
        parent_step_id: Optional[str] = None,
        metadata: Optional[dict[str, Any]] = None,
    ) -> str:
        """Record a tool call step.

        Args:
            tool_name: Tool identifier
            arguments: Arguments passed to the tool
            result: Tool execution result
            latency_ms: Execution time in milliseconds
            success: Whether the tool call succeeded
            error: Error message if failed
            resource_impact: Cost/credit impact (ResourceImpact or dict)
            parent_step_id: Optional parent step for nesting
            metadata: Optional additional metadata

        Returns:
            The step ID of the created step

        Raises:
            TracerNotActiveError: If the tracer is not active.
        """
        # Check first so an inactive tracer is reported before any
        # conversion error; convert before a step ID is consumed so a bad
        # dict does not leave a gap in the ID sequence.
        self._ensure_active()
        if isinstance(resource_impact, dict):
            resource_impact = ResourceImpact(**resource_impact)

        return self._record(
            ToolCallStep,
            parent_step_id,
            metadata,
            tool_name=tool_name,
            arguments=arguments,
            result=result,
            latency_ms=latency_ms,
            success=success,
            error=error,
            resource_impact=resource_impact,
        )

    def retrieval(
        self,
        query: str,
        results: list[RetrievalResult | dict[str, Any]],
        match_count: Optional[int] = None,
        latency_ms: Optional[int] = None,
        parent_step_id: Optional[str] = None,
        metadata: Optional[dict[str, Any]] = None,
    ) -> str:
        """Record a retrieval step.

        Args:
            query: The search/retrieval query
            results: List of retrieved documents (RetrievalResult or dict)
            match_count: Number of matches (defaults to len(results))
            latency_ms: Query execution time
            parent_step_id: Optional parent step for nesting
            metadata: Optional additional metadata

        Returns:
            The step ID of the created step

        Raises:
            TracerNotActiveError: If the tracer is not active.
        """
        self._ensure_active()
        # Dict entries are expanded into RetrievalResult; anything else is
        # passed through untouched.
        converted_results = [
            RetrievalResult(**item) if isinstance(item, dict) else item
            for item in results
        ]
        if match_count is None:
            match_count = len(converted_results)

        return self._record(
            RetrievalStep,
            parent_step_id,
            metadata,
            query=query,
            results=converted_results,
            match_count=match_count,
            latency_ms=latency_ms,
        )

    def memory_read(
        self,
        query: str | dict[str, Any],
        results: list[Any],
        match_count: Optional[int] = None,
        relevance_scores: Optional[list[float]] = None,
        total_available: Optional[int] = None,
        parent_step_id: Optional[str] = None,
        metadata: Optional[dict[str, Any]] = None,
    ) -> str:
        """Record a memory read step.

        Args:
            query: Memory query (string or structured)
            results: Items retrieved from memory
            match_count: Number of matches (defaults to len(results))
            relevance_scores: Relevance scores for results
            total_available: Total items available in memory
            parent_step_id: Optional parent step for nesting
            metadata: Optional additional metadata

        Returns:
            The step ID of the created step

        Raises:
            TracerNotActiveError: If the tracer is not active.
        """
        self._ensure_active()
        if match_count is None:
            match_count = len(results)

        return self._record(
            MemoryReadStep,
            parent_step_id,
            metadata,
            query=query,
            results=results,
            match_count=match_count,
            relevance_scores=relevance_scores,
            total_available=total_available,
        )

    def memory_write(
        self,
        entity_type: str,
        operation: str,
        data: dict[str, Any],
        entity_id: Optional[str] = None,
        parent_step_id: Optional[str] = None,
        metadata: Optional[dict[str, Any]] = None,
    ) -> str:
        """Record a memory write step.

        Args:
            entity_type: Type of entity being written
            operation: Operation type ('add', 'update', 'delete')
            data: The data being written
            entity_id: Optional entity identifier
            parent_step_id: Optional parent step for nesting
            metadata: Optional additional metadata

        Returns:
            The step ID of the created step

        Raises:
            TracerNotActiveError: If the tracer is not active.
        """
        return self._record(
            MemoryWriteStep,
            parent_step_id,
            metadata,
            entity_type=entity_type,
            operation=operation,
            data=data,
            entity_id=entity_id,
        )

    def interrupt(
        self,
        prompt: str,
        response: str | dict[str, Any],
        wait_duration_ms: int,
        parent_step_id: Optional[str] = None,
        metadata: Optional[dict[str, Any]] = None,
    ) -> str:
        """Record an interrupt (human-in-the-loop) step.

        Args:
            prompt: The prompt shown to the user
            response: The user's response
            wait_duration_ms: How long the agent waited
            parent_step_id: Optional parent step for nesting
            metadata: Optional additional metadata

        Returns:
            The step ID of the created step

        Raises:
            TracerNotActiveError: If the tracer is not active.
        """
        return self._record(
            InterruptStep,
            parent_step_id,
            metadata,
            prompt=prompt,
            response=response,
            wait_duration_ms=wait_duration_ms,
        )

    def state_change(
        self,
        state_key: str,
        new_value: Any,
        old_value: Optional[Any] = None,
        reason: Optional[str] = None,
        parent_step_id: Optional[str] = None,
        metadata: Optional[dict[str, Any]] = None,
    ) -> str:
        """Record a state change step.

        Args:
            state_key: The state field that changed
            new_value: The new value
            old_value: The previous value (if known)
            reason: Reason for the change
            parent_step_id: Optional parent step for nesting
            metadata: Optional additional metadata

        Returns:
            The step ID of the created step

        Raises:
            TracerNotActiveError: If the tracer is not active.
        """
        return self._record(
            StateChangeStep,
            parent_step_id,
            metadata,
            state_key=state_key,
            old_value=old_value,
            new_value=new_value,
            reason=reason,
        )

    def final_output(
        self,
        content: Any,
        format: Optional[str] = None,
        parent_step_id: Optional[str] = None,
        metadata: Optional[dict[str, Any]] = None,
    ) -> str:
        """Record a final output step.

        Args:
            content: The final output content
            format: Output format (text, json, markdown, etc.)
            parent_step_id: Optional parent step for nesting
            metadata: Optional additional metadata

        Returns:
            The step ID of the created step

        Raises:
            TracerNotActiveError: If the tracer is not active.
        """
        return self._record(
            FinalOutputStep,
            parent_step_id,
            metadata,
            content=content,
            format=format,
        )

    # Utility Methods

    def get_trace(self) -> TraceRun:
        """Get the recorded trace.

        Returns:
            The TraceRun object with all recorded steps
        """
        return self._trace

    def to_json(self, indent: Optional[int] = None) -> str:
        """Serialize the trace to JSON.

        Args:
            indent: Indentation level for pretty printing

        Returns:
            JSON string representation of the trace
        """
        return self._trace.to_json(indent=indent)

    def save(self, path: str | Path) -> Path:
        """Save the trace to a file as 2-space-indented JSON.

        Missing parent directories are created. The file is always written
        as UTF-8, independent of the platform's default encoding, so traces
        containing non-ASCII text round-trip on every OS.

        Args:
            path: File path to save to

        Returns:
            Path to the saved file
        """
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(self.to_json(indent=2), encoding="utf-8")
        return path

    @contextmanager
    def nested(self, parent_step_id: str):
        """Context manager for nested steps.

        Steps recorded within this context that do not pass their own
        ``parent_step_id`` get the specified value as their parent. The
        previous default parent is restored on exit, so ``nested()`` blocks
        can themselves be nested.

        Args:
            parent_step_id: The parent step ID for nested steps

        Example:
            llm_id = t.llm_call(...)
            with t.nested(llm_id):
                t.tool_call(...)  # Will have parent_step_id = llm_id
        """
        previous_parent = self._current_parent_id
        self._current_parent_id = parent_step_id
        try:
            yield
        finally:
            self._current_parent_id = previous_parent
context_forge/py.typed ADDED
File without changes