empathy-framework 5.1.0-py3-none-any.whl → 5.2.1-py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
Files changed (73)
  1. {empathy_framework-5.1.0.dist-info → empathy_framework-5.2.1.dist-info}/METADATA +52 -3
  2. {empathy_framework-5.1.0.dist-info → empathy_framework-5.2.1.dist-info}/RECORD +71 -30
  3. empathy_os/__init__.py +1 -1
  4. empathy_os/cli_router.py +21 -0
  5. empathy_os/core_modules/__init__.py +15 -0
  6. empathy_os/mcp/__init__.py +10 -0
  7. empathy_os/mcp/server.py +506 -0
  8. empathy_os/memory/control_panel.py +1 -131
  9. empathy_os/memory/control_panel_support.py +145 -0
  10. empathy_os/memory/encryption.py +159 -0
  11. empathy_os/memory/long_term.py +41 -626
  12. empathy_os/memory/long_term_types.py +99 -0
  13. empathy_os/memory/mixins/__init__.py +25 -0
  14. empathy_os/memory/mixins/backend_init_mixin.py +244 -0
  15. empathy_os/memory/mixins/capabilities_mixin.py +199 -0
  16. empathy_os/memory/mixins/handoff_mixin.py +208 -0
  17. empathy_os/memory/mixins/lifecycle_mixin.py +49 -0
  18. empathy_os/memory/mixins/long_term_mixin.py +352 -0
  19. empathy_os/memory/mixins/promotion_mixin.py +109 -0
  20. empathy_os/memory/mixins/short_term_mixin.py +182 -0
  21. empathy_os/memory/short_term.py +7 -0
  22. empathy_os/memory/simple_storage.py +302 -0
  23. empathy_os/memory/storage_backend.py +167 -0
  24. empathy_os/memory/unified.py +21 -1120
  25. empathy_os/meta_workflows/cli_commands/__init__.py +56 -0
  26. empathy_os/meta_workflows/cli_commands/agent_commands.py +321 -0
  27. empathy_os/meta_workflows/cli_commands/analytics_commands.py +442 -0
  28. empathy_os/meta_workflows/cli_commands/config_commands.py +232 -0
  29. empathy_os/meta_workflows/cli_commands/memory_commands.py +182 -0
  30. empathy_os/meta_workflows/cli_commands/template_commands.py +354 -0
  31. empathy_os/meta_workflows/cli_commands/workflow_commands.py +382 -0
  32. empathy_os/meta_workflows/cli_meta_workflows.py +52 -1802
  33. empathy_os/meta_workflows/intent_detector.py +71 -0
  34. empathy_os/models/telemetry/__init__.py +71 -0
  35. empathy_os/models/telemetry/analytics.py +594 -0
  36. empathy_os/models/telemetry/backend.py +196 -0
  37. empathy_os/models/telemetry/data_models.py +431 -0
  38. empathy_os/models/telemetry/storage.py +489 -0
  39. empathy_os/orchestration/__init__.py +35 -0
  40. empathy_os/orchestration/execution_strategies.py +481 -0
  41. empathy_os/orchestration/meta_orchestrator.py +488 -1
  42. empathy_os/routing/workflow_registry.py +36 -0
  43. empathy_os/telemetry/cli.py +19 -724
  44. empathy_os/telemetry/commands/__init__.py +14 -0
  45. empathy_os/telemetry/commands/dashboard_commands.py +696 -0
  46. empathy_os/tools.py +183 -0
  47. empathy_os/workflows/__init__.py +5 -0
  48. empathy_os/workflows/autonomous_test_gen.py +860 -161
  49. empathy_os/workflows/base.py +6 -2
  50. empathy_os/workflows/code_review.py +4 -1
  51. empathy_os/workflows/document_gen/__init__.py +25 -0
  52. empathy_os/workflows/document_gen/config.py +30 -0
  53. empathy_os/workflows/document_gen/report_formatter.py +162 -0
  54. empathy_os/workflows/document_gen/workflow.py +1426 -0
  55. empathy_os/workflows/document_gen.py +22 -1598
  56. empathy_os/workflows/security_audit.py +2 -2
  57. empathy_os/workflows/security_audit_phase3.py +7 -4
  58. empathy_os/workflows/seo_optimization.py +633 -0
  59. empathy_os/workflows/test_gen/__init__.py +52 -0
  60. empathy_os/workflows/test_gen/ast_analyzer.py +249 -0
  61. empathy_os/workflows/test_gen/config.py +88 -0
  62. empathy_os/workflows/test_gen/data_models.py +38 -0
  63. empathy_os/workflows/test_gen/report_formatter.py +289 -0
  64. empathy_os/workflows/test_gen/test_templates.py +381 -0
  65. empathy_os/workflows/test_gen/workflow.py +655 -0
  66. empathy_os/workflows/test_gen.py +42 -1905
  67. empathy_os/memory/types 2.py +0 -441
  68. empathy_os/models/telemetry.py +0 -1660
  69. {empathy_framework-5.1.0.dist-info → empathy_framework-5.2.1.dist-info}/WHEEL +0 -0
  70. {empathy_framework-5.1.0.dist-info → empathy_framework-5.2.1.dist-info}/entry_points.txt +0 -0
  71. {empathy_framework-5.1.0.dist-info → empathy_framework-5.2.1.dist-info}/licenses/LICENSE +0 -0
  72. {empathy_framework-5.1.0.dist-info → empathy_framework-5.2.1.dist-info}/licenses/LICENSE_CHANGE_ANNOUNCEMENT.md +0 -0
  73. {empathy_framework-5.1.0.dist-info → empathy_framework-5.2.1.dist-info}/top_level.txt +0 -0
@@ -1,15 +1,34 @@
- """Autonomous Test Generation with Dashboard Integration.
+ """Autonomous Test Generation with Dashboard Integration - Enhanced Edition.
 
  Generates behavioral tests with real-time monitoring via Agent Coordination Dashboard.
 
+ ENHANCEMENTS (Phase 1):
+ - Extended thinking mode for better test planning
+ - Prompt caching for 90% cost reduction
+ - Full source code (no truncation)
+ - Workflow-specific prompts with mocking templates
+ - Few-shot learning with examples
+
+ ENHANCEMENTS (Phase 2 - Multi-Turn Refinement):
+ - Iterative test generation with validation loop
+ - Automatic failure detection and fixing
+ - Conversation history for context preservation
+
+ ENHANCEMENTS (Phase 3 - Coverage-Guided Generation):
+ - Coverage analysis integration
+ - Iterative coverage improvement targeting uncovered lines
+ - Systematic path to 80%+ coverage
+
  Copyright 2026 Smart-AI-Memory
  Licensed under Apache 2.0
  """
 
  import json
  import logging
+ import re
  import subprocess
  import sys
+ from dataclasses import dataclass
  from pathlib import Path
  from typing import Any
 
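Note: the Phase 1 bullets above correspond to two request-level features of the Anthropic Messages API that the hunks below adopt: an extended-thinking budget and `cache_control` markers on repeated prompt blocks. A minimal sketch of that request shape, assuming the `anthropic` Python SDK (model id and token budgets are copied from the hunks below; everything else is illustrative):

```python
# Sketch only (not package code): the request shape behind "extended thinking"
# and "prompt caching" as used later in this diff.
import anthropic

client = anthropic.Anthropic()  # reads ANTHROPIC_API_KEY from the environment
response = client.messages.create(
    model="claude-sonnet-4-5",
    max_tokens=40000,  # total output budget, matching the diff
    thinking={"type": "enabled", "budget_tokens": 20000},  # planning happens in thinking blocks
    messages=[{
        "role": "user",
        "content": [
            # Blocks marked ephemeral are cached server-side; repeat calls that
            # share this prefix are billed at the reduced cache-read rate.
            {"type": "text", "text": "<static few-shot examples>", "cache_control": {"type": "ephemeral"}},
            {"type": "text", "text": "<per-module generation prompt>"},
        ],
    }],
)
```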
@@ -21,21 +40,58 @@ from empathy_os.telemetry.feedback_loop import FeedbackLoop
  logger = logging.getLogger(__name__)
 
 
- class AutonomousTestGenerator:
-     """Generate tests autonomously with dashboard monitoring."""
+ @dataclass
+ class ValidationResult:
+     """Result of pytest validation."""
+     passed: bool
+     failures: str
+     error_count: int
+     output: str
+
+
+ @dataclass
+ class CoverageResult:
+     """Result of coverage analysis."""
+     coverage: float
+     missing_lines: list[int]
+     total_statements: int
+     covered_statements: int
 
-     def __init__(self, agent_id: str, batch_num: int, modules: list[dict[str, Any]]):
+
+ class AutonomousTestGenerator:
+     """Generate tests autonomously with dashboard monitoring and Anthropic best practices."""
+
+     def __init__(
+         self,
+         agent_id: str,
+         batch_num: int,
+         modules: list[dict[str, Any]],
+         enable_refinement: bool = True,
+         max_refinement_iterations: int = 3,
+         enable_coverage_guided: bool = False,
+         target_coverage: float = 0.80
+     ):
          """Initialize generator.
 
          Args:
              agent_id: Unique agent identifier
              batch_num: Batch number (1-18)
              modules: List of modules to generate tests for
+             enable_refinement: Enable Phase 2 multi-turn refinement (default: True)
+             max_refinement_iterations: Max iterations for refinement (default: 3)
+             enable_coverage_guided: Enable Phase 3 coverage-guided generation (default: False)
+             target_coverage: Target coverage percentage (default: 0.80 = 80%)
          """
          self.agent_id = agent_id
          self.batch_num = batch_num
          self.modules = modules
 
+         # Phase 2 & 3 configuration
+         self.enable_refinement = enable_refinement
+         self.max_refinement_iterations = max_refinement_iterations
+         self.enable_coverage_guided = enable_coverage_guided
+         self.target_coverage = target_coverage
+
          # Initialize memory backend for dashboard integration
          try:
              self.memory = RedisShortTermMemory()
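The widened constructor is the seam for everything that follows: Phases 2 and 3 are plain keyword switches. A hypothetical instantiation for illustration (the module dict keys are assumptions; the diff only pins the `list[dict[str, Any]]` type):

```python
# Hypothetical usage; the "name"/"path" keys are illustrative, not from the diff.
generator = AutonomousTestGenerator(
    agent_id="test-gen-batch3",
    batch_num=3,
    modules=[{"name": "config", "path": "src/empathy_os/config.py"}],
    enable_refinement=True,         # Phase 2: multi-turn fix loop (default)
    enable_coverage_guided=False,   # Phase 3: opt-in, slower
    target_coverage=0.80,
)
```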
@@ -51,6 +107,8 @@ class AutonomousTestGenerator:
          self.output_dir = Path(f"tests/behavioral/generated/batch{batch_num}")
          self.output_dir.mkdir(parents=True, exist_ok=True)
 
+         logger.info(f"Generator initialized: refinement={enable_refinement}, coverage_guided={enable_coverage_guided}")
+
      def generate_all(self) -> dict[str, Any]:
          """Generate tests for all modules with progress tracking.
 
@@ -204,8 +262,17 @@
          # Extract module path for imports
          module_path = str(source_file).replace("src/", "").replace(".py", "").replace("/", ".")
 
-         # Generate tests using LLM agent (inline - no Task tool)
-         test_content = self._generate_with_llm(module_name, module_path, source_file, source_code)
+         # Generate tests using LLM agent with Anthropic best practices
+         # Phase 1: Basic generation
+         # Phase 2: Multi-turn refinement (if enabled)
+         # Phase 3: Coverage-guided improvement (if enabled)
+
+         if self.enable_refinement:
+             logger.info(f"🔄 Using Phase 2: Multi-turn refinement for {module_name}")
+             test_content = self._generate_with_refinement(module_name, module_path, source_file, source_code, test_file)
+         else:
+             logger.info(f"📝 Using Phase 1: Basic generation for {module_name}")
+             test_content = self._generate_with_llm(module_name, module_path, source_file, source_code)
 
          if not test_content:
              logger.warning(f"LLM generation failed for {module_name}")
@@ -213,7 +280,19 @@
 
          logger.info(f"LLM generated {len(test_content)} bytes for {module_name}")
 
-         # Write test file
+         # Phase 3: Coverage-guided improvement (if enabled)
+         if self.enable_coverage_guided:
+             logger.info(f"📊 Applying Phase 3: Coverage-guided improvement for {module_name}")
+             improved_content = self._generate_with_coverage_target(
+                 module_name, module_path, source_file, source_code, test_file, test_content
+             )
+             if improved_content:
+                 test_content = improved_content
+                 logger.info(f"✅ Coverage-guided improvement complete for {module_name}")
+             else:
+                 logger.warning(f"⚠️ Coverage-guided improvement failed, using previous version for {module_name}")
+
+         # Write final test file
          test_file.write_text(test_content)
          logger.info(f"Wrote test file: {test_file}")
 
@@ -224,14 +303,170 @@
 
          return test_file
 
+     def _is_workflow_module(self, source_code: str, module_path: str) -> bool:
+         """Detect if module is a workflow requiring special handling.
+
+         Args:
+             source_code: Source code content
+             module_path: Python import path
+
+         Returns:
+             True if this is a workflow module needing LLM mocking
+         """
+         # Check for workflow indicators
+         indicators = [
+             r"class\s+\w+Workflow",
+             r"async\s+def\s+execute",
+             r"tier_routing",
+             r"LLMProvider",
+             r"TelemetryCollector",
+             r"from\s+anthropic\s+import",
+             r"messages\.create",
+             r"client\.messages"
+         ]
+
+         return any(re.search(pattern, source_code) for pattern in indicators)
+
+     def _get_example_tests(self) -> str:
+         """Get few-shot examples of excellent tests for prompt learning."""
+         return """EXAMPLE 1: Testing a utility function with mocking
+ ```python
+ import pytest
+ from unittest.mock import Mock, patch
+ from mymodule import process_data
+
+ class TestProcessData:
+     def test_processes_valid_data_successfully(self):
+         \"\"\"Given valid input data, when processing, then returns expected result.\"\"\"
+         # Given
+         input_data = {"key": "value", "count": 42}
+
+         # When
+         result = process_data(input_data)
+
+         # Then
+         assert result is not None
+         assert result["status"] == "success"
+         assert result["processed"] is True
+
+     def test_handles_invalid_data_with_error(self):
+         \"\"\"Given invalid input, when processing, then raises ValueError.\"\"\"
+         # Given
+         invalid_data = {"missing": "key"}
+
+         # When/Then
+         with pytest.raises(ValueError, match="Required key 'key' not found"):
+             process_data(invalid_data)
+ ```
+
+ EXAMPLE 2: Testing a workflow with LLM mocking
+ ```python
+ import pytest
+ from unittest.mock import Mock, AsyncMock, patch
+ from mymodule import MyWorkflow
+
+ @pytest.fixture
+ def mock_llm_client(mocker):
+     \"\"\"Mock Anthropic LLM client.\"\"\"
+     mock = mocker.patch('anthropic.Anthropic')
+     mock_response = Mock()
+     mock_response.content = [Mock(text="mock LLM response")]
+     mock_response.usage = Mock(input_tokens=100, output_tokens=50)
+     mock_response.stop_reason = "end_turn"
+     mock.return_value.messages.create = AsyncMock(return_value=mock_response)
+     return mock
+
+ class TestMyWorkflow:
+     @pytest.mark.asyncio
+     async def test_executes_successfully_with_mocked_llm(self, mock_llm_client):
+         \"\"\"Given valid input, when executing workflow, then completes successfully.\"\"\"
+         # Given
+         workflow = MyWorkflow()
+         input_data = {"prompt": "test prompt"}
+
+         # When
+         result = await workflow.execute(input_data)
+
+         # Then
+         assert result is not None
+         assert "response" in result
+         mock_llm_client.return_value.messages.create.assert_called_once()
+
+     @pytest.mark.asyncio
+     async def test_handles_api_error_gracefully(self, mock_llm_client):
+         \"\"\"Given API failure, when executing, then handles error appropriately.\"\"\"
+         # Given
+         workflow = MyWorkflow()
+         mock_llm_client.return_value.messages.create.side_effect = Exception("API Error")
+
+         # When/Then
+         with pytest.raises(Exception, match="API Error"):
+             await workflow.execute({"prompt": "test"})
+ ```
+ """
+
+     def _get_workflow_specific_prompt(self, module_name: str, module_path: str, source_code: str) -> str:
+         """Get workflow-specific test generation prompt with comprehensive mocking guidance."""
+         return f"""Generate comprehensive tests for this WORKFLOW module.
+
+ ⚠️ CRITICAL: This module makes LLM API calls and requires proper mocking.
+
+ MODULE: {module_name}
+ IMPORT PATH: {module_path}
+
+ SOURCE CODE (COMPLETE - NO TRUNCATION):
+ ```python
+ {source_code}
+ ```
+
+ WORKFLOW TESTING REQUIREMENTS:
+
+ 1. **Mock LLM API calls** - NEVER make real API calls in tests
+ ```python
+ @pytest.fixture
+ def mock_llm_client(mocker):
+     mock = mocker.patch('anthropic.Anthropic')
+     mock_response = Mock()
+     mock_response.content = [Mock(text="mock response")]
+     mock_response.usage = Mock(input_tokens=100, output_tokens=50)
+     mock_response.stop_reason = "end_turn"
+     mock.return_value.messages.create = AsyncMock(return_value=mock_response)
+     return mock
+ ```
+
+ 2. **Test tier routing** - Verify correct model selection (cheap/capable/premium)
+ 3. **Test telemetry** - Mock and verify telemetry recording
+ 4. **Test cost calculation** - Verify token usage and cost tracking
+ 5. **Test error handling** - Mock API failures, timeouts, rate limits
+ 6. **Test caching** - Mock cache hits/misses if applicable
+
+ TARGET COVERAGE: 40-50% (realistic for workflow classes with proper mocking)
+
+ Generate a complete test file with:
+ - Copyright header: "Generated by enhanced autonomous test generation system."
+ - Proper imports (from {module_path})
+ - Mock fixtures for ALL external dependencies (LLM, databases, APIs, file I/O)
+ - Given/When/Then structure in docstrings
+ - Both success and failure test cases
+ - Edge case handling
+ - Docstrings for all tests describing behavior
+
+ Return ONLY the complete Python test file, no explanations."""
+
      def _generate_with_llm(self, module_name: str, module_path: str, source_file: Path, source_code: str) -> str | None:
-         """Generate comprehensive tests using LLM.
+         """Generate comprehensive tests using LLM with Anthropic best practices.
+
+         ENHANCEMENTS (Phase 1):
+         - Extended thinking (20K token budget) for thorough test planning
+         - Prompt caching for 90% cost reduction
+         - Full source code (NO TRUNCATION)
+         - Workflow-specific prompts when detected
 
          Args:
              module_name: Name of module being tested
             module_path: Python import path (e.g., empathy_os.config)
              source_file: Path to source file
-             source_code: Source code content
+             source_code: Source code content (FULL, not truncated)
 
          Returns:
              Test file content with comprehensive tests, or None if generation failed
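`_is_workflow_module` is a plain regex scan, so any module that merely mentions `messages.create` is routed to the mocking-heavy prompt. A standalone illustration of the detection, using a hypothetical snippet and a subset of the indicator list above:

```python
# Standalone illustration of the indicator scan in _is_workflow_module.
import re

indicators = [r"class\s+\w+Workflow", r"async\s+def\s+execute", r"messages\.create"]
snippet = '''
class SeoWorkflow:
    async def execute(self, data):
        return await self.client.messages.create(model="...", messages=[])
'''
assert any(re.search(p, snippet) for p in indicators)  # treated as a workflow module
```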
@@ -250,15 +485,22 @@
              logger.error("ANTHROPIC_API_KEY not set")
              return None
 
-         # Craft comprehensive test generation prompt
-         prompt = f"""Generate comprehensive behavioral tests for this Python module.
+         # Detect if this is a workflow module
+         is_workflow = self._is_workflow_module(source_code, module_path)
+         logger.info(f"Module {module_name}: workflow={is_workflow}, size={len(source_code)} bytes (FULL)")
+
+         # Build appropriate prompt based on module type
+         if is_workflow:
+             generation_prompt = self._get_workflow_specific_prompt(module_name, module_path, source_code)
+         else:
+             generation_prompt = f"""Generate comprehensive behavioral tests for this Python module.
 
  SOURCE FILE: {source_file}
  MODULE PATH: {module_path}
 
- SOURCE CODE:
+ SOURCE CODE (COMPLETE):
  ```python
- {source_code[:3000]}{"..." if len(source_code) > 3000 else ""}
+ {source_code}
  ```
 
  Generate a complete test file that:
@@ -287,21 +529,59 @@ Licensed under Apache 2.0
 
  Return ONLY the complete Python test file content, no explanations."""
 
+         # Build messages with prompt caching (90% cost reduction on retries)
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "text",
+                         "text": "You are an expert Python test engineer. Here are examples of excellent tests:",
+                         "cache_control": {"type": "ephemeral"}
+                     },
+                     {
+                         "type": "text",
+                         "text": self._get_example_tests(),
+                         "cache_control": {"type": "ephemeral"}
+                     },
+                     {
+                         "type": "text",
+                         "text": generation_prompt
+                     }
+                 ]
+             }
+         ]
+
          try:
-             # Call Anthropic API with capable model
-             logger.info(f"Calling LLM for {module_name} (source: {len(source_code)} bytes)")
+             # Call Anthropic API with extended thinking and caching
+             logger.info(f"Calling LLM with extended thinking for {module_name} (workflow={is_workflow})")
              client = anthropic.Anthropic(api_key=api_key)
              response = client.messages.create(
                  model="claude-sonnet-4-5",  # capable tier
-                 max_tokens=4000,
-                 messages=[{"role": "user", "content": prompt}],
+                 max_tokens=40000,  # Very generous total budget for comprehensive tests
+                 thinking={
+                     "type": "enabled",
+                     "budget_tokens": 20000  # Generous thinking budget for thorough planning
+                 },
+                 messages=messages,
+                 timeout=900.0,  # 15 minutes timeout for extended thinking + generation
              )
 
              if not response.content:
                  logger.warning(f"Empty LLM response for {module_name}")
                  return None
 
-             test_content = response.content[0].text.strip()
+             # Extract test content (thinking comes first, then text)
+             test_content = None
+             for block in response.content:
+                 if block.type == "text":
+                     test_content = block.text.strip()
+                     break
+
+             if not test_content:
+                 logger.warning(f"No text content in LLM response for {module_name}")
+                 return None
+
              logger.info(f"LLM returned {len(test_content)} bytes for {module_name}")
 
              if len(test_content) < 100:
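On the cost claim: with `cache_control`, the first request pays a write premium on the marked blocks and later requests pay the cache-read rate. A back-of-envelope check under Anthropic's published multipliers (cache writes around 1.25x and cache reads around 0.1x of the base input price; the token counts here are invented):

```python
# Assumed numbers: 4,000 tokens of cached example blocks reused across 18 modules.
base_tokens = 4000
first_call = base_tokens * 1.25          # cache write premium
later_calls = base_tokens * 0.10 * 17    # cache reads
total_cached = first_call + later_calls  # 11,800 token-equivalents
total_uncached = base_tokens * 18        # 72,000 token-equivalents
print(f"~{1 - total_cached / total_uncached:.0%} saved on the shared blocks")
# The "90%" in the docstring is the per-read discount, not the whole-prompt saving.
```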
@@ -314,6 +594,20 @@ Return ONLY the complete Python test file content, no explanations."""
              if test_content.endswith("```"):
                  test_content = test_content[:-3].strip()
 
+             # Check for truncation indicators
+             if response.stop_reason == "max_tokens":
+                 logger.warning(f"⚠️ LLM response truncated for {module_name} (hit max_tokens)")
+                 # Response might be incomplete but let validation catch it
+
+             # Quick syntax pre-check before returning
+             try:
+                 import ast
+                 ast.parse(test_content)
+                 logger.info(f"✓ Quick syntax check passed for {module_name}")
+             except SyntaxError as e:
+                 logger.error(f"❌ LLM generated invalid syntax for {module_name}: {e.msg} at line {e.lineno}")
+                 return None
+
              logger.info(f"Test content cleaned, final size: {len(test_content)} bytes")
              return test_content
 
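The `ast.parse` pre-check pairs with the `stop_reason` warning just above it: a response cut off at `max_tokens` almost always fails to parse, so it is rejected before a broken file lands on disk. A quick illustration:

```python
# Standalone illustration: ast.parse rejects a response truncated mid-expression.
import ast

truncated = "def test_example():\n    assert compute("  # cut off by max_tokens
try:
    ast.parse(truncated)
except SyntaxError as e:
    print(f"invalid syntax at line {e.lineno}: {e.msg}")  # caught before write_text
```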
@@ -321,161 +615,524 @@ Return ONLY the complete Python test file content, no explanations."""
              logger.error(f"LLM generation error for {module_name}: {e}", exc_info=True)
              return None
 
-     def _create_test_template_DEPRECATED(self, module_name: str, source_file: Path, source_code: str) -> str:
-         """Create comprehensive behavioral test template.
+     def _run_pytest_validation(self, test_file: Path) -> ValidationResult:
+         """Run pytest on generated tests and collect failures.
+
+         Args:
+             test_file: Path to test file to validate
+
+         Returns:
+             ValidationResult with test outcomes and failure details
+         """
+         try:
+             result = subprocess.run(
+                 [sys.executable, "-m", "pytest", str(test_file), "-v", "--tb=short"],
+                 capture_output=True,
+                 text=True,
+                 timeout=60,
+             )
+
+             passed = result.returncode == 0
+             output = result.stdout + "\n" + result.stderr
+
+             # Count errors
+             error_count = output.count("FAILED") + output.count("ERROR")
+
+             # Extract failure details
+             failures = ""
+             if not passed:
+                 # Extract relevant failure information
+                 lines = output.split("\n")
+                 failure_lines = []
+                 in_failure = False
+                 for line in lines:
+                     if "FAILED" in line or "ERROR" in line:
+                         in_failure = True
+                     if in_failure:
+                         failure_lines.append(line)
+                         if line.startswith("="):  # End of failure section
+                             in_failure = False
+                 failures = "\n".join(failure_lines[:100])  # Limit to 100 lines
+
+             logger.info(f"Pytest validation: passed={passed}, errors={error_count}")
+
+             return ValidationResult(
+                 passed=passed,
+                 failures=failures,
+                 error_count=error_count,
+                 output=output
+             )
+
+         except subprocess.TimeoutExpired:
+             logger.error(f"Pytest validation timeout for {test_file}")
+             return ValidationResult(
+                 passed=False,
+                 failures="Validation timeout after 60 seconds",
+                 error_count=1,
+                 output="Timeout"
+             )
+         except Exception as e:
+             logger.error(f"Pytest validation exception: {e}")
+             return ValidationResult(
+                 passed=False,
+                 failures=f"Validation exception: {e}",
+                 error_count=1,
+                 output=str(e)
+             )
+
+     def _call_llm_with_history(
+         self,
+         conversation_history: list[dict[str, Any]],
+         api_key: str
+     ) -> str | None:
+         """Call LLM with conversation history for refinement.
+
+         Args:
+             conversation_history: List of messages (role + content)
+             api_key: Anthropic API key
+
+         Returns:
+             Refined test content or None if failed
+         """
+         try:
+             import anthropic
+
+             client = anthropic.Anthropic(api_key=api_key)
+             response = client.messages.create(
+                 model="claude-sonnet-4-5",
+                 max_tokens=40000,  # Very generous total budget for iterative refinement
+                 thinking={
+                     "type": "enabled",
+                     "budget_tokens": 20000  # Generous thinking budget for thorough analysis
+                 },
+                 messages=conversation_history,
+                 timeout=900.0,  # 15 minutes timeout for refinement iterations
+             )
+
+             if not response.content:
+                 logger.warning("Empty LLM response during refinement")
+                 return None
+
+             # Extract text content
+             test_content = None
+             for block in response.content:
+                 if block.type == "text":
+                     test_content = block.text.strip()
+                     break
+
+             if not test_content:
+                 logger.warning("No text content in refinement response")
+                 return None
+
+             # Clean up response
+             if test_content.startswith("```python"):
+                 test_content = test_content[len("```python"):].strip()
+             if test_content.endswith("```"):
+                 test_content = test_content[:-3].strip()
+
+             return test_content
+
+         except Exception as e:
+             logger.error(f"LLM refinement error: {e}", exc_info=True)
+             return None
+
+     def _generate_with_refinement(
+         self,
+         module_name: str,
+         module_path: str,
+         source_file: Path,
+         source_code: str,
+         test_file: Path
+     ) -> str | None:
+         """Generate tests with iterative refinement (Phase 2).
+
+         Process:
+         1. Generate initial tests
+         2. Run pytest validation
+         3. If failures, ask Claude to fix
+         4. Repeat until tests pass or max iterations
 
          Args:
              module_name: Name of module being tested
+             module_path: Python import path
              source_file: Path to source file
              source_code: Source code content
+             test_file: Path where tests will be written
 
          Returns:
-             Test file content with comprehensive tests
+             Final test content or None if all attempts failed
          """
-         import ast
+         import os
 
-         # Extract module path for imports
-         module_path = str(source_file).replace("src/", "").replace(".py", "").replace("/", ".")
+         api_key = os.getenv("ANTHROPIC_API_KEY")
+         if not api_key:
+             logger.error("ANTHROPIC_API_KEY not set")
+             return None
 
-         # Parse source to find functions and classes
-         try:
-             tree = ast.parse(source_code)
-             functions = [node.name for node in ast.walk(tree) if isinstance(node, ast.FunctionDef) and not node.name.startswith('_')]
-             classes = [node.name for node in ast.walk(tree) if isinstance(node, ast.ClassDef)]
-         except:
-             functions = []
-             classes = []
-
-         # Generate test classes for each class found
-         test_classes = []
-         for cls_name in classes[:5]:  # Limit to 5 classes
-             test_classes.append(f'''
- class Test{cls_name}:
-     """Behavioral tests for {cls_name} class."""
-
-     def test_{cls_name.lower()}_instantiation(self):
-         """Test {cls_name} can be instantiated."""
-         # Given: Class is available
-         # When: Creating instance
-         try:
-             from {module_path} import {cls_name}
-             # Then: Instance created successfully
-             assert {cls_name} is not None
-         except ImportError:
-             pytest.skip("Class not available")
+         logger.info(f"🔄 Phase 2: Multi-turn refinement enabled for {module_name} (max {self.max_refinement_iterations} iterations)")
 
-     def test_{cls_name.lower()}_has_expected_methods(self):
-         """Test {cls_name} has expected interface."""
-         # Given: Class is available
-         try:
-             from {module_path} import {cls_name}
-             # When: Checking methods
-             # Then: Common methods should exist
-             assert hasattr({cls_name}, '__init__')
-         except ImportError:
-             pytest.skip("Class not available")
- ''')
-
-         # Generate tests for functions
-         function_tests = []
-         for func_name in functions[:10]:  # Limit to 10 functions
-             function_tests.append(f'''
-     def test_{func_name}_callable(self):
-         """Test {func_name} function is callable."""
-         # Given: Function is available
-         try:
-             from {module_path} import {func_name}
-             # When: Checking if callable
-             # Then: Function should be callable
-             assert callable({func_name})
-         except ImportError:
-             pytest.skip("Function not available")
+         # Step 1: Generate initial tests
+         test_content = self._generate_with_llm(module_name, module_path, source_file, source_code)
+         if not test_content:
+             logger.warning("Initial generation failed")
+             return None
 
-     def test_{func_name}_with_valid_input(self):
-         """Test {func_name} with valid input."""
-         # Given: Function is available
-         try:
-             from {module_path} import {func_name}
-             # When: Called with mocked dependencies
-             with patch.object({module_path}, '{func_name}', return_value=Mock()) as mock_func:
-                 result = mock_func()
-                 # Then: Should return successfully
-                 assert result is not None
-         except (ImportError, AttributeError):
-             pytest.skip("Function not available or cannot be mocked")
- ''')
-
-         # Combine all test content
-         test_content = f'''"""Behavioral tests for {module_name}.
+         # Build conversation history for subsequent refinements
+         is_workflow = self._is_workflow_module(source_code, module_path)
 
- Generated by enhanced autonomous test generation system.
+         # Initial prompt (for history tracking)
+         if is_workflow:
+             initial_prompt = self._get_workflow_specific_prompt(module_name, module_path, source_code)
+         else:
+             initial_prompt = f"""Generate comprehensive behavioral tests for {module_name}.
 
- Copyright 2026 Smart-AI-Memory
- Licensed under Apache 2.0
- """
+ SOURCE CODE:
+ ```python
+ {source_code}
+ ```"""
+
+         conversation_history = [
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "text", "text": "You are an expert Python test engineer. Examples:", "cache_control": {"type": "ephemeral"}},
+                     {"type": "text", "text": self._get_example_tests(), "cache_control": {"type": "ephemeral"}},
+                     {"type": "text", "text": initial_prompt}
+                 ]
+             },
+             {
+                 "role": "assistant",
+                 "content": test_content
+             }
+         ]
 
- import pytest
- from unittest.mock import Mock, patch, MagicMock, AsyncMock
- from pathlib import Path
+         # Step 2: Iterative refinement loop
+         for iteration in range(self.max_refinement_iterations):
+             logger.info(f"📝 Refinement iteration {iteration + 1}/{self.max_refinement_iterations} for {module_name}")
+
+             # Write current version to temp file
+             temp_test_file = test_file.parent / f"_temp_{test_file.name}"
+             temp_test_file.write_text(test_content)
+
+             # Validate with pytest
+             validation_result = self._run_pytest_validation(temp_test_file)
+
+             if validation_result.passed:
+                 logger.info(f"✅ Tests passed on iteration {iteration + 1} for {module_name}")
+                 temp_test_file.unlink()  # Clean up
+                 return test_content
+
+             # Tests failed - ask Claude to fix
+             logger.warning(f"⚠️ Tests failed on iteration {iteration + 1}: {validation_result.error_count} errors")
+
+             refinement_prompt = f"""The tests you generated have failures. Please fix these specific issues:
 
- # Import module under test
- try:
-     import {module_path}
- except ImportError as e:
-     pytest.skip(f"Cannot import {module_path}: {{e}}", allow_module_level=True)
-
-
- class TestModule{module_name.title().replace("_", "")}:
-     """Behavioral tests for {module_name} module."""
-
-     def test_module_imports_successfully(self):
-         """Test that module can be imported."""
-         # Given: Module exists
-         # When: Importing module
-         # Then: No import errors
-         assert {module_path} is not None
-
-     def test_module_has_expected_attributes(self):
-         """Test module has expected top-level attributes."""
-         # Given: Module is imported
-         # When: Checking for __doc__
-         # Then: Documentation should exist
-         assert hasattr({module_path}, '__doc__')
- {"".join(function_tests)}
-
- {"".join(test_classes)}
-
- class TestEdgeCases:
-     """Edge case and error handling tests."""
-
-     def test_import_does_not_raise_exceptions(self):
-         """Test that importing module doesn't raise exceptions."""
-         # Given: Module path is valid
-         # When: Importing
-         # Then: Should not raise
+ FAILURES:
+ {validation_result.failures[:2000]}
+
+ Requirements:
+ 1. Fix ONLY the failing tests - don't rewrite everything
+ 2. Ensure imports are correct
+ 3. Ensure mocking is properly configured
+ 4. Return the COMPLETE corrected test file (not just the fixes)
+ 5. Keep the same structure and copyright header
+
+ Return ONLY the complete Python test file, no explanations."""
+
+             # Add to conversation history
+             conversation_history.append({
+                 "role": "user",
+                 "content": refinement_prompt
+             })
+
+             # Call LLM for refinement
+             refined_content = self._call_llm_with_history(conversation_history, api_key)
+
+             if not refined_content:
+                 logger.error(f"❌ Refinement failed on iteration {iteration + 1}")
+                 temp_test_file.unlink()
+                 break
+
+             # Update content and history
+             test_content = refined_content
+             conversation_history.append({
+                 "role": "assistant",
+                 "content": test_content
+             })
+
+             logger.info(f"🔄 Refinement iteration {iteration + 1} complete, retrying validation...")
+
+         # Max iterations reached
+         logger.warning(f"⚠️ Max refinement iterations reached for {module_name} - returning best attempt")
+         return test_content
+
+     def _run_coverage_analysis(self, test_file: Path, source_file: Path) -> CoverageResult:
+         """Run coverage analysis on tests.
+
+         Args:
+             test_file: Path to test file
+             source_file: Path to source file being tested
+
+         Returns:
+             CoverageResult with coverage metrics and missing lines
+         """
          try:
-             import {module_path}
-             assert True
+             # Run pytest with coverage
+             result = subprocess.run(
+                 [
+                     sys.executable, "-m", "pytest",
+                     str(test_file),
+                     f"--cov={source_file.parent}",
+                     "--cov-report=term-missing",
+                     "--cov-report=json",
+                     "-v"
+                 ],
+                 capture_output=True,
+                 text=True,
+                 timeout=120,
+                 cwd=Path.cwd()
+             )
+
+             # Parse coverage from JSON report
+             coverage_json_path = Path(".coverage.json")
+             if not coverage_json_path.exists():
+                 logger.warning("Coverage JSON not generated")
+                 return CoverageResult(
+                     coverage=0.0,
+                     missing_lines=[],
+                     total_statements=0,
+                     covered_statements=0
+                 )
+
+             with open(coverage_json_path) as f:
+                 coverage_data = json.load(f)
+
+             # Find coverage for our specific source file
+             source_key = str(source_file)
+             file_coverage = None
+             for key in coverage_data.get("files", {}).keys():
+                 if source_file.name in key or source_key in key:
+                     file_coverage = coverage_data["files"][key]
+                     break
+
+             if not file_coverage:
+                 logger.warning(f"No coverage data found for {source_file}")
+                 return CoverageResult(
+                     coverage=0.0,
+                     missing_lines=[],
+                     total_statements=0,
+                     covered_statements=0
+                 )
+
+             # Extract metrics
+             total_statements = file_coverage["summary"]["num_statements"]
+             covered_statements = file_coverage["summary"]["covered_lines"]
+             coverage_pct = file_coverage["summary"]["percent_covered"] / 100.0
+             missing_lines = file_coverage["missing_lines"]
+
+             logger.info(f"Coverage: {coverage_pct:.1%} ({covered_statements}/{total_statements} statements)")
+
+             return CoverageResult(
+                 coverage=coverage_pct,
+                 missing_lines=missing_lines,
+                 total_statements=total_statements,
+                 covered_statements=covered_statements
+             )
+
+         except subprocess.TimeoutExpired:
+             logger.error("Coverage analysis timeout")
+             return CoverageResult(coverage=0.0, missing_lines=[], total_statements=0, covered_statements=0)
          except Exception as e:
-             pytest.fail(f"Import raised unexpected exception: {{e}}")
+             logger.error(f"Coverage analysis error: {e}", exc_info=True)
+             return CoverageResult(coverage=0.0, missing_lines=[], total_statements=0, covered_statements=0)
+
+     def _extract_uncovered_lines(self, source_file: Path, missing_lines: list[int]) -> str:
+         """Extract source code for uncovered lines.
+
+         Args:
+             source_file: Path to source file
+             missing_lines: List of uncovered line numbers
+
+         Returns:
+             Formatted string with uncovered code sections
+         """
+         if not missing_lines:
+             return "No uncovered lines"
 
-     def test_module_constants_are_defined(self):
-         """Test that common constants are properly defined."""
-         # Given: Module is imported
-         # When: Checking for logger or similar
-         # Then: Should have standard attributes
          try:
-             import {module_path}
-             # Check for common patterns
-             assert True  # Module loaded
-         except ImportError:
-             pytest.skip("Module not available")
- '''
+             source_lines = source_file.read_text().split("\n")
+
+             # Group consecutive lines into ranges
+             ranges = []
+             start = missing_lines[0]
+             end = start
+
+             for line_num in missing_lines[1:]:
+                 if line_num == end + 1:
+                     end = line_num
+                 else:
+                     ranges.append((start, end))
+                     start = line_num
+                     end = start
+             ranges.append((start, end))
+
+             # Extract code for each range with context
+             uncovered_sections = []
+             for start, end in ranges[:10]:  # Limit to 10 ranges
+                 context_start = max(0, start - 3)
+                 context_end = min(len(source_lines), end + 2)
+
+                 section = []
+                 section.append(f"Lines {start}-{end}:")
+                 for i in range(context_start, context_end):
+                     line_marker = ">>>" if start <= i + 1 <= end else "   "
+                     section.append(f"{line_marker} {i + 1:4d}: {source_lines[i]}")
+
+                 uncovered_sections.append("\n".join(section))
+
+             return "\n\n".join(uncovered_sections)
+
+         except Exception as e:
+             logger.error(f"Error extracting uncovered lines: {e}")
+             return f"Error extracting lines: {e}"
+
+     def _generate_with_coverage_target(
+         self,
+         module_name: str,
+         module_path: str,
+         source_file: Path,
+         source_code: str,
+         test_file: Path,
+         initial_test_content: str
+     ) -> str | None:
+         """Generate tests iteratively until coverage target met (Phase 3).
+
+         Process:
+         1. Start with initial tests
+         2. Run coverage analysis
+         3. If target not met, identify uncovered lines
+         4. Ask Claude to add tests for uncovered code
+         5. Repeat until target coverage reached or max iterations
+
+         Args:
+             module_name: Name of module being tested
+             module_path: Python import path
+             source_file: Path to source file
+             source_code: Source code content
+             test_file: Path to test file
+             initial_test_content: Initial test content from Phase 1/2
+
+         Returns:
+             Final test content with improved coverage or None if failed
+         """
+         import os
+
+         api_key = os.getenv("ANTHROPIC_API_KEY")
+         if not api_key:
+             logger.error("ANTHROPIC_API_KEY not set")
+             return None
+
+         logger.info(f"📊 Phase 3: Coverage-guided generation enabled (target: {self.target_coverage:.0%})")
+
+         test_content = initial_test_content
+         max_coverage_iterations = 5
+
+         for iteration in range(max_coverage_iterations):
+             logger.info(f"📈 Coverage iteration {iteration + 1}/{max_coverage_iterations} for {module_name}")
+
+             # Write current tests
+             test_file.write_text(test_content)
+
+             # Run coverage analysis
+             coverage_result = self._run_coverage_analysis(test_file, source_file)
+
+             logger.info(f"Current coverage: {coverage_result.coverage:.1%}, target: {self.target_coverage:.0%}")
+
+             # Check if target reached
+             if coverage_result.coverage >= self.target_coverage:
+                 logger.info(f"✅ Coverage target reached: {coverage_result.coverage:.1%}")
+                 return test_content
+
+             # Not enough progress
+             if iteration > 0 and coverage_result.coverage <= 0.05:
+                 logger.warning("⚠️ Coverage not improving, stopping")
+                 break
+
+             # Identify uncovered code
+             uncovered_code = self._extract_uncovered_lines(source_file, coverage_result.missing_lines)
+
+             # Ask Claude to add tests for uncovered lines
+             refinement_prompt = f"""Current coverage: {coverage_result.coverage:.1%}
+ Target coverage: {self.target_coverage:.0%}
+ Missing: {len(coverage_result.missing_lines)} lines
+
+ UNCOVERED CODE:
+ {uncovered_code[:3000]}
+
+ Please ADD tests to cover these specific uncovered lines. Requirements:
+ 1. Focus ONLY on the uncovered lines shown above
+ 2. Add new test methods to the existing test classes
+ 3. Return the COMPLETE test file with additions (not just new tests)
+ 4. Use appropriate mocking for external dependencies
+ 5. Keep existing tests intact - just add new ones
+
+ Return ONLY the complete Python test file with additions, no explanations."""
+
+             # Build conversation with caching
+             messages = [
+                 {
+                     "role": "user",
+                     "content": [
+                         {"type": "text", "text": "You are an expert Python test engineer. Examples:", "cache_control": {"type": "ephemeral"}},
+                         {"type": "text", "text": self._get_example_tests(), "cache_control": {"type": "ephemeral"}},
+                         {"type": "text", "text": f"Source code:\n```python\n{source_code}\n```", "cache_control": {"type": "ephemeral"}},
+                         {"type": "text", "text": f"Current tests:\n```python\n{test_content}\n```"},
+                         {"type": "text", "text": refinement_prompt}
+                     ]
+                 }
+             ]
+
+             # Call LLM for coverage improvement
+             try:
+                 import anthropic
+                 client = anthropic.Anthropic(api_key=api_key)
+                 response = client.messages.create(
+                     model="claude-sonnet-4-5",
+                     max_tokens=40000,  # Very generous total budget for coverage improvement
+                     thinking={"type": "enabled", "budget_tokens": 20000},  # Thorough thinking for coverage gaps
+                     messages=messages,
+                     timeout=900.0,  # 15 minutes timeout for coverage-guided iterations
+                 )
+
+                 refined_content = None
+                 for block in response.content:
+                     if block.type == "text":
+                         refined_content = block.text.strip()
+                         break
+
+                 if not refined_content:
+                     logger.warning(f"No content in coverage refinement iteration {iteration + 1}")
+                     break
 
+                 # Clean up
+                 if refined_content.startswith("```python"):
+                     refined_content = refined_content[len("```python"):].strip()
+                 if refined_content.endswith("```"):
+                     refined_content = refined_content[:-3].strip()
+
+                 test_content = refined_content
+                 logger.info(f"🔄 Coverage iteration {iteration + 1} complete, retrying analysis...")
+
+             except Exception as e:
+                 logger.error(f"Coverage refinement error on iteration {iteration + 1}: {e}")
+                 break
+
+         # Return best attempt
+         logger.info(f"Coverage-guided generation complete: final coverage ~{coverage_result.coverage:.1%}")
          return test_content
 
      def _validate_test_file(self, test_file: Path) -> bool:
-         """Validate test file can be imported.
+         """Validate test file can be imported and has valid syntax.
 
          Args:
              test_file: Path to test file
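For reference, `_run_coverage_analysis` above consumes coverage.py's JSON report. A trimmed sketch of that report showing only the keys the method reads (field names are coverage.py's; the numbers are invented):

```python
# Trimmed sketch of a coverage.py JSON report ("--cov-report=json").
report = {
    "files": {
        "src/empathy_os/config.py": {
            "summary": {
                "num_statements": 120,      # -> total_statements
                "covered_lines": 96,        # -> covered_statements
                "percent_covered": 80.0,    # -> coverage (divided by 100 in the diff)
            },
            "missing_lines": [45, 46, 88],  # fed to _extract_uncovered_lines
        }
    }
}
coverage = report["files"]["src/empathy_os/config.py"]["summary"]["percent_covered"] / 100.0
```

(The method reads `Path(".coverage.json")`; coverage.py's JSON reporter typically writes `coverage.json`, so the exact path may depend on local configuration.)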
@@ -483,6 +1140,20 @@ class TestEdgeCases:
          Returns:
              True if valid, False otherwise
          """
+         # Step 1: Check for syntax errors with ast.parse (fast)
+         try:
+             import ast
+             content = test_file.read_text()
+             ast.parse(content)
+             logger.info(f"✓ Syntax check passed for {test_file.name}")
+         except SyntaxError as e:
+             logger.error(f"❌ Syntax error in {test_file.name} at line {e.lineno}: {e.msg}")
+             return False
+         except Exception as e:
+             logger.error(f"❌ Cannot parse {test_file.name}: {e}")
+             return False
+
+         # Step 2: Check if pytest can collect the tests
          try:
              result = subprocess.run(
                  [sys.executable, "-m", "pytest", "--collect-only", str(test_file)],
@@ -492,14 +1163,18 @@ class TestEdgeCases:
              )
 
              if result.returncode != 0:
-                 logger.warning(f"Validation failed for {test_file.name}: {result.stderr[:500]}")
-                 # Don't fail validation on collection errors - test might still be valuable
-                 # Just log the error and keep the file
-                 return True  # Changed from False - be permissive
+                 logger.error(f" Pytest collection failed for {test_file.name}")
+                 logger.error(f" Error: {result.stderr[:500]}")
+                 return False
 
+             logger.info(f"✓ Pytest collection passed for {test_file.name}")
              return True
+
+         except subprocess.TimeoutExpired:
+             logger.error(f"❌ Validation timeout for {test_file.name}")
+             return False
          except Exception as e:
-             logger.error(f"Validation exception for {test_file}: {e}")
+             logger.error(f"Validation exception for {test_file}: {e}")
              return False
 
      def _count_tests(self) -> int:
@@ -524,25 +1199,42 @@ class TestEdgeCases:
              return 0
 
 
- def run_batch_generation(batch_num: int, modules_json: str) -> None:
+ def run_batch_generation(
+     batch_num: int,
+     modules_json: str,
+     enable_refinement: bool = True,
+     enable_coverage_guided: bool = False
+ ) -> None:
      """Run test generation for a batch.
 
      Args:
          batch_num: Batch number
          modules_json: JSON string of modules to process
+         enable_refinement: Enable Phase 2 multi-turn refinement (default: True)
+         enable_coverage_guided: Enable Phase 3 coverage-guided generation (default: False)
      """
      # Parse modules
      modules = json.loads(modules_json)
 
-     # Create agent
+     # Create agent with Phase 2 & 3 configuration
      agent_id = f"test-gen-batch{batch_num}"
-     generator = AutonomousTestGenerator(agent_id, batch_num, modules)
+     generator = AutonomousTestGenerator(
+         agent_id,
+         batch_num,
+         modules,
+         enable_refinement=enable_refinement,
+         enable_coverage_guided=enable_coverage_guided
+     )
 
      # Generate tests
      print(f"Starting autonomous test generation for batch {batch_num}")
      print(f"Modules to process: {len(modules)}")
      print(f"Agent ID: {agent_id}")
-     print("Monitor at: http://localhost:8000\n")
+     print("\nENHANCEMENTS:")
+     print(" Phase 1: Extended thinking + Prompt caching + Workflow detection")
+     print(f" Phase 2: Multi-turn refinement = {'ENABLED' if enable_refinement else 'DISABLED'}")
+     print(f" Phase 3: Coverage-guided = {'ENABLED' if enable_coverage_guided else 'DISABLED'}")
+     print("\nMonitor at: http://localhost:8000\n")
 
      results = generator.generate_all()
 
@@ -559,11 +1251,18 @@ def run_batch_generation(batch_num: int, modules_json: str) -> None:
  if __name__ == "__main__":
      import sys
 
-     if len(sys.argv) != 3:
-         print("Usage: python -m empathy_os.workflows.autonomous_test_gen <batch_num> <modules_json>")
+     if len(sys.argv) < 3:
+         print("Usage: python -m empathy_os.workflows.autonomous_test_gen <batch_num> <modules_json> [--no-refinement] [--coverage-guided]")
+         print("\nOptions:")
+         print(" --no-refinement Disable Phase 2 multi-turn refinement")
+         print(" --coverage-guided Enable Phase 3 coverage-guided generation (slower)")
          sys.exit(1)
 
      batch_num = int(sys.argv[1])
      modules_json = sys.argv[2]
 
-     run_batch_generation(batch_num, modules_json)
+     # Parse optional flags
+     enable_refinement = "--no-refinement" not in sys.argv
+     enable_coverage_guided = "--coverage-guided" in sys.argv
+
+     run_batch_generation(batch_num, modules_json, enable_refinement, enable_coverage_guided)
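Finally, an equivalent programmatic invocation of the updated entry point (the module dict keys are illustrative, as above):

```python
# Same as: python -m empathy_os.workflows.autonomous_test_gen 3 '<json>' --coverage-guided
import json
from empathy_os.workflows.autonomous_test_gen import run_batch_generation

modules = [{"name": "config", "path": "src/empathy_os/config.py"}]  # illustrative
run_batch_generation(3, json.dumps(modules), enable_refinement=True, enable_coverage_guided=True)
```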