empathy-framework: 5.1.1-py3-none-any.whl → 5.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {empathy_framework-5.1.1.dist-info → empathy_framework-5.3.0.dist-info}/METADATA +79 -6
- {empathy_framework-5.1.1.dist-info → empathy_framework-5.3.0.dist-info}/RECORD +83 -64
- empathy_os/__init__.py +1 -1
- empathy_os/cache/hybrid.py +5 -1
- empathy_os/cli/commands/batch.py +8 -0
- empathy_os/cli/commands/profiling.py +4 -0
- empathy_os/cli/commands/workflow.py +8 -4
- empathy_os/cli_router.py +9 -0
- empathy_os/config.py +15 -2
- empathy_os/core_modules/__init__.py +15 -0
- empathy_os/dashboard/simple_server.py +62 -30
- empathy_os/mcp/__init__.py +10 -0
- empathy_os/mcp/server.py +506 -0
- empathy_os/memory/control_panel.py +1 -131
- empathy_os/memory/control_panel_support.py +145 -0
- empathy_os/memory/encryption.py +159 -0
- empathy_os/memory/long_term.py +46 -631
- empathy_os/memory/long_term_types.py +99 -0
- empathy_os/memory/mixins/__init__.py +25 -0
- empathy_os/memory/mixins/backend_init_mixin.py +249 -0
- empathy_os/memory/mixins/capabilities_mixin.py +208 -0
- empathy_os/memory/mixins/handoff_mixin.py +208 -0
- empathy_os/memory/mixins/lifecycle_mixin.py +49 -0
- empathy_os/memory/mixins/long_term_mixin.py +352 -0
- empathy_os/memory/mixins/promotion_mixin.py +109 -0
- empathy_os/memory/mixins/short_term_mixin.py +182 -0
- empathy_os/memory/short_term.py +61 -12
- empathy_os/memory/simple_storage.py +302 -0
- empathy_os/memory/storage_backend.py +167 -0
- empathy_os/memory/types.py +8 -3
- empathy_os/memory/unified.py +21 -1120
- empathy_os/meta_workflows/cli_commands/__init__.py +56 -0
- empathy_os/meta_workflows/cli_commands/agent_commands.py +321 -0
- empathy_os/meta_workflows/cli_commands/analytics_commands.py +442 -0
- empathy_os/meta_workflows/cli_commands/config_commands.py +232 -0
- empathy_os/meta_workflows/cli_commands/memory_commands.py +182 -0
- empathy_os/meta_workflows/cli_commands/template_commands.py +354 -0
- empathy_os/meta_workflows/cli_commands/workflow_commands.py +382 -0
- empathy_os/meta_workflows/cli_meta_workflows.py +52 -1802
- empathy_os/models/telemetry/__init__.py +71 -0
- empathy_os/models/telemetry/analytics.py +594 -0
- empathy_os/models/telemetry/backend.py +196 -0
- empathy_os/models/telemetry/data_models.py +431 -0
- empathy_os/models/telemetry/storage.py +489 -0
- empathy_os/orchestration/__init__.py +35 -0
- empathy_os/orchestration/execution_strategies.py +481 -0
- empathy_os/orchestration/meta_orchestrator.py +488 -1
- empathy_os/routing/workflow_registry.py +36 -0
- empathy_os/telemetry/agent_coordination.py +2 -3
- empathy_os/telemetry/agent_tracking.py +26 -7
- empathy_os/telemetry/approval_gates.py +18 -24
- empathy_os/telemetry/cli.py +19 -724
- empathy_os/telemetry/commands/__init__.py +14 -0
- empathy_os/telemetry/commands/dashboard_commands.py +696 -0
- empathy_os/telemetry/event_streaming.py +7 -3
- empathy_os/telemetry/feedback_loop.py +28 -15
- empathy_os/tools.py +183 -0
- empathy_os/workflows/__init__.py +5 -0
- empathy_os/workflows/autonomous_test_gen.py +860 -161
- empathy_os/workflows/base.py +6 -2
- empathy_os/workflows/code_review.py +4 -1
- empathy_os/workflows/document_gen/__init__.py +25 -0
- empathy_os/workflows/document_gen/config.py +30 -0
- empathy_os/workflows/document_gen/report_formatter.py +162 -0
- empathy_os/workflows/{document_gen.py → document_gen/workflow.py} +5 -184
- empathy_os/workflows/output.py +4 -1
- empathy_os/workflows/progress.py +8 -2
- empathy_os/workflows/security_audit.py +2 -2
- empathy_os/workflows/security_audit_phase3.py +7 -4
- empathy_os/workflows/seo_optimization.py +633 -0
- empathy_os/workflows/test_gen/__init__.py +52 -0
- empathy_os/workflows/test_gen/ast_analyzer.py +249 -0
- empathy_os/workflows/test_gen/config.py +88 -0
- empathy_os/workflows/test_gen/data_models.py +38 -0
- empathy_os/workflows/test_gen/report_formatter.py +289 -0
- empathy_os/workflows/test_gen/test_templates.py +381 -0
- empathy_os/workflows/test_gen/workflow.py +655 -0
- empathy_os/workflows/test_gen.py +42 -1905
- empathy_os/cli/parsers/cache 2.py +0 -65
- empathy_os/cli_router 2.py +0 -416
- empathy_os/dashboard/app 2.py +0 -512
- empathy_os/dashboard/simple_server 2.py +0 -403
- empathy_os/dashboard/standalone_server 2.py +0 -536
- empathy_os/memory/types 2.py +0 -441
- empathy_os/models/adaptive_routing 2.py +0 -437
- empathy_os/models/telemetry.py +0 -1660
- empathy_os/project_index/scanner_parallel 2.py +0 -291
- empathy_os/telemetry/agent_coordination 2.py +0 -478
- empathy_os/telemetry/agent_tracking 2.py +0 -350
- empathy_os/telemetry/approval_gates 2.py +0 -563
- empathy_os/telemetry/event_streaming 2.py +0 -405
- empathy_os/telemetry/feedback_loop 2.py +0 -557
- empathy_os/vscode_bridge 2.py +0 -173
- empathy_os/workflows/progressive/__init__ 2.py +0 -92
- empathy_os/workflows/progressive/cli 2.py +0 -242
- empathy_os/workflows/progressive/core 2.py +0 -488
- empathy_os/workflows/progressive/orchestrator 2.py +0 -701
- empathy_os/workflows/progressive/reports 2.py +0 -528
- empathy_os/workflows/progressive/telemetry 2.py +0 -280
- empathy_os/workflows/progressive/test_gen 2.py +0 -514
- empathy_os/workflows/progressive/workflow 2.py +0 -628
- {empathy_framework-5.1.1.dist-info → empathy_framework-5.3.0.dist-info}/WHEEL +0 -0
- {empathy_framework-5.1.1.dist-info → empathy_framework-5.3.0.dist-info}/entry_points.txt +0 -0
- {empathy_framework-5.1.1.dist-info → empathy_framework-5.3.0.dist-info}/licenses/LICENSE +0 -0
- {empathy_framework-5.1.1.dist-info → empathy_framework-5.3.0.dist-info}/licenses/LICENSE_CHANGE_ANNOUNCEMENT.md +0 -0
- {empathy_framework-5.1.1.dist-info → empathy_framework-5.3.0.dist-info}/top_level.txt +0 -0
empathy_os/workflows/autonomous_test_gen.py (+860 -161)

@@ -1,15 +1,34 @@
-"""Autonomous Test Generation with Dashboard Integration.
+"""Autonomous Test Generation with Dashboard Integration - Enhanced Edition.
 
 Generates behavioral tests with real-time monitoring via Agent Coordination Dashboard.
 
+ENHANCEMENTS (Phase 1):
+- Extended thinking mode for better test planning
+- Prompt caching for 90% cost reduction
+- Full source code (no truncation)
+- Workflow-specific prompts with mocking templates
+- Few-shot learning with examples
+
+ENHANCEMENTS (Phase 2 - Multi-Turn Refinement):
+- Iterative test generation with validation loop
+- Automatic failure detection and fixing
+- Conversation history for context preservation
+
+ENHANCEMENTS (Phase 3 - Coverage-Guided Generation):
+- Coverage analysis integration
+- Iterative coverage improvement targeting uncovered lines
+- Systematic path to 80%+ coverage
+
 Copyright 2026 Smart-AI-Memory
 Licensed under Apache 2.0
 """
 
 import json
 import logging
+import re
 import subprocess
 import sys
+from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
 
@@ -21,21 +40,58 @@ from empathy_os.telemetry.feedback_loop import FeedbackLoop
 logger = logging.getLogger(__name__)
 
 
-
-
+@dataclass
+class ValidationResult:
+    """Result of pytest validation."""
+    passed: bool
+    failures: str
+    error_count: int
+    output: str
+
+
+@dataclass
+class CoverageResult:
+    """Result of coverage analysis."""
+    coverage: float
+    missing_lines: list[int]
+    total_statements: int
+    covered_statements: int
 
-
+
+class AutonomousTestGenerator:
+    """Generate tests autonomously with dashboard monitoring and Anthropic best practices."""
+
+    def __init__(
+        self,
+        agent_id: str,
+        batch_num: int,
+        modules: list[dict[str, Any]],
+        enable_refinement: bool = True,
+        max_refinement_iterations: int = 3,
+        enable_coverage_guided: bool = False,
+        target_coverage: float = 0.80
+    ):
         """Initialize generator.
 
         Args:
             agent_id: Unique agent identifier
             batch_num: Batch number (1-18)
             modules: List of modules to generate tests for
+            enable_refinement: Enable Phase 2 multi-turn refinement (default: True)
+            max_refinement_iterations: Max iterations for refinement (default: 3)
+            enable_coverage_guided: Enable Phase 3 coverage-guided generation (default: False)
+            target_coverage: Target coverage percentage (default: 0.80 = 80%)
         """
         self.agent_id = agent_id
         self.batch_num = batch_num
         self.modules = modules
 
+        # Phase 2 & 3 configuration
+        self.enable_refinement = enable_refinement
+        self.max_refinement_iterations = max_refinement_iterations
+        self.enable_coverage_guided = enable_coverage_guided
+        self.target_coverage = target_coverage
+
         # Initialize memory backend for dashboard integration
         try:
             self.memory = RedisShortTermMemory()
@@ -51,6 +107,8 @@ class AutonomousTestGenerator:
         self.output_dir = Path(f"tests/behavioral/generated/batch{batch_num}")
         self.output_dir.mkdir(parents=True, exist_ok=True)
 
+        logger.info(f"Generator initialized: refinement={enable_refinement}, coverage_guided={enable_coverage_guided}")
+
     def generate_all(self) -> dict[str, Any]:
         """Generate tests for all modules with progress tracking.
 
@@ -204,8 +262,17 @@
         # Extract module path for imports
         module_path = str(source_file).replace("src/", "").replace(".py", "").replace("/", ".")
 
-        # Generate tests using LLM agent
-
+        # Generate tests using LLM agent with Anthropic best practices
+        # Phase 1: Basic generation
+        # Phase 2: Multi-turn refinement (if enabled)
+        # Phase 3: Coverage-guided improvement (if enabled)
+
+        if self.enable_refinement:
+            logger.info(f"🔄 Using Phase 2: Multi-turn refinement for {module_name}")
+            test_content = self._generate_with_refinement(module_name, module_path, source_file, source_code, test_file)
+        else:
+            logger.info(f"📝 Using Phase 1: Basic generation for {module_name}")
+            test_content = self._generate_with_llm(module_name, module_path, source_file, source_code)
 
         if not test_content:
             logger.warning(f"LLM generation failed for {module_name}")
@@ -213,7 +280,19 @@
 
         logger.info(f"LLM generated {len(test_content)} bytes for {module_name}")
 
-        #
+        # Phase 3: Coverage-guided improvement (if enabled)
+        if self.enable_coverage_guided:
+            logger.info(f"📊 Applying Phase 3: Coverage-guided improvement for {module_name}")
+            improved_content = self._generate_with_coverage_target(
+                module_name, module_path, source_file, source_code, test_file, test_content
+            )
+            if improved_content:
+                test_content = improved_content
+                logger.info(f"✅ Coverage-guided improvement complete for {module_name}")
+            else:
+                logger.warning(f"⚠️ Coverage-guided improvement failed, using previous version for {module_name}")
+
+        # Write final test file
         test_file.write_text(test_content)
         logger.info(f"Wrote test file: {test_file}")
 
@@ -224,14 +303,170 @@
 
         return test_file
 
+    def _is_workflow_module(self, source_code: str, module_path: str) -> bool:
+        """Detect if module is a workflow requiring special handling.
+
+        Args:
+            source_code: Source code content
+            module_path: Python import path
+
+        Returns:
+            True if this is a workflow module needing LLM mocking
+        """
+        # Check for workflow indicators
+        indicators = [
+            r"class\s+\w+Workflow",
+            r"async\s+def\s+execute",
+            r"tier_routing",
+            r"LLMProvider",
+            r"TelemetryCollector",
+            r"from\s+anthropic\s+import",
+            r"messages\.create",
+            r"client\.messages"
+        ]
+
+        return any(re.search(pattern, source_code) for pattern in indicators)
+
+    def _get_example_tests(self) -> str:
+        """Get few-shot examples of excellent tests for prompt learning."""
+        return """EXAMPLE 1: Testing a utility function with mocking
+```python
+import pytest
+from unittest.mock import Mock, patch
+from mymodule import process_data
+
+class TestProcessData:
+    def test_processes_valid_data_successfully(self):
+        \"\"\"Given valid input data, when processing, then returns expected result.\"\"\"
+        # Given
+        input_data = {"key": "value", "count": 42}
+
+        # When
+        result = process_data(input_data)
+
+        # Then
+        assert result is not None
+        assert result["status"] == "success"
+        assert result["processed"] is True
+
+    def test_handles_invalid_data_with_error(self):
+        \"\"\"Given invalid input, when processing, then raises ValueError.\"\"\"
+        # Given
+        invalid_data = {"missing": "key"}
+
+        # When/Then
+        with pytest.raises(ValueError, match="Required key 'key' not found"):
+            process_data(invalid_data)
+```
+
+EXAMPLE 2: Testing a workflow with LLM mocking
+```python
+import pytest
+from unittest.mock import Mock, AsyncMock, patch
+from mymodule import MyWorkflow
+
+@pytest.fixture
+def mock_llm_client(mocker):
+    \"\"\"Mock Anthropic LLM client.\"\"\"
+    mock = mocker.patch('anthropic.Anthropic')
+    mock_response = Mock()
+    mock_response.content = [Mock(text="mock LLM response")]
+    mock_response.usage = Mock(input_tokens=100, output_tokens=50)
+    mock_response.stop_reason = "end_turn"
+    mock.return_value.messages.create = AsyncMock(return_value=mock_response)
+    return mock
+
+class TestMyWorkflow:
+    @pytest.mark.asyncio
+    async def test_executes_successfully_with_mocked_llm(self, mock_llm_client):
+        \"\"\"Given valid input, when executing workflow, then completes successfully.\"\"\"
+        # Given
+        workflow = MyWorkflow()
+        input_data = {"prompt": "test prompt"}
+
+        # When
+        result = await workflow.execute(input_data)
+
+        # Then
+        assert result is not None
+        assert "response" in result
+        mock_llm_client.return_value.messages.create.assert_called_once()
+
+    @pytest.mark.asyncio
+    async def test_handles_api_error_gracefully(self, mock_llm_client):
+        \"\"\"Given API failure, when executing, then handles error appropriately.\"\"\"
+        # Given
+        workflow = MyWorkflow()
+        mock_llm_client.return_value.messages.create.side_effect = Exception("API Error")
+
+        # When/Then
+        with pytest.raises(Exception, match="API Error"):
+            await workflow.execute({"prompt": "test"})
+```
+"""
+
+    def _get_workflow_specific_prompt(self, module_name: str, module_path: str, source_code: str) -> str:
+        """Get workflow-specific test generation prompt with comprehensive mocking guidance."""
+        return f"""Generate comprehensive tests for this WORKFLOW module.
+
+⚠️ CRITICAL: This module makes LLM API calls and requires proper mocking.
+
+MODULE: {module_name}
+IMPORT PATH: {module_path}
+
+SOURCE CODE (COMPLETE - NO TRUNCATION):
+```python
+{source_code}
+```
+
+WORKFLOW TESTING REQUIREMENTS:
+
+1. **Mock LLM API calls** - NEVER make real API calls in tests
+   ```python
+   @pytest.fixture
+   def mock_llm_client(mocker):
+       mock = mocker.patch('anthropic.Anthropic')
+       mock_response = Mock()
+       mock_response.content = [Mock(text="mock response")]
+       mock_response.usage = Mock(input_tokens=100, output_tokens=50)
+       mock_response.stop_reason = "end_turn"
+       mock.return_value.messages.create = AsyncMock(return_value=mock_response)
+       return mock
+   ```
+
+2. **Test tier routing** - Verify correct model selection (cheap/capable/premium)
+3. **Test telemetry** - Mock and verify telemetry recording
+4. **Test cost calculation** - Verify token usage and cost tracking
+5. **Test error handling** - Mock API failures, timeouts, rate limits
+6. **Test caching** - Mock cache hits/misses if applicable
+
+TARGET COVERAGE: 40-50% (realistic for workflow classes with proper mocking)
+
+Generate a complete test file with:
+- Copyright header: "Generated by enhanced autonomous test generation system."
+- Proper imports (from {module_path})
+- Mock fixtures for ALL external dependencies (LLM, databases, APIs, file I/O)
+- Given/When/Then structure in docstrings
+- Both success and failure test cases
+- Edge case handling
+- Docstrings for all tests describing behavior
+
+Return ONLY the complete Python test file, no explanations."""
+
     def _generate_with_llm(self, module_name: str, module_path: str, source_file: Path, source_code: str) -> str | None:
-        """Generate comprehensive tests using LLM.
+        """Generate comprehensive tests using LLM with Anthropic best practices.
+
+        ENHANCEMENTS (Phase 1):
+        - Extended thinking (20K token budget) for thorough test planning
+        - Prompt caching for 90% cost reduction
+        - Full source code (NO TRUNCATION)
+        - Workflow-specific prompts when detected
 
         Args:
             module_name: Name of module being tested
             module_path: Python import path (e.g., empathy_os.config)
             source_file: Path to source file
-            source_code: Source code content
+            source_code: Source code content (FULL, not truncated)
 
         Returns:
             Test file content with comprehensive tests, or None if generation failed
@@ -250,15 +485,22 @@
             logger.error("ANTHROPIC_API_KEY not set")
             return None
 
-        #
-
+        # Detect if this is a workflow module
+        is_workflow = self._is_workflow_module(source_code, module_path)
+        logger.info(f"Module {module_name}: workflow={is_workflow}, size={len(source_code)} bytes (FULL)")
+
+        # Build appropriate prompt based on module type
+        if is_workflow:
+            generation_prompt = self._get_workflow_specific_prompt(module_name, module_path, source_code)
+        else:
+            generation_prompt = f"""Generate comprehensive behavioral tests for this Python module.
 
 SOURCE FILE: {source_file}
 MODULE PATH: {module_path}
 
-SOURCE CODE:
+SOURCE CODE (COMPLETE):
 ```python
-{source_code
+{source_code}
 ```
 
 Generate a complete test file that:
@@ -287,21 +529,59 @@ Licensed under Apache 2.0
 
 Return ONLY the complete Python test file content, no explanations."""
 
+        # Build messages with prompt caching (90% cost reduction on retries)
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "You are an expert Python test engineer. Here are examples of excellent tests:",
+                        "cache_control": {"type": "ephemeral"}
+                    },
+                    {
+                        "type": "text",
+                        "text": self._get_example_tests(),
+                        "cache_control": {"type": "ephemeral"}
+                    },
+                    {
+                        "type": "text",
+                        "text": generation_prompt
+                    }
+                ]
+            }
+        ]
+
         try:
-            # Call Anthropic API with
-            logger.info(f"Calling LLM for {module_name} (
+            # Call Anthropic API with extended thinking and caching
+            logger.info(f"Calling LLM with extended thinking for {module_name} (workflow={is_workflow})")
             client = anthropic.Anthropic(api_key=api_key)
             response = client.messages.create(
                 model="claude-sonnet-4-5",  # capable tier
-                max_tokens=
-
+                max_tokens=40000,  # Very generous total budget for comprehensive tests
+                thinking={
+                    "type": "enabled",
+                    "budget_tokens": 20000  # Generous thinking budget for thorough planning
+                },
+                messages=messages,
+                timeout=900.0,  # 15 minutes timeout for extended thinking + generation
             )
 
             if not response.content:
                 logger.warning(f"Empty LLM response for {module_name}")
                 return None
 
-
+            # Extract test content (thinking comes first, then text)
+            test_content = None
+            for block in response.content:
+                if block.type == "text":
+                    test_content = block.text.strip()
+                    break
+
+            if not test_content:
+                logger.warning(f"No text content in LLM response for {module_name}")
+                return None
+
             logger.info(f"LLM returned {len(test_content)} bytes for {module_name}")
 
             if len(test_content) < 100:
@@ -314,6 +594,20 @@ Return ONLY the complete Python test file content, no explanations."""
             if test_content.endswith("```"):
                 test_content = test_content[:-3].strip()
 
+            # Check for truncation indicators
+            if response.stop_reason == "max_tokens":
+                logger.warning(f"⚠️ LLM response truncated for {module_name} (hit max_tokens)")
+                # Response might be incomplete but let validation catch it
+
+            # Quick syntax pre-check before returning
+            try:
+                import ast
+                ast.parse(test_content)
+                logger.info(f"✓ Quick syntax check passed for {module_name}")
+            except SyntaxError as e:
+                logger.error(f"❌ LLM generated invalid syntax for {module_name}: {e.msg} at line {e.lineno}")
+                return None
+
             logger.info(f"Test content cleaned, final size: {len(test_content)} bytes")
             return test_content
 
@@ -321,161 +615,524 @@ Return ONLY the complete Python test file content, no explanations."""
             logger.error(f"LLM generation error for {module_name}: {e}", exc_info=True)
             return None
 
-    def 
-        """
+    def _run_pytest_validation(self, test_file: Path) -> ValidationResult:
+        """Run pytest on generated tests and collect failures.
+
+        Args:
+            test_file: Path to test file to validate
+
+        Returns:
+            ValidationResult with test outcomes and failure details
+        """
+        try:
+            result = subprocess.run(
+                [sys.executable, "-m", "pytest", str(test_file), "-v", "--tb=short"],
+                capture_output=True,
+                text=True,
+                timeout=60,
+            )
+
+            passed = result.returncode == 0
+            output = result.stdout + "\n" + result.stderr
+
+            # Count errors
+            error_count = output.count("FAILED") + output.count("ERROR")
+
+            # Extract failure details
+            failures = ""
+            if not passed:
+                # Extract relevant failure information
+                lines = output.split("\n")
+                failure_lines = []
+                in_failure = False
+                for line in lines:
+                    if "FAILED" in line or "ERROR" in line:
+                        in_failure = True
+                    if in_failure:
+                        failure_lines.append(line)
+                        if line.startswith("="):  # End of failure section
+                            in_failure = False
+                failures = "\n".join(failure_lines[:100])  # Limit to 100 lines
+
+            logger.info(f"Pytest validation: passed={passed}, errors={error_count}")
+
+            return ValidationResult(
+                passed=passed,
+                failures=failures,
+                error_count=error_count,
+                output=output
+            )
+
+        except subprocess.TimeoutExpired:
+            logger.error(f"Pytest validation timeout for {test_file}")
+            return ValidationResult(
+                passed=False,
+                failures="Validation timeout after 60 seconds",
+                error_count=1,
+                output="Timeout"
+            )
+        except Exception as e:
+            logger.error(f"Pytest validation exception: {e}")
+            return ValidationResult(
+                passed=False,
+                failures=f"Validation exception: {e}",
+                error_count=1,
+                output=str(e)
+            )
+
+    def _call_llm_with_history(
+        self,
+        conversation_history: list[dict[str, Any]],
+        api_key: str
+    ) -> str | None:
+        """Call LLM with conversation history for refinement.
+
+        Args:
+            conversation_history: List of messages (role + content)
+            api_key: Anthropic API key
+
+        Returns:
+            Refined test content or None if failed
+        """
+        try:
+            import anthropic
+
+            client = anthropic.Anthropic(api_key=api_key)
+            response = client.messages.create(
+                model="claude-sonnet-4-5",
+                max_tokens=40000,  # Very generous total budget for iterative refinement
+                thinking={
+                    "type": "enabled",
+                    "budget_tokens": 20000  # Generous thinking budget for thorough analysis
+                },
+                messages=conversation_history,
+                timeout=900.0,  # 15 minutes timeout for refinement iterations
+            )
+
+            if not response.content:
+                logger.warning("Empty LLM response during refinement")
+                return None
+
+            # Extract text content
+            test_content = None
+            for block in response.content:
+                if block.type == "text":
+                    test_content = block.text.strip()
+                    break
+
+            if not test_content:
+                logger.warning("No text content in refinement response")
+                return None
+
+            # Clean up response
+            if test_content.startswith("```python"):
+                test_content = test_content[len("```python"):].strip()
+            if test_content.endswith("```"):
+                test_content = test_content[:-3].strip()
+
+            return test_content
+
+        except Exception as e:
+            logger.error(f"LLM refinement error: {e}", exc_info=True)
+            return None
+
+    def _generate_with_refinement(
+        self,
+        module_name: str,
+        module_path: str,
+        source_file: Path,
+        source_code: str,
+        test_file: Path
+    ) -> str | None:
+        """Generate tests with iterative refinement (Phase 2).
+
+        Process:
+        1. Generate initial tests
+        2. Run pytest validation
+        3. If failures, ask Claude to fix
+        4. Repeat until tests pass or max iterations
 
         Args:
             module_name: Name of module being tested
+            module_path: Python import path
             source_file: Path to source file
             source_code: Source code content
+            test_file: Path where tests will be written
 
         Returns:
-
+            Final test content or None if all attempts failed
         """
-        import 
+        import os
 
-
-
+        api_key = os.getenv("ANTHROPIC_API_KEY")
+        if not api_key:
+            logger.error("ANTHROPIC_API_KEY not set")
+            return None
 
-
-        try:
-            tree = ast.parse(source_code)
-            functions = [node.name for node in ast.walk(tree) if isinstance(node, ast.FunctionDef) and not node.name.startswith('_')]
-            classes = [node.name for node in ast.walk(tree) if isinstance(node, ast.ClassDef)]
-        except:
-            functions = []
-            classes = []
-
-        # Generate test classes for each class found
-        test_classes = []
-        for cls_name in classes[:5]:  # Limit to 5 classes
-            test_classes.append(f'''
-class Test{cls_name}:
-    """Behavioral tests for {cls_name} class."""
-
-    def test_{cls_name.lower()}_instantiation(self):
-        """Test {cls_name} can be instantiated."""
-        # Given: Class is available
-        # When: Creating instance
-        try:
-            from {module_path} import {cls_name}
-            # Then: Instance created successfully
-            assert {cls_name} is not None
-        except ImportError:
-            pytest.skip("Class not available")
+        logger.info(f"🔄 Phase 2: Multi-turn refinement enabled for {module_name} (max {self.max_refinement_iterations} iterations)")
 
-
-
-
-
-
-            # When: Checking methods
-            # Then: Common methods should exist
-            assert hasattr({cls_name}, '__init__')
-        except ImportError:
-            pytest.skip("Class not available")
-''')
-
-        # Generate tests for functions
-        function_tests = []
-        for func_name in functions[:10]:  # Limit to 10 functions
-            function_tests.append(f'''
-    def test_{func_name}_callable(self):
-        """Test {func_name} function is callable."""
-        # Given: Function is available
-        try:
-            from {module_path} import {func_name}
-            # When: Checking if callable
-            # Then: Function should be callable
-            assert callable({func_name})
-        except ImportError:
-            pytest.skip("Function not available")
+        # Step 1: Generate initial tests
+        test_content = self._generate_with_llm(module_name, module_path, source_file, source_code)
+        if not test_content:
+            logger.warning("Initial generation failed")
+            return None
 
-
-
-        # Given: Function is available
-        try:
-            from {module_path} import {func_name}
-            # When: Called with mocked dependencies
-            with patch.object({module_path}, '{func_name}', return_value=Mock()) as mock_func:
-                result = mock_func()
-                # Then: Should return successfully
-                assert result is not None
-        except (ImportError, AttributeError):
-            pytest.skip("Function not available or cannot be mocked")
-''')
-
-        # Combine all test content
-        test_content = f'''"""Behavioral tests for {module_name}.
+        # Build conversation history for subsequent refinements
+        is_workflow = self._is_workflow_module(source_code, module_path)
 
-
+        # Initial prompt (for history tracking)
+        if is_workflow:
+            initial_prompt = self._get_workflow_specific_prompt(module_name, module_path, source_code)
+        else:
+            initial_prompt = f"""Generate comprehensive behavioral tests for {module_name}.
 
-
-
-
+SOURCE CODE:
+```python
+{source_code}
+```"""
+
+        conversation_history = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "You are an expert Python test engineer. Examples:", "cache_control": {"type": "ephemeral"}},
+                    {"type": "text", "text": self._get_example_tests(), "cache_control": {"type": "ephemeral"}},
+                    {"type": "text", "text": initial_prompt}
+                ]
+            },
+            {
+                "role": "assistant",
+                "content": test_content
+            }
+        ]
 
-
-
-
+        # Step 2: Iterative refinement loop
+        for iteration in range(self.max_refinement_iterations):
+            logger.info(f"📝 Refinement iteration {iteration + 1}/{self.max_refinement_iterations} for {module_name}")
+
+            # Write current version to temp file
+            temp_test_file = test_file.parent / f"_temp_{test_file.name}"
+            temp_test_file.write_text(test_content)
+
+            # Validate with pytest
+            validation_result = self._run_pytest_validation(temp_test_file)
+
+            if validation_result.passed:
+                logger.info(f"✅ Tests passed on iteration {iteration + 1} for {module_name}")
+                temp_test_file.unlink()  # Clean up
+                return test_content
+
+            # Tests failed - ask Claude to fix
+            logger.warning(f"⚠️ Tests failed on iteration {iteration + 1}: {validation_result.error_count} errors")
+
+            refinement_prompt = f"""The tests you generated have failures. Please fix these specific issues:
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+FAILURES:
+{validation_result.failures[:2000]}
+
+Requirements:
+1. Fix ONLY the failing tests - don't rewrite everything
+2. Ensure imports are correct
+3. Ensure mocking is properly configured
+4. Return the COMPLETE corrected test file (not just the fixes)
+5. Keep the same structure and copyright header
+
+Return ONLY the complete Python test file, no explanations."""
+
+            # Add to conversation history
+            conversation_history.append({
+                "role": "user",
+                "content": refinement_prompt
+            })
+
+            # Call LLM for refinement
+            refined_content = self._call_llm_with_history(conversation_history, api_key)
+
+            if not refined_content:
+                logger.error(f"❌ Refinement failed on iteration {iteration + 1}")
+                temp_test_file.unlink()
+                break
+
+            # Update content and history
+            test_content = refined_content
+            conversation_history.append({
+                "role": "assistant",
+                "content": test_content
+            })
+
+            logger.info(f"🔄 Refinement iteration {iteration + 1} complete, retrying validation...")
+
+        # Max iterations reached
+        logger.warning(f"⚠️ Max refinement iterations reached for {module_name} - returning best attempt")
+        return test_content
+
+    def _run_coverage_analysis(self, test_file: Path, source_file: Path) -> CoverageResult:
+        """Run coverage analysis on tests.
+
+        Args:
+            test_file: Path to test file
+            source_file: Path to source file being tested
+
+        Returns:
+            CoverageResult with coverage metrics and missing lines
+        """
         try:
-
-
+            # Run pytest with coverage
+            result = subprocess.run(
+                [
+                    sys.executable, "-m", "pytest",
+                    str(test_file),
+                    f"--cov={source_file.parent}",
+                    "--cov-report=term-missing",
+                    "--cov-report=json",
+                    "-v"
+                ],
+                capture_output=True,
+                text=True,
+                timeout=120,
+                cwd=Path.cwd()
+            )
+
+            # Parse coverage from JSON report
+            coverage_json_path = Path(".coverage.json")
+            if not coverage_json_path.exists():
+                logger.warning("Coverage JSON not generated")
+                return CoverageResult(
+                    coverage=0.0,
+                    missing_lines=[],
+                    total_statements=0,
+                    covered_statements=0
+                )
+
+            with open(coverage_json_path) as f:
+                coverage_data = json.load(f)
+
+            # Find coverage for our specific source file
+            source_key = str(source_file)
+            file_coverage = None
+            for key in coverage_data.get("files", {}).keys():
+                if source_file.name in key or source_key in key:
+                    file_coverage = coverage_data["files"][key]
+                    break
+
+            if not file_coverage:
+                logger.warning(f"No coverage data found for {source_file}")
+                return CoverageResult(
+                    coverage=0.0,
+                    missing_lines=[],
+                    total_statements=0,
+                    covered_statements=0
+                )
+
+            # Extract metrics
+            total_statements = file_coverage["summary"]["num_statements"]
+            covered_statements = file_coverage["summary"]["covered_lines"]
+            coverage_pct = file_coverage["summary"]["percent_covered"] / 100.0
+            missing_lines = file_coverage["missing_lines"]
+
+            logger.info(f"Coverage: {coverage_pct:.1%} ({covered_statements}/{total_statements} statements)")
+
+            return CoverageResult(
+                coverage=coverage_pct,
+                missing_lines=missing_lines,
+                total_statements=total_statements,
+                covered_statements=covered_statements
+            )
+
+        except subprocess.TimeoutExpired:
+            logger.error("Coverage analysis timeout")
+            return CoverageResult(coverage=0.0, missing_lines=[], total_statements=0, covered_statements=0)
         except Exception as e:
-
+            logger.error(f"Coverage analysis error: {e}", exc_info=True)
+            return CoverageResult(coverage=0.0, missing_lines=[], total_statements=0, covered_statements=0)
+
+    def _extract_uncovered_lines(self, source_file: Path, missing_lines: list[int]) -> str:
+        """Extract source code for uncovered lines.
+
+        Args:
+            source_file: Path to source file
+            missing_lines: List of uncovered line numbers
+
+        Returns:
+            Formatted string with uncovered code sections
+        """
+        if not missing_lines:
+            return "No uncovered lines"
 
-    def test_module_constants_are_defined(self):
-        """Test that common constants are properly defined."""
-        # Given: Module is imported
-        # When: Checking for logger or similar
-        # Then: Should have standard attributes
         try:
-
-
-
-
-
-
+            source_lines = source_file.read_text().split("\n")
+
+            # Group consecutive lines into ranges
+            ranges = []
+            start = missing_lines[0]
+            end = start
+
+            for line_num in missing_lines[1:]:
+                if line_num == end + 1:
+                    end = line_num
+                else:
+                    ranges.append((start, end))
+                    start = line_num
+                    end = start
+            ranges.append((start, end))
+
+            # Extract code for each range with context
+            uncovered_sections = []
+            for start, end in ranges[:10]:  # Limit to 10 ranges
+                context_start = max(0, start - 3)
+                context_end = min(len(source_lines), end + 2)
+
+                section = []
+                section.append(f"Lines {start}-{end}:")
+                for i in range(context_start, context_end):
+                    line_marker = ">>>" if start <= i + 1 <= end else " "
+                    section.append(f"{line_marker} {i + 1:4d}: {source_lines[i]}")
+
+                uncovered_sections.append("\n".join(section))
+
+            return "\n\n".join(uncovered_sections)
+
+        except Exception as e:
+            logger.error(f"Error extracting uncovered lines: {e}")
+            return f"Error extracting lines: {e}"
+
+    def _generate_with_coverage_target(
+        self,
+        module_name: str,
+        module_path: str,
+        source_file: Path,
+        source_code: str,
+        test_file: Path,
+        initial_test_content: str
+    ) -> str | None:
+        """Generate tests iteratively until coverage target met (Phase 3).
+
+        Process:
+        1. Start with initial tests
+        2. Run coverage analysis
+        3. If target not met, identify uncovered lines
+        4. Ask Claude to add tests for uncovered code
+        5. Repeat until target coverage reached or max iterations
+
+        Args:
+            module_name: Name of module being tested
+            module_path: Python import path
+            source_file: Path to source file
+            source_code: Source code content
+            test_file: Path to test file
+            initial_test_content: Initial test content from Phase 1/2
+
+        Returns:
+            Final test content with improved coverage or None if failed
+        """
+        import os
+
+        api_key = os.getenv("ANTHROPIC_API_KEY")
+        if not api_key:
+            logger.error("ANTHROPIC_API_KEY not set")
+            return None
+
+        logger.info(f"📊 Phase 3: Coverage-guided generation enabled (target: {self.target_coverage:.0%})")
+
+        test_content = initial_test_content
+        max_coverage_iterations = 5
+
+        for iteration in range(max_coverage_iterations):
+            logger.info(f"📈 Coverage iteration {iteration + 1}/{max_coverage_iterations} for {module_name}")
+
+            # Write current tests
+            test_file.write_text(test_content)
+
+            # Run coverage analysis
+            coverage_result = self._run_coverage_analysis(test_file, source_file)
+
+            logger.info(f"Current coverage: {coverage_result.coverage:.1%}, target: {self.target_coverage:.0%}")
+
+            # Check if target reached
+            if coverage_result.coverage >= self.target_coverage:
+                logger.info(f"✅ Coverage target reached: {coverage_result.coverage:.1%}")
+                return test_content
+
+            # Not enough progress
+            if iteration > 0 and coverage_result.coverage <= 0.05:
+                logger.warning("⚠️ Coverage not improving, stopping")
+                break
+
+            # Identify uncovered code
+            uncovered_code = self._extract_uncovered_lines(source_file, coverage_result.missing_lines)
+
+            # Ask Claude to add tests for uncovered lines
+            refinement_prompt = f"""Current coverage: {coverage_result.coverage:.1%}
+Target coverage: {self.target_coverage:.0%}
+Missing: {len(coverage_result.missing_lines)} lines
+
+UNCOVERED CODE:
+{uncovered_code[:3000]}
+
+Please ADD tests to cover these specific uncovered lines. Requirements:
+1. Focus ONLY on the uncovered lines shown above
+2. Add new test methods to the existing test classes
+3. Return the COMPLETE test file with additions (not just new tests)
+4. Use appropriate mocking for external dependencies
+5. Keep existing tests intact - just add new ones
+
+Return ONLY the complete Python test file with additions, no explanations."""
+
+            # Build conversation with caching
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "You are an expert Python test engineer. Examples:", "cache_control": {"type": "ephemeral"}},
+                        {"type": "text", "text": self._get_example_tests(), "cache_control": {"type": "ephemeral"}},
+                        {"type": "text", "text": f"Source code:\n```python\n{source_code}\n```", "cache_control": {"type": "ephemeral"}},
+                        {"type": "text", "text": f"Current tests:\n```python\n{test_content}\n```"},
+                        {"type": "text", "text": refinement_prompt}
+                    ]
+                }
+            ]
+
+            # Call LLM for coverage improvement
+            try:
+                import anthropic
+                client = anthropic.Anthropic(api_key=api_key)
+                response = client.messages.create(
+                    model="claude-sonnet-4-5",
+                    max_tokens=40000,  # Very generous total budget for coverage improvement
+                    thinking={"type": "enabled", "budget_tokens": 20000},  # Thorough thinking for coverage gaps
+                    messages=messages,
+                    timeout=900.0,  # 15 minutes timeout for coverage-guided iterations
+                )
+
+                refined_content = None
+                for block in response.content:
+                    if block.type == "text":
+                        refined_content = block.text.strip()
+                        break
+
+                if not refined_content:
+                    logger.warning(f"No content in coverage refinement iteration {iteration + 1}")
+                    break
 
+                # Clean up
+                if refined_content.startswith("```python"):
+                    refined_content = refined_content[len("```python"):].strip()
+                if refined_content.endswith("```"):
+                    refined_content = refined_content[:-3].strip()
+
+                test_content = refined_content
+                logger.info(f"🔄 Coverage iteration {iteration + 1} complete, retrying analysis...")
+
+            except Exception as e:
+                logger.error(f"Coverage refinement error on iteration {iteration + 1}: {e}")
+                break
+
+        # Return best attempt
+        logger.info(f"Coverage-guided generation complete: final coverage ~{coverage_result.coverage:.1%}")
         return test_content
 
     def _validate_test_file(self, test_file: Path) -> bool:
-        """Validate test file can be imported.
+        """Validate test file can be imported and has valid syntax.
 
         Args:
             test_file: Path to test file
@@ -483,6 +1140,20 @@ class TestEdgeCases:
         Returns:
             True if valid, False otherwise
         """
+        # Step 1: Check for syntax errors with ast.parse (fast)
+        try:
+            import ast
+            content = test_file.read_text()
+            ast.parse(content)
+            logger.info(f"✓ Syntax check passed for {test_file.name}")
+        except SyntaxError as e:
+            logger.error(f"❌ Syntax error in {test_file.name} at line {e.lineno}: {e.msg}")
+            return False
+        except Exception as e:
+            logger.error(f"❌ Cannot parse {test_file.name}: {e}")
+            return False
+
+        # Step 2: Check if pytest can collect the tests
         try:
             result = subprocess.run(
                 [sys.executable, "-m", "pytest", "--collect-only", str(test_file)],
@@ -492,14 +1163,18 @@ class TestEdgeCases:
             )
 
             if result.returncode != 0:
-                logger.
-
-
-                return True  # Changed from False - be permissive
+                logger.error(f"❌ Pytest collection failed for {test_file.name}")
+                logger.error(f" Error: {result.stderr[:500]}")
+                return False
 
+            logger.info(f"✓ Pytest collection passed for {test_file.name}")
             return True
+
+        except subprocess.TimeoutExpired:
+            logger.error(f"❌ Validation timeout for {test_file.name}")
+            return False
         except Exception as e:
-            logger.error(f"Validation exception for {test_file}: {e}")
+            logger.error(f"❌ Validation exception for {test_file}: {e}")
             return False
 
     def _count_tests(self) -> int:
@@ -524,25 +1199,42 @@ class TestEdgeCases:
             return 0
 
 
-def run_batch_generation(
+def run_batch_generation(
+    batch_num: int,
+    modules_json: str,
+    enable_refinement: bool = True,
+    enable_coverage_guided: bool = False
+) -> None:
     """Run test generation for a batch.
 
     Args:
         batch_num: Batch number
         modules_json: JSON string of modules to process
+        enable_refinement: Enable Phase 2 multi-turn refinement (default: True)
+        enable_coverage_guided: Enable Phase 3 coverage-guided generation (default: False)
     """
     # Parse modules
     modules = json.loads(modules_json)
 
-    # Create agent
+    # Create agent with Phase 2 & 3 configuration
     agent_id = f"test-gen-batch{batch_num}"
-    generator = AutonomousTestGenerator(
+    generator = AutonomousTestGenerator(
+        agent_id,
+        batch_num,
+        modules,
+        enable_refinement=enable_refinement,
+        enable_coverage_guided=enable_coverage_guided
+    )
 
     # Generate tests
     print(f"Starting autonomous test generation for batch {batch_num}")
    print(f"Modules to process: {len(modules)}")
    print(f"Agent ID: {agent_id}")
-    print("
+    print("\nENHANCEMENTS:")
+    print(" Phase 1: Extended thinking + Prompt caching + Workflow detection")
+    print(f" Phase 2: Multi-turn refinement = {'ENABLED' if enable_refinement else 'DISABLED'}")
+    print(f" Phase 3: Coverage-guided = {'ENABLED' if enable_coverage_guided else 'DISABLED'}")
+    print("\nMonitor at: http://localhost:8000\n")
 
    results = generator.generate_all()
 
@@ -559,11 +1251,18 @@ def run_batch_generation(batch_num: int, modules_json: str) -> None:
 if __name__ == "__main__":
     import sys
 
-    if len(sys.argv)
-        print("Usage: python -m empathy_os.workflows.autonomous_test_gen <batch_num> <modules_json>")
+    if len(sys.argv) < 3:
+        print("Usage: python -m empathy_os.workflows.autonomous_test_gen <batch_num> <modules_json> [--no-refinement] [--coverage-guided]")
+        print("\nOptions:")
+        print(" --no-refinement Disable Phase 2 multi-turn refinement")
+        print(" --coverage-guided Enable Phase 3 coverage-guided generation (slower)")
         sys.exit(1)
 
     batch_num = int(sys.argv[1])
     modules_json = sys.argv[2]
 
-
+    # Parse optional flags
+    enable_refinement = "--no-refinement" not in sys.argv
+    enable_coverage_guided = "--coverage-guided" in sys.argv
+
+    run_batch_generation(batch_num, modules_json, enable_refinement, enable_coverage_guided)