empathy-framework 5.0.0-py3-none-any.whl → 5.0.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
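The same file-level comparison can be reproduced locally. As a minimal sketch (assuming both wheels have already been downloaded, for example with `pip download empathy-framework==<version> --no-deps`, and that the filenames below match what landed on disk), a wheel is a zip archive whose member list can be diffed directly:

import zipfile

# Hypothetical local filenames; adjust to wherever the wheels were downloaded.
OLD_WHEEL = "empathy_framework-5.0.0-py3-none-any.whl"
NEW_WHEEL = "empathy_framework-5.0.3-py3-none-any.whl"

def wheel_files(path: str) -> set[str]:
    """Return the set of file paths inside a wheel (a wheel is a zip archive)."""
    with zipfile.ZipFile(path) as wheel:
        return set(wheel.namelist())

old_files = wheel_files(OLD_WHEEL)
new_files = wheel_files(NEW_WHEEL)

print("Added:", sorted(new_files - old_files))
print("Removed:", sorted(old_files - new_files))

The removed entries should correspond to the duplicated "* 2.py" modules listed under items 25–34 below, and the added entries to the new batch, cache, and routing CLI modules.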
Files changed (38)
  1. {empathy_framework-5.0.0.dist-info → empathy_framework-5.0.3.dist-info}/METADATA +53 -9
  2. {empathy_framework-5.0.0.dist-info → empathy_framework-5.0.3.dist-info}/RECORD +28 -31
  3. empathy_llm_toolkit/providers.py +175 -35
  4. empathy_llm_toolkit/utils/tokens.py +150 -30
  5. empathy_os/__init__.py +1 -1
  6. empathy_os/cli/commands/batch.py +256 -0
  7. empathy_os/cli/commands/cache.py +248 -0
  8. empathy_os/cli/commands/inspect.py +1 -2
  9. empathy_os/cli/commands/metrics.py +1 -1
  10. empathy_os/cli/commands/routing.py +285 -0
  11. empathy_os/cli/commands/workflow.py +2 -2
  12. empathy_os/cli/parsers/__init__.py +6 -0
  13. empathy_os/cli/parsers/batch.py +118 -0
  14. empathy_os/cli/parsers/cache.py +65 -0
  15. empathy_os/cli/parsers/routing.py +110 -0
  16. empathy_os/dashboard/standalone_server.py +22 -11
  17. empathy_os/metrics/collector.py +31 -0
  18. empathy_os/models/token_estimator.py +21 -13
  19. empathy_os/telemetry/agent_coordination.py +12 -14
  20. empathy_os/telemetry/agent_tracking.py +18 -19
  21. empathy_os/telemetry/approval_gates.py +27 -39
  22. empathy_os/telemetry/event_streaming.py +19 -19
  23. empathy_os/telemetry/feedback_loop.py +13 -16
  24. empathy_os/workflows/batch_processing.py +56 -10
  25. empathy_os/vscode_bridge 2.py +0 -173
  26. empathy_os/workflows/progressive/README 2.md +0 -454
  27. empathy_os/workflows/progressive/__init__ 2.py +0 -92
  28. empathy_os/workflows/progressive/cli 2.py +0 -242
  29. empathy_os/workflows/progressive/core 2.py +0 -488
  30. empathy_os/workflows/progressive/orchestrator 2.py +0 -701
  31. empathy_os/workflows/progressive/reports 2.py +0 -528
  32. empathy_os/workflows/progressive/telemetry 2.py +0 -280
  33. empathy_os/workflows/progressive/test_gen 2.py +0 -514
  34. empathy_os/workflows/progressive/workflow 2.py +0 -628
  35. {empathy_framework-5.0.0.dist-info → empathy_framework-5.0.3.dist-info}/WHEEL +0 -0
  36. {empathy_framework-5.0.0.dist-info → empathy_framework-5.0.3.dist-info}/entry_points.txt +0 -0
  37. {empathy_framework-5.0.0.dist-info → empathy_framework-5.0.3.dist-info}/licenses/LICENSE +0 -0
  38. {empathy_framework-5.0.0.dist-info → empathy_framework-5.0.3.dist-info}/top_level.txt +0 -0
empathy_os/workflows/progressive/test_gen 2.py (file #33 above, removed)
@@ -1,514 +0,0 @@
- """Progressive test generation workflow with tier escalation.
-
- This module implements test generation with automatic escalation from cheap
- to capable to premium tiers based on test quality metrics.
- """
-
- import ast
- import logging
- import subprocess
- from datetime import datetime
- from pathlib import Path
- from typing import Any
-
- from empathy_os.workflows.progressive.core import (
-     EscalationConfig,
-     FailureAnalysis,
-     ProgressiveWorkflowResult,
-     Tier,
-     TierResult,
- )
- from empathy_os.workflows.progressive.workflow import ProgressiveWorkflow
-
- logger = logging.getLogger(__name__)
-
-
- class ProgressiveTestGenWorkflow(ProgressiveWorkflow):
-     """Test generation workflow with progressive tier escalation.
-
-     Generates tests for Python functions using a cost-efficient progressive
-     approach:
-     1. Start with cheap tier (gpt-4o-mini) for volume
-     2. Escalate failed tests to capable tier (claude-3-5-sonnet)
-     3. Escalate persistent failures to premium tier (claude-opus-4)
-
-     Quality metrics tracked:
-     - Syntax errors (AST parsing)
-     - Test execution (pass/fail)
-     - Code coverage
-     - Assertion depth
-
-     Example:
-         >>> config = EscalationConfig(enabled=True, max_cost=10.00)
-         >>> workflow = ProgressiveTestGenWorkflow(config)
-         >>> result = workflow.execute(target_file="app.py")
-         >>> print(result.generate_report())
-     """
-
-     def __init__(self, config: EscalationConfig | None = None):
-         """Initialize progressive test generation workflow.
-
-         Args:
-             config: Escalation configuration (uses defaults if None)
-         """
-         super().__init__(config)
-         self.target_file: Path | None = None
-
-     def execute(self, target_file: str, **kwargs) -> ProgressiveWorkflowResult:
-         """Generate tests for target file with progressive escalation.
-
-         Args:
-             target_file: Path to Python file to generate tests for
-             **kwargs: Additional parameters
-
-         Returns:
-             Complete workflow results with progression history
-
-         Raises:
-             FileNotFoundError: If target_file doesn't exist
-             BudgetExceededError: If cost exceeds budget
-             UserCancelledError: If user declines approval
-
-         Example:
-             >>> result = workflow.execute(target_file="src/app.py")
-             >>> print(f"Generated {len(result.final_result.generated_items)} tests")
-         """
-         self.target_file = Path(target_file)
-
-         if not self.target_file.exists():
-             raise FileNotFoundError(f"Target file not found: {target_file}")
-
-         logger.info(f"Generating tests for {target_file}")
-
-         # Parse target file to extract functions
-         functions = self._parse_functions(self.target_file)
-
-         if not functions:
-             logger.warning(f"No functions found in {target_file}")
-             return self._create_empty_result("test-gen")
-
-         logger.info(f"Found {len(functions)} functions to test")
-
-         # Execute with progressive escalation
-         return self._execute_progressive(
-             items=functions,
-             workflow_name="test-gen",
-             **kwargs
-         )
-
-     def _parse_functions(self, file_path: Path) -> list[dict[str, Any]]:
-         """Parse Python file to extract function definitions.
-
-         Args:
-             file_path: Path to Python file
-
-         Returns:
-             List of function metadata dicts with keys:
-             - name: Function name
-             - lineno: Line number
-             - args: List of argument names
-             - docstring: Function docstring (if present)
-             - code: Full function source code
-
-         Example:
-             >>> functions = workflow._parse_functions(Path("app.py"))
-             >>> print(functions[0]["name"])
-             'calculate_total'
-         """
-         try:
-             source = file_path.read_text()
-             tree = ast.parse(source)
-         except SyntaxError as e:
-             logger.error(f"Syntax error in {file_path}: {e}")
-             return []
-
-         functions = []
-
-         for node in ast.walk(tree):
-             if isinstance(node, ast.FunctionDef):
-                 # Extract function info
-                 func_info = {
-                     "name": node.name,
-                     "lineno": node.lineno,
-                     "args": [arg.arg for arg in node.args.args],
-                     "docstring": ast.get_docstring(node) or "",
-                     "code": ast.unparse(node),  # Python 3.9+
-                     "file": str(file_path)
-                 }
-                 functions.append(func_info)
-
-         return functions
-
-     def _execute_tier_impl(
-         self,
-         tier: Tier,
-         items: list[Any],
-         context: dict[str, Any] | None,
-         **kwargs
-     ) -> list[dict[str, Any]]:
-         """Execute test generation at specific tier.
-
-         Args:
-             tier: Which tier to execute at
-             items: Functions to generate tests for
-             context: Context from previous tier (if escalating)
-             **kwargs: Additional parameters
-
-         Returns:
-             List of generated test items with quality scores
-
-         Note:
-             This is a placeholder implementation. In production, this would
-             call the actual LLM API to generate tests.
-         """
-         logger.info(f"Generating {len(items)} tests at {tier.value} tier")
-
-         # Build prompt for this tier
-         base_task = self._build_test_gen_task(items)
-         prompt = self.meta_orchestrator.build_tier_prompt(
-             tier,
-             base_task,
-             context
-         )
-
-         # TODO: Call LLM API with prompt
-         # For now, simulate test generation
-         generated_tests = self._simulate_test_generation(tier, items)
-
-         return generated_tests
-
-     def _build_test_gen_task(self, functions: list[dict[str, Any]]) -> str:
-         """Build task description for test generation.
-
-         Args:
-             functions: List of function metadata
-
-         Returns:
-             Task description string
-
-         Example:
-             >>> task = workflow._build_test_gen_task([{"name": "foo", ...}])
-             >>> print(task)
-             'Generate pytest tests for 1 functions from app.py'
-         """
-         file_name = self.target_file.name if self.target_file else "module"
-         func_names = [f["name"] for f in functions]
-
-         task = f"Generate pytest tests for {len(functions)} function(s) from {file_name}"
-
-         if len(func_names) <= 3:
-             task += f": {', '.join(func_names)}"
-
-         return task
-
-     def _simulate_test_generation(
-         self,
-         tier: Tier,
-         functions: list[dict[str, Any]]
-     ) -> list[dict[str, Any]]:
-         """Simulate test generation (placeholder for LLM integration).
-
-         In production, this would call the LLM API. For now, it generates
-         mock test data with varying quality based on tier.
-
-         Args:
-             tier: Which tier is generating
-             functions: Functions to generate tests for
-
-         Returns:
-             List of generated test items with quality metrics
-
-         Note:
-             This is temporary scaffolding. Real implementation will:
-             1. Call LLM API with tier-appropriate model
-             2. Parse generated test code
-             3. Validate syntax
-             4. Execute tests
-             5. Calculate coverage
-         """
-         generated_tests = []
-
-         # Simulate different quality levels per tier
-         base_quality = {
-             Tier.CHEAP: 70,
-             Tier.CAPABLE: 85,
-             Tier.PREMIUM: 95
-         }[tier]
-
-         for func in functions:
-             # Generate mock test code
-             test_code = self._generate_mock_test(func)
-
-             # Analyze test quality
-             analysis = self._analyze_generated_test(test_code, func)
-
-             # Calculate quality score
-             quality_score = analysis.calculate_quality_score()
-
-             generated_tests.append({
-                 "function_name": func["name"],
-                 "test_code": test_code,
-                 "quality_score": quality_score,
-                 "passed": analysis.test_pass_rate > 0.5,
-                 "coverage": analysis.coverage_percent,
-                 "assertions": analysis.assertion_depth,
-                 "confidence": analysis.confidence_score,
-                 "syntax_errors": [str(e) for e in analysis.syntax_errors],
-                 "error": "" if not analysis.syntax_errors else str(analysis.syntax_errors[0])
-             })
-
-         return generated_tests
-
-     def _generate_mock_test(self, func: dict[str, Any]) -> str:
-         """Generate mock test code (placeholder).
-
-         Args:
-             func: Function metadata
-
-         Returns:
-             Generated test code as string
-         """
-         func_name = func["name"]
-         args = func["args"]
-
-         # Generate simple test template
-         test_code = f'''def test_{func_name}():
-     """Test {func_name} function."""
-     # Arrange
-     {self._generate_test_setup(args)}
-
-     # Act
-     result = {func_name}({", ".join(args)})
-
-     # Assert
-     assert result is not None
- '''
-
-         return test_code
-
-     def _generate_test_setup(self, args: list[str]) -> str:
-         """Generate test setup code for arguments.
-
-         Args:
-             args: List of argument names
-
-         Returns:
-             Setup code as string
-         """
-         if not args:
-             return "pass"
-
-         setup_lines = []
-         for arg in args:
-             # Simple type inference based on name
-             if "count" in arg or "num" in arg or "index" in arg:
-                 setup_lines.append(f"{arg} = 1")
-             elif "name" in arg or "text" in arg or "message" in arg:
-                 setup_lines.append(f'{arg} = "test"')
-             elif "items" in arg or "list" in arg:
-                 setup_lines.append(f"{arg} = []")
-             else:
-                 setup_lines.append(f'{arg} = "value"')
-
-         return "\n    ".join(setup_lines)
-
-     def _analyze_generated_test(
-         self,
-         test_code: str,
-         func: dict[str, Any]
-     ) -> FailureAnalysis:
-         """Analyze quality of generated test.
-
-         Args:
-             test_code: Generated test code
-             func: Original function metadata
-
-         Returns:
-             Failure analysis with quality metrics
-         """
-         analysis = FailureAnalysis()
-
-         # 1. Check syntax
-         try:
-             ast.parse(test_code)
-         except SyntaxError as e:
-             analysis.syntax_errors.append(e)
-             return analysis  # Can't proceed with invalid syntax
-
-         # 2. Count assertions
-         try:
-             tree = ast.parse(test_code)
-             assertion_count = sum(
-                 1 for node in ast.walk(tree)
-                 if isinstance(node, ast.Assert)
-             )
-             analysis.assertion_depth = assertion_count
-         except Exception as e:
-             logger.warning(f"Failed to count assertions: {e}")
-             analysis.assertion_depth = 0
-
-         # 3. Simulate test execution (placeholder)
-         # In production, would actually run the test
-         analysis.test_pass_rate = 0.8  # Mock: 80% pass rate
-
-         # 4. Simulate coverage (placeholder)
-         # In production, would use coverage.py
-         analysis.coverage_percent = 75.0  # Mock: 75% coverage
-
-         # 5. Estimate confidence (placeholder)
-         # In production, would parse from LLM response
-         analysis.confidence_score = 0.85  # Mock: 85% confidence
-
-         return analysis
-
-     def _create_empty_result(self, workflow_name: str) -> ProgressiveWorkflowResult:
-         """Create empty result when no functions found.
-
-         Args:
-             workflow_name: Name of workflow
-
-         Returns:
-             Empty workflow result
-         """
-         empty_result = TierResult(
-             tier=Tier.CHEAP,
-             model=self._get_model_for_tier(Tier.CHEAP),
-             attempt=1,
-             timestamp=datetime.now(),
-             generated_items=[],
-             failure_analysis=FailureAnalysis(),
-             cost=0.0,
-             duration=0.0
-         )
-
-         task_id = f"{workflow_name}-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
-
-         return ProgressiveWorkflowResult(
-             workflow_name=workflow_name,
-             task_id=task_id,
-             tier_results=[empty_result],
-             final_result=empty_result,
-             total_cost=0.0,
-             total_duration=0.0,
-             success=False
-         )
-
-
- def execute_test_file(test_file: Path) -> dict[str, Any]:
-     """Execute a test file using pytest.
-
-     Args:
-         test_file: Path to test file
-
-     Returns:
-         Dict with execution results:
-         - passed: Number of tests passed
-         - failed: Number of tests failed
-         - pass_rate: Percentage passed (0.0-1.0)
-         - output: pytest output
-
-     Example:
-         >>> result = execute_test_file(Path("test_app.py"))
-         >>> print(f"Pass rate: {result['pass_rate']:.1%}")
-     """
-     try:
-         result = subprocess.run(
-             ["pytest", str(test_file), "-v", "--tb=short"],
-             capture_output=True,
-             text=True,
-             timeout=60
-         )
-
-         # Parse pytest output to get pass/fail counts
-         # This is a simple parser - production would be more robust
-         output = result.stdout + result.stderr
-
-         passed = output.count(" PASSED")
-         failed = output.count(" FAILED")
-         total = passed + failed
-
-         pass_rate = passed / total if total > 0 else 0.0
-
-         return {
-             "passed": passed,
-             "failed": failed,
-             "total": total,
-             "pass_rate": pass_rate,
-             "output": output,
-             "returncode": result.returncode
-         }
-
-     except subprocess.TimeoutExpired:
-         return {
-             "passed": 0,
-             "failed": 0,
-             "total": 0,
-             "pass_rate": 0.0,
-             "output": "Test execution timed out",
-             "returncode": -1
-         }
-     except Exception as e:
-         logger.error(f"Failed to execute tests: {e}")
-         return {
-             "passed": 0,
-             "failed": 0,
-             "total": 0,
-             "pass_rate": 0.0,
-             "output": str(e),
-             "returncode": -1
-         }
-
-
- def calculate_coverage(test_file: Path, source_file: Path) -> float:
-     """Calculate code coverage for a test file.
-
-     Args:
-         test_file: Path to test file
-         source_file: Path to source file being tested
-
-     Returns:
-         Coverage percentage (0.0-100.0)
-
-     Example:
-         >>> coverage = calculate_coverage(
-         ...     Path("test_app.py"),
-         ...     Path("app.py")
-         ... )
-         >>> print(f"Coverage: {coverage:.1f}%")
-     """
-     try:
-         # Run pytest with coverage
-         result = subprocess.run(
-             [
-                 "pytest",
-                 str(test_file),
-                 f"--cov={source_file.stem}",
-                 "--cov-report=term-missing",
-                 "--no-cov-on-fail"
-             ],
-             capture_output=True,
-             text=True,
-             timeout=60,
-             cwd=source_file.parent
-         )
-
-         output = result.stdout + result.stderr
-
-         # Parse coverage percentage from output
-         # Look for line like: "app.py 85%"
-         for line in output.split("\n"):
-             if source_file.name in line and "%" in line:
-                 # Extract percentage
-                 parts = line.split()
-                 for part in parts:
-                     if "%" in part:
-                         try:
-                             return float(part.rstrip("%"))
-                         except ValueError:
-                             pass
-
-         return 0.0
-
-     except Exception as e:
-         logger.error(f"Failed to calculate coverage: {e}")
-         return 0.0
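For reference, the removed module's docstrings describe how the progressive test-generation workflow was meant to be driven. A minimal usage sketch, reconstructed from those docstring examples; the import path assumes a canonical (non-" 2") copy of the progressive package still ships in 5.0.3, which this diff does not show either way:

from pathlib import Path

# Import path is an assumption: it mirrors the removed duplicate's own imports
# and presumes a canonical test_gen module remains in the package.
from empathy_os.workflows.progressive.core import EscalationConfig
from empathy_os.workflows.progressive.test_gen import (
    ProgressiveTestGenWorkflow,
    execute_test_file,
)

# Configure escalation as in the class docstring: start on the cheap tier,
# cap total spend across escalations.
config = EscalationConfig(enabled=True, max_cost=10.00)
workflow = ProgressiveTestGenWorkflow(config)

# Generate tests for a target file; the result carries per-tier progression history.
result = workflow.execute(target_file="app.py")
print(result.generate_report())

# Optionally run a generated test file and inspect the parsed pytest pass rate.
run = execute_test_file(Path("test_app.py"))
print(f"Pass rate: {run['pass_rate']:.1%}")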