empathy-framework 5.1.1-py3-none-any.whl → 5.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. {empathy_framework-5.1.1.dist-info → empathy_framework-5.3.0.dist-info}/METADATA +79 -6
  2. {empathy_framework-5.1.1.dist-info → empathy_framework-5.3.0.dist-info}/RECORD +83 -64
  3. empathy_os/__init__.py +1 -1
  4. empathy_os/cache/hybrid.py +5 -1
  5. empathy_os/cli/commands/batch.py +8 -0
  6. empathy_os/cli/commands/profiling.py +4 -0
  7. empathy_os/cli/commands/workflow.py +8 -4
  8. empathy_os/cli_router.py +9 -0
  9. empathy_os/config.py +15 -2
  10. empathy_os/core_modules/__init__.py +15 -0
  11. empathy_os/dashboard/simple_server.py +62 -30
  12. empathy_os/mcp/__init__.py +10 -0
  13. empathy_os/mcp/server.py +506 -0
  14. empathy_os/memory/control_panel.py +1 -131
  15. empathy_os/memory/control_panel_support.py +145 -0
  16. empathy_os/memory/encryption.py +159 -0
  17. empathy_os/memory/long_term.py +46 -631
  18. empathy_os/memory/long_term_types.py +99 -0
  19. empathy_os/memory/mixins/__init__.py +25 -0
  20. empathy_os/memory/mixins/backend_init_mixin.py +249 -0
  21. empathy_os/memory/mixins/capabilities_mixin.py +208 -0
  22. empathy_os/memory/mixins/handoff_mixin.py +208 -0
  23. empathy_os/memory/mixins/lifecycle_mixin.py +49 -0
  24. empathy_os/memory/mixins/long_term_mixin.py +352 -0
  25. empathy_os/memory/mixins/promotion_mixin.py +109 -0
  26. empathy_os/memory/mixins/short_term_mixin.py +182 -0
  27. empathy_os/memory/short_term.py +61 -12
  28. empathy_os/memory/simple_storage.py +302 -0
  29. empathy_os/memory/storage_backend.py +167 -0
  30. empathy_os/memory/types.py +8 -3
  31. empathy_os/memory/unified.py +21 -1120
  32. empathy_os/meta_workflows/cli_commands/__init__.py +56 -0
  33. empathy_os/meta_workflows/cli_commands/agent_commands.py +321 -0
  34. empathy_os/meta_workflows/cli_commands/analytics_commands.py +442 -0
  35. empathy_os/meta_workflows/cli_commands/config_commands.py +232 -0
  36. empathy_os/meta_workflows/cli_commands/memory_commands.py +182 -0
  37. empathy_os/meta_workflows/cli_commands/template_commands.py +354 -0
  38. empathy_os/meta_workflows/cli_commands/workflow_commands.py +382 -0
  39. empathy_os/meta_workflows/cli_meta_workflows.py +52 -1802
  40. empathy_os/models/telemetry/__init__.py +71 -0
  41. empathy_os/models/telemetry/analytics.py +594 -0
  42. empathy_os/models/telemetry/backend.py +196 -0
  43. empathy_os/models/telemetry/data_models.py +431 -0
  44. empathy_os/models/telemetry/storage.py +489 -0
  45. empathy_os/orchestration/__init__.py +35 -0
  46. empathy_os/orchestration/execution_strategies.py +481 -0
  47. empathy_os/orchestration/meta_orchestrator.py +488 -1
  48. empathy_os/routing/workflow_registry.py +36 -0
  49. empathy_os/telemetry/agent_coordination.py +2 -3
  50. empathy_os/telemetry/agent_tracking.py +26 -7
  51. empathy_os/telemetry/approval_gates.py +18 -24
  52. empathy_os/telemetry/cli.py +19 -724
  53. empathy_os/telemetry/commands/__init__.py +14 -0
  54. empathy_os/telemetry/commands/dashboard_commands.py +696 -0
  55. empathy_os/telemetry/event_streaming.py +7 -3
  56. empathy_os/telemetry/feedback_loop.py +28 -15
  57. empathy_os/tools.py +183 -0
  58. empathy_os/workflows/__init__.py +5 -0
  59. empathy_os/workflows/autonomous_test_gen.py +860 -161
  60. empathy_os/workflows/base.py +6 -2
  61. empathy_os/workflows/code_review.py +4 -1
  62. empathy_os/workflows/document_gen/__init__.py +25 -0
  63. empathy_os/workflows/document_gen/config.py +30 -0
  64. empathy_os/workflows/document_gen/report_formatter.py +162 -0
  65. empathy_os/workflows/{document_gen.py → document_gen/workflow.py} +5 -184
  66. empathy_os/workflows/output.py +4 -1
  67. empathy_os/workflows/progress.py +8 -2
  68. empathy_os/workflows/security_audit.py +2 -2
  69. empathy_os/workflows/security_audit_phase3.py +7 -4
  70. empathy_os/workflows/seo_optimization.py +633 -0
  71. empathy_os/workflows/test_gen/__init__.py +52 -0
  72. empathy_os/workflows/test_gen/ast_analyzer.py +249 -0
  73. empathy_os/workflows/test_gen/config.py +88 -0
  74. empathy_os/workflows/test_gen/data_models.py +38 -0
  75. empathy_os/workflows/test_gen/report_formatter.py +289 -0
  76. empathy_os/workflows/test_gen/test_templates.py +381 -0
  77. empathy_os/workflows/test_gen/workflow.py +655 -0
  78. empathy_os/workflows/test_gen.py +42 -1905
  79. empathy_os/cli/parsers/cache 2.py +0 -65
  80. empathy_os/cli_router 2.py +0 -416
  81. empathy_os/dashboard/app 2.py +0 -512
  82. empathy_os/dashboard/simple_server 2.py +0 -403
  83. empathy_os/dashboard/standalone_server 2.py +0 -536
  84. empathy_os/memory/types 2.py +0 -441
  85. empathy_os/models/adaptive_routing 2.py +0 -437
  86. empathy_os/models/telemetry.py +0 -1660
  87. empathy_os/project_index/scanner_parallel 2.py +0 -291
  88. empathy_os/telemetry/agent_coordination 2.py +0 -478
  89. empathy_os/telemetry/agent_tracking 2.py +0 -350
  90. empathy_os/telemetry/approval_gates 2.py +0 -563
  91. empathy_os/telemetry/event_streaming 2.py +0 -405
  92. empathy_os/telemetry/feedback_loop 2.py +0 -557
  93. empathy_os/vscode_bridge 2.py +0 -173
  94. empathy_os/workflows/progressive/__init__ 2.py +0 -92
  95. empathy_os/workflows/progressive/cli 2.py +0 -242
  96. empathy_os/workflows/progressive/core 2.py +0 -488
  97. empathy_os/workflows/progressive/orchestrator 2.py +0 -701
  98. empathy_os/workflows/progressive/reports 2.py +0 -528
  99. empathy_os/workflows/progressive/telemetry 2.py +0 -280
  100. empathy_os/workflows/progressive/test_gen 2.py +0 -514
  101. empathy_os/workflows/progressive/workflow 2.py +0 -628
  102. {empathy_framework-5.1.1.dist-info → empathy_framework-5.3.0.dist-info}/WHEEL +0 -0
  103. {empathy_framework-5.1.1.dist-info → empathy_framework-5.3.0.dist-info}/entry_points.txt +0 -0
  104. {empathy_framework-5.1.1.dist-info → empathy_framework-5.3.0.dist-info}/licenses/LICENSE +0 -0
  105. {empathy_framework-5.1.1.dist-info → empathy_framework-5.3.0.dist-info}/licenses/LICENSE_CHANGE_ANNOUNCEMENT.md +0 -0
  106. {empathy_framework-5.1.1.dist-info → empathy_framework-5.3.0.dist-info}/top_level.txt +0 -0
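The bulk of this release is structural: the monolithic empathy_os/models/telemetry.py, workflows/test_gen.py, and workflows/document_gen.py modules are split into packages of the same name, the stray "* 2.py" duplicate files are deleted, and a new MCP server (empathy_os/mcp/server.py, +506) is added. Because models/telemetry.py is replaced by a models/telemetry/ package at the same dotted path, downstream imports should be unaffected, assuming the new __init__.py (+71 lines) re-exports the public names. A minimal sketch of a call-site that should work identically against both versions, under that assumption:

    # Hypothetical call-site; assumes empathy_os/models/telemetry/__init__.py
    # in 5.3.0 re-exports the same names the old module defined.
    from empathy_os.models.telemetry import LLMCallRecord, get_telemetry_store

    get_telemetry_store().log_call(
        LLMCallRecord(call_id="call-001", timestamp="2025-01-01T12:00:00Z")
    )

The full deletion of the old module follows.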
empathy_os/models/telemetry.py (file 86 above; deleted in 5.3.0)
@@ -1,1660 +0,0 @@
- """Structured Telemetry for Multi-Model Workflows
-
- Provides normalized schemas for tracking LLM calls and workflow runs:
- - LLMCallRecord: Per-call metrics (model, tokens, cost, latency)
- - WorkflowRunRecord: Per-workflow metrics (stages, total cost, duration)
- - TelemetryBackend: Abstract interface for telemetry storage
- - TelemetryStore: JSONL file-based backend (default)
- - Analytics helpers for cost analysis and optimization
-
- Tier 1 Automation Monitoring:
- - TaskRoutingRecord: Task routing decisions and outcomes
- - TestExecutionRecord: Test execution results and coverage
- - CoverageRecord: Test coverage metrics and trends
- - AgentAssignmentRecord: Agent assignments for simple tasks
-
- Copyright 2025 Smart-AI-Memory
- Licensed under Fair Source License 0.9
- """
-
- import heapq
- import json
- from dataclasses import asdict, dataclass, field
- from datetime import datetime
- from pathlib import Path
- from typing import Any, Protocol, runtime_checkable
-
-
- @dataclass
- class LLMCallRecord:
-     """Record of a single LLM API call.
-
-     Captures all relevant metrics for cost tracking, performance analysis,
-     and debugging.
-     """
-
-     # Identification
-     call_id: str
-     timestamp: str  # ISO format
-
-     # Context
-     workflow_name: str | None = None
-     step_name: str | None = None
-     user_id: str | None = None
-     session_id: str | None = None
-
-     # Task routing
-     task_type: str = "unknown"
-     provider: str = "anthropic"
-     tier: str = "capable"
-     model_id: str = ""
-
-     # Token usage
-     input_tokens: int = 0
-     output_tokens: int = 0
-
-     # Cost (in USD)
-     estimated_cost: float = 0.0
-     actual_cost: float | None = None
-
-     # Performance
-     latency_ms: int = 0
-
-     # Fallback and resilience tracking
-     fallback_used: bool = False
-     fallback_chain: list[str] = field(default_factory=list)
-     original_provider: str | None = None
-     original_model: str | None = None
-     retry_count: int = 0  # Number of retries before success
-     circuit_breaker_state: str | None = None  # "closed", "open", "half-open"
-
-     # Error tracking
-     success: bool = True
-     error_type: str | None = None
-     error_message: str | None = None
-
-     # Additional metadata
-     metadata: dict[str, Any] = field(default_factory=dict)
-
-     def to_dict(self) -> dict[str, Any]:
-         """Convert to dictionary for JSON serialization."""
-         return asdict(self)
-
-     @classmethod
-     def from_dict(cls, data: dict[str, Any]) -> "LLMCallRecord":
-         """Create from dictionary."""
-         return cls(**data)
-
-
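Since LLMCallRecord is a plain dataclass, the to_dict/from_dict pair round-trips losslessly and equality is field-by-field. A minimal sketch, with illustrative values:

    # Illustrative values only; every field beyond call_id/timestamp has a default.
    rec = LLMCallRecord(
        call_id="call-001",
        timestamp="2025-01-01T12:00:00Z",
        workflow_name="code-review",
        model_id="claude-sonnet-4-5",
        input_tokens=1200,
        output_tokens=300,
        estimated_cost=0.0081,
        latency_ms=950,
    )
    assert LLMCallRecord.from_dict(rec.to_dict()) == rec  # dataclass equality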
- @dataclass
- class WorkflowStageRecord:
-     """Record of a single workflow stage execution."""
-
-     stage_name: str
-     tier: str
-     model_id: str
-     input_tokens: int = 0
-     output_tokens: int = 0
-     cost: float = 0.0
-     latency_ms: int = 0
-     success: bool = True
-     skipped: bool = False
-     skip_reason: str | None = None
-     error: str | None = None
-
-
- @dataclass
- class WorkflowRunRecord:
-     """Record of a complete workflow execution.
-
-     Aggregates stage-level metrics and provides workflow-level analytics.
-     """
-
-     # Identification
-     run_id: str
-     workflow_name: str
-     started_at: str  # ISO format
-     completed_at: str | None = None
-
-     # Context
-     user_id: str | None = None
-     session_id: str | None = None
-
-     # Stages
-     stages: list[WorkflowStageRecord] = field(default_factory=list)
-
-     # Aggregated metrics
-     total_input_tokens: int = 0
-     total_output_tokens: int = 0
-     total_cost: float = 0.0
-     baseline_cost: float = 0.0  # If all stages used premium
-     savings: float = 0.0
-     savings_percent: float = 0.0
-
-     # Performance
-     total_duration_ms: int = 0
-
-     # Status
-     success: bool = True
-     error: str | None = None
-
-     # Provider usage
-     providers_used: list[str] = field(default_factory=list)
-     tiers_used: list[str] = field(default_factory=list)
-
-     def to_dict(self) -> dict[str, Any]:
-         """Convert to dictionary for JSON serialization."""
-         data = asdict(self)
-         data["stages"] = [asdict(s) for s in self.stages]
-         return data
-
-     @classmethod
-     def from_dict(cls, data: dict[str, Any]) -> "WorkflowRunRecord":
-         """Create from dictionary."""
-         stages = [WorkflowStageRecord(**s) for s in data.pop("stages", [])]
-         return cls(stages=stages, **data)
-
-
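The custom from_dict() exists because asdict() flattens the nested stages to plain dicts; deserialization rebuilds them as WorkflowStageRecord instances. A sketch, with illustrative values:

    # Values are illustrative.
    run = WorkflowRunRecord(
        run_id="run-001",
        workflow_name="test-gen",
        started_at="2025-01-01T12:00:00Z",
        stages=[WorkflowStageRecord(stage_name="plan", tier="cheap", model_id="m1")],
    )
    restored = WorkflowRunRecord.from_dict(run.to_dict())
    assert isinstance(restored.stages[0], WorkflowStageRecord)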
- @dataclass
- class TaskRoutingRecord:
-     """Record of task routing decision for Tier 1 automation.
-
-     Tracks which agent/workflow handles each task, routing strategy,
-     and execution outcome for automation monitoring.
-     """
-
-     # Identification (required)
-     routing_id: str
-     timestamp: str  # ISO format
-
-     # Task context (required)
-     task_description: str
-     task_type: str  # "code_review", "test_gen", "bug_fix", "refactor", etc.
-     task_complexity: str  # "simple", "moderate", "complex"
-
-     # Routing decision (required)
-     assigned_agent: str  # "test_gen_workflow", "code_review_workflow", etc.
-     assigned_tier: str  # "cheap", "capable", "premium"
-     routing_strategy: str  # "rule_based", "ml_predicted", "manual_override"
-
-     # Optional fields with defaults
-     task_dependencies: list[str] = field(default_factory=list)  # Task IDs this depends on
-     confidence_score: float = 1.0  # 0.0-1.0 for ML predictions
-
-     # Execution tracking
-     status: str = "pending"  # "pending", "running", "completed", "failed"
-     started_at: str | None = None
-     completed_at: str | None = None
-
-     # Outcome
-     success: bool = False
-     quality_score: float | None = None  # 0.0-1.0 if applicable
-     retry_count: int = 0
-     error_type: str | None = None
-     error_message: str | None = None
-
-     # Cost tracking
-     estimated_cost: float = 0.0
-     actual_cost: float | None = None
-
-     # Metadata
-     user_id: str | None = None
-     session_id: str | None = None
-     metadata: dict[str, Any] = field(default_factory=dict)
-
-     def to_dict(self) -> dict[str, Any]:
-         """Convert to dictionary for JSON serialization."""
-         return asdict(self)
-
-     @classmethod
-     def from_dict(cls, data: dict[str, Any]) -> "TaskRoutingRecord":
-         """Create from dictionary."""
-         return cls(**data)
-
-
- @dataclass
- class TestExecutionRecord:
-     """Record of test execution for Tier 1 QA automation.
-
-     Tracks test execution results, coverage metrics, and failure details
-     for quality assurance monitoring.
-     """
-
-     # Identification (required)
-     execution_id: str
-     timestamp: str  # ISO format
-
-     # Test context (required)
-     test_suite: str  # "unit", "integration", "e2e", "all"
-
-     # Optional fields with defaults
-     test_files: list[str] = field(default_factory=list)  # Specific test files executed
-     triggered_by: str = "manual"  # "workflow", "manual", "ci", "pre_commit"
-
-     # Execution details
-     command: str = ""
-     working_directory: str = ""
-     duration_seconds: float = 0.0
-
-     # Results
-     total_tests: int = 0
-     passed: int = 0
-     failed: int = 0
-     skipped: int = 0
-     errors: int = 0
-
-     # Coverage (if available)
-     coverage_percentage: float | None = None
-     coverage_report_path: str | None = None
-
-     # Failures
-     failed_tests: list[dict[str, Any]] = field(
-         default_factory=list
-     )  # [{name, file, error, traceback}]
-
-     # Status
-     success: bool = False  # True if all tests passed
-     exit_code: int = 0
-
-     # Metadata
-     workflow_id: str | None = None  # Link to workflow that triggered this
-     metadata: dict[str, Any] = field(default_factory=dict)
-
-     def to_dict(self) -> dict[str, Any]:
-         """Convert to dictionary for JSON serialization."""
-         return asdict(self)
-
-     @classmethod
-     def from_dict(cls, data: dict[str, Any]) -> "TestExecutionRecord":
-         """Create from dictionary."""
-         return cls(**data)
-
-
- @dataclass
- class CoverageRecord:
-     """Record of test coverage metrics for Tier 1 QA monitoring.
-
-     Tracks coverage percentage, trends, and critical gaps for
-     continuous quality improvement.
-     """
-
-     # Identification (required)
-     record_id: str
-     timestamp: str  # ISO format
-
-     # Coverage metrics (required)
-     overall_percentage: float
-     lines_total: int
-     lines_covered: int
-
-     # Optional fields with defaults
-     branches_total: int = 0
-     branches_covered: int = 0
-
-     # File-level breakdown
-     files_total: int = 0
-     files_well_covered: int = 0  # >= 80%
-     files_critical: int = 0  # < 50%
-     untested_files: list[str] = field(default_factory=list)
-
-     # Critical gaps
-     critical_gaps: list[dict[str, Any]] = field(
-         default_factory=list
-     )  # [{file, coverage, priority}]
-
-     # Trend data
-     previous_percentage: float | None = None
-     trend: str | None = None  # "improving", "declining", "stable"
-
-     # Source
-     coverage_format: str = "xml"  # "xml", "json", "lcov"
-     coverage_file: str = ""
-
-     # Metadata
-     workflow_id: str | None = None
-     metadata: dict[str, Any] = field(default_factory=dict)
-
-     def to_dict(self) -> dict[str, Any]:
-         """Convert to dictionary for JSON serialization."""
-         return asdict(self)
-
-     @classmethod
-     def from_dict(cls, data: dict[str, Any]) -> "CoverageRecord":
-         """Create from dictionary."""
-         return cls(**data)
-
-
- @dataclass
- class AgentAssignmentRecord:
-     """Record of agent assignment for simple tasks (Tier 1).
-
-     Tracks task assignments to agents/workflows with clear specs
-     and no complex dependencies for automation monitoring.
-     """
-
-     # Identification (required)
-     assignment_id: str
-     timestamp: str  # ISO format
-
-     # Task details (required)
-     task_id: str
-     task_title: str
-     task_description: str
-
-     # Assignment (required)
-     assigned_agent: str  # Agent/workflow name
-
-     # Optional fields with defaults
-     task_spec_clarity: float = 0.0  # 0.0-1.0, higher = clearer spec
-     assignment_reason: str = ""  # Why this agent was chosen
-     estimated_duration_hours: float = 0.0
-
-     # Criteria checks
-     has_clear_spec: bool = False
-     has_dependencies: bool = False
-     requires_human_review: bool = False
-     automated_eligible: bool = False  # True for Tier 1
-
-     # Execution tracking
-     status: str = "assigned"  # "assigned", "in_progress", "completed", "blocked"
-     started_at: str | None = None
-     completed_at: str | None = None
-     actual_duration_hours: float | None = None
-
-     # Outcome
-     success: bool = False
-     quality_check_passed: bool = False
-     human_review_required: bool = False
-
-     # Metadata
-     workflow_id: str | None = None
-     metadata: dict[str, Any] = field(default_factory=dict)
-
-     def to_dict(self) -> dict[str, Any]:
-         """Convert to dictionary for JSON serialization."""
-         return asdict(self)
-
-     @classmethod
-     def from_dict(cls, data: dict[str, Any]) -> "AgentAssignmentRecord":
-         """Create from dictionary."""
-         return cls(**data)
-
-
- @dataclass
- class FileTestRecord:
-     """Record of test execution for a specific source file.
-
-     Tracks when tests for an individual file were last run, results,
-     and coverage - enabling per-file test status tracking.
-
-     This complements TestExecutionRecord (suite-level) by providing
-     granular file-level test tracking for better test maintenance.
-     """
-
-     # Identification (required)
-     file_path: str  # Source file path (relative to project root)
-     timestamp: str  # ISO format - when tests were run
-
-     # Test results (required)
-     last_test_result: str  # "passed", "failed", "error", "skipped", "no_tests"
-     test_count: int  # Number of tests for this file
-
-     # Detailed results with defaults
-     passed: int = 0
-     failed: int = 0
-     skipped: int = 0
-     errors: int = 0
-
-     # Timing
-     duration_seconds: float = 0.0
-
-     # Coverage for this file (if available)
-     coverage_percent: float | None = None
-     lines_total: int = 0
-     lines_covered: int = 0
-
-     # Test file info
-     test_file_path: str | None = None  # Associated test file
-
-     # Failure details (if any)
-     failed_tests: list[dict[str, Any]] = field(default_factory=list)
-
-     # Staleness tracking
-     source_modified_at: str | None = None  # When source file was last modified
-     tests_modified_at: str | None = None  # When test file was last modified
-     is_stale: bool = False  # Tests haven't been run since source changed
-
-     # Link to execution
-     execution_id: str | None = None  # Link to TestExecutionRecord
-     workflow_id: str | None = None
-
-     # Metadata
-     metadata: dict[str, Any] = field(default_factory=dict)
-
-     def to_dict(self) -> dict[str, Any]:
-         """Convert to dictionary for JSON serialization."""
-         return asdict(self)
-
-     @classmethod
-     def from_dict(cls, data: dict[str, Any]) -> "FileTestRecord":
-         """Create from dictionary."""
-         return cls(**data)
-
-     @property
-     def success(self) -> bool:
-         """Check if all tests passed."""
-         return self.last_test_result == "passed" and self.failed == 0 and self.errors == 0
-
-
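Note that success is a derived property while is_stale is an input flag, so a file can have green tests that no longer reflect the current source. A sketch of that reading, with made-up values:

    rec = FileTestRecord(
        file_path="empathy_os/tools.py",
        timestamp="2025-01-01T12:00:00Z",
        last_test_result="passed",
        test_count=12,
        passed=12,
        is_stale=True,  # made-up: source changed after this run
    )
    assert rec.success   # all tests passed at the time...
    assert rec.is_stale  # ...but results predate the latest source edit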
- @runtime_checkable
- class TelemetryBackend(Protocol):
-     """Protocol for telemetry storage backends.
-
-     Implementations can store telemetry data in different backends:
-     - JSONL files (default, via TelemetryStore)
-     - Database (PostgreSQL, SQLite, etc.)
-     - Cloud services (DataDog, New Relic, etc.)
-     - Custom backends
-
-     Supports both core telemetry (LLM calls, workflows) and Tier 1
-     automation monitoring (task routing, tests, coverage, assignments).
-
-     Example implementing a custom backend:
-         >>> class DatabaseBackend:
-         ...     def log_call(self, record: LLMCallRecord) -> None:
-         ...         # Insert into database
-         ...         pass
-         ...
-         ...     def log_workflow(self, record: WorkflowRunRecord) -> None:
-         ...         # Insert into database
-         ...         pass
-         ...
-         ...     def get_calls(self, since=None, workflow_name=None, limit=1000):
-         ...         # Query database
-         ...         return []
-         ...
-         ...     def get_workflows(self, since=None, workflow_name=None, limit=100):
-         ...         # Query database
-         ...         return []
-     """
-
-     def log_call(self, record: LLMCallRecord) -> None:
-         """Log an LLM call record."""
-         ...
-
-     def log_workflow(self, record: WorkflowRunRecord) -> None:
-         """Log a workflow run record."""
-         ...
-
-     def get_calls(
-         self,
-         since: datetime | None = None,
-         workflow_name: str | None = None,
-         limit: int = 1000,
-     ) -> list[LLMCallRecord]:
-         """Get LLM call records with optional filters."""
-         ...
-
-     def get_workflows(
-         self,
-         since: datetime | None = None,
-         workflow_name: str | None = None,
-         limit: int = 100,
-     ) -> list[WorkflowRunRecord]:
-         """Get workflow run records with optional filters."""
-         ...
-
-     # Tier 1 automation monitoring methods
-     def log_task_routing(self, record: TaskRoutingRecord) -> None:
-         """Log a task routing decision."""
-         ...
-
-     def log_test_execution(self, record: TestExecutionRecord) -> None:
-         """Log a test execution."""
-         ...
-
-     def log_coverage(self, record: CoverageRecord) -> None:
-         """Log coverage metrics."""
-         ...
-
-     def log_agent_assignment(self, record: AgentAssignmentRecord) -> None:
-         """Log an agent assignment."""
-         ...
-
-     def get_task_routings(
-         self,
-         since: datetime | None = None,
-         status: str | None = None,
-         limit: int = 1000,
-     ) -> list[TaskRoutingRecord]:
-         """Get task routing records with optional filters."""
-         ...
-
-     def get_test_executions(
-         self,
-         since: datetime | None = None,
-         success_only: bool = False,
-         limit: int = 100,
-     ) -> list[TestExecutionRecord]:
-         """Get test execution records with optional filters."""
-         ...
-
-     def get_coverage_history(
-         self,
-         since: datetime | None = None,
-         limit: int = 100,
-     ) -> list[CoverageRecord]:
-         """Get coverage history records."""
-         ...
-
-     def get_agent_assignments(
-         self,
-         since: datetime | None = None,
-         automated_only: bool = True,
-         limit: int = 1000,
-     ) -> list[AgentAssignmentRecord]:
-         """Get agent assignment records with optional filters."""
-         ...
-
-     # Per-file test tracking methods
-     def log_file_test(self, record: "FileTestRecord") -> None:
-         """Log a per-file test execution record."""
-         ...
-
-     def get_file_tests(
-         self,
-         file_path: str | None = None,
-         since: datetime | None = None,
-         result_filter: str | None = None,
-         limit: int = 1000,
-     ) -> list["FileTestRecord"]:
-         """Get per-file test records with optional filters."""
-         ...
-
-     def get_latest_file_test(self, file_path: str) -> "FileTestRecord | None":
-         """Get the most recent test record for a specific file."""
-         ...
-
-
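Because the protocol is @runtime_checkable, conformance can be verified structurally at runtime; isinstance() checks that the methods exist, not their signatures. For example, against the default backend defined further down:

    store = TelemetryStore()
    assert isinstance(store, TelemetryBackend)  # structural check only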
- def _parse_timestamp(timestamp_str: str) -> datetime:
-     """Parse ISO format timestamp, handling 'Z' suffix for Python 3.10 compatibility.
-
-     Args:
-         timestamp_str: ISO format timestamp string, possibly with 'Z' suffix
-
-     Returns:
-         Parsed datetime object (timezone-naive UTC)
-     """
-     # Python 3.10's fromisoformat() doesn't handle 'Z' suffix
-     if timestamp_str.endswith("Z"):
-         timestamp_str = timestamp_str[:-1]
-
-     dt = datetime.fromisoformat(timestamp_str)
-
-     # Convert to naive UTC if timezone-aware
-     if dt.tzinfo is not None:
-         dt = dt.replace(tzinfo=None)
-
-     return dt
-
-
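A quick illustration of the normalization: the 'Z' suffix and an explicit +00:00 offset collapse to the same naive datetime. (Note that non-zero offsets are dropped rather than converted, so non-UTC inputs would shift.)

    a = _parse_timestamp("2025-01-01T12:00:00Z")
    b = _parse_timestamp("2025-01-01T12:00:00+00:00")
    assert a == b and a.tzinfo is None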
- class TelemetryStore:
-     """JSONL file-based telemetry backend (default implementation).
-
-     Stores records in JSONL format for easy streaming and analysis.
-     Implements the TelemetryBackend protocol.
-
-     Supports both core telemetry and Tier 1 automation monitoring.
-     """
-
-     def __init__(self, storage_dir: str = ".empathy"):
-         """Initialize telemetry store.
-
-         Args:
-             storage_dir: Directory for telemetry files
-
-         """
-         self.storage_dir = Path(storage_dir)
-         self.storage_dir.mkdir(parents=True, exist_ok=True)
-
-         # Core telemetry files
-         self.calls_file = self.storage_dir / "llm_calls.jsonl"
-         self.workflows_file = self.storage_dir / "workflow_runs.jsonl"
-
-         # Tier 1 automation monitoring files
-         self.task_routing_file = self.storage_dir / "task_routing.jsonl"
-         self.test_executions_file = self.storage_dir / "test_executions.jsonl"
-         self.coverage_history_file = self.storage_dir / "coverage_history.jsonl"
-         self.agent_assignments_file = self.storage_dir / "agent_assignments.jsonl"
-
-         # Per-file test tracking
-         self.file_tests_file = self.storage_dir / "file_tests.jsonl"
-
-     def log_call(self, record: LLMCallRecord) -> None:
-         """Log an LLM call record."""
-         with open(self.calls_file, "a") as f:
-             f.write(json.dumps(record.to_dict()) + "\n")
-
-     def log_workflow(self, record: WorkflowRunRecord) -> None:
-         """Log a workflow run record."""
-         with open(self.workflows_file, "a") as f:
-             f.write(json.dumps(record.to_dict()) + "\n")
-
-     def get_calls(
-         self,
-         since: datetime | None = None,
-         workflow_name: str | None = None,
-         limit: int = 1000,
-     ) -> list[LLMCallRecord]:
-         """Get LLM call records.
-
-         Args:
-             since: Only return records after this time
-             workflow_name: Filter by workflow name
-             limit: Maximum records to return
-
-         Returns:
-             List of LLMCallRecord
-
-         """
-         records: list[LLMCallRecord] = []
-         if not self.calls_file.exists():
-             return records
-
-         with open(self.calls_file) as f:
-             for line in f:
-                 if not line.strip():
-                     continue
-                 try:
-                     data = json.loads(line)
-                     record = LLMCallRecord.from_dict(data)
-
-                     # Apply filters
-                     if since:
-                         record_time = _parse_timestamp(record.timestamp)
-                         if record_time < since:
-                             continue
-
-                     if workflow_name and record.workflow_name != workflow_name:
-                         continue
-
-                     records.append(record)
-
-                     if len(records) >= limit:
-                         break
-                 except (json.JSONDecodeError, KeyError):
-                     continue
-
-         return records
-
-     def get_workflows(
-         self,
-         since: datetime | None = None,
-         workflow_name: str | None = None,
-         limit: int = 100,
-     ) -> list[WorkflowRunRecord]:
-         """Get workflow run records.
-
-         Args:
-             since: Only return records after this time
-             workflow_name: Filter by workflow name
-             limit: Maximum records to return
-
-         Returns:
-             List of WorkflowRunRecord
-
-         """
-         records: list[WorkflowRunRecord] = []
-         if not self.workflows_file.exists():
-             return records
-
-         with open(self.workflows_file) as f:
-             for line in f:
-                 if not line.strip():
-                     continue
-                 try:
-                     data = json.loads(line)
-                     record = WorkflowRunRecord.from_dict(data)
-
-                     # Apply filters
-                     if since:
-                         record_time = _parse_timestamp(record.started_at)
-                         if record_time < since:
-                             continue
-
-                     if workflow_name and record.workflow_name != workflow_name:
-                         continue
-
-                     records.append(record)
-
-                     if len(records) >= limit:
-                         break
-                 except (json.JSONDecodeError, KeyError):
-                     continue
-
-         return records
-
-     # Tier 1 automation monitoring methods
-
-     def log_task_routing(self, record: TaskRoutingRecord) -> None:
-         """Log a task routing decision."""
-         with open(self.task_routing_file, "a") as f:
-             f.write(json.dumps(record.to_dict()) + "\n")
-
-     def log_test_execution(self, record: TestExecutionRecord) -> None:
-         """Log a test execution."""
-         with open(self.test_executions_file, "a") as f:
-             f.write(json.dumps(record.to_dict()) + "\n")
-
-     def log_coverage(self, record: CoverageRecord) -> None:
-         """Log coverage metrics."""
-         with open(self.coverage_history_file, "a") as f:
-             f.write(json.dumps(record.to_dict()) + "\n")
-
-     def log_agent_assignment(self, record: AgentAssignmentRecord) -> None:
-         """Log an agent assignment."""
-         with open(self.agent_assignments_file, "a") as f:
-             f.write(json.dumps(record.to_dict()) + "\n")
-
-     def get_task_routings(
-         self,
-         since: datetime | None = None,
-         status: str | None = None,
-         limit: int = 1000,
-     ) -> list[TaskRoutingRecord]:
-         """Get task routing records.
-
-         Args:
-             since: Only return records after this time
-             status: Filter by status (pending, running, completed, failed)
-             limit: Maximum records to return
-
-         Returns:
-             List of TaskRoutingRecord
-
-         """
-         records: list[TaskRoutingRecord] = []
-         if not self.task_routing_file.exists():
-             return records
-
-         with open(self.task_routing_file) as f:
-             for line in f:
-                 if not line.strip():
-                     continue
-                 try:
-                     data = json.loads(line)
-                     record = TaskRoutingRecord.from_dict(data)
-
-                     # Apply filters
-                     if since:
-                         record_time = _parse_timestamp(record.timestamp)
-                         if record_time < since:
-                             continue
-
-                     if status and record.status != status:
-                         continue
-
-                     records.append(record)
-
-                     if len(records) >= limit:
-                         break
-                 except (json.JSONDecodeError, KeyError):
-                     continue
-
-         return records
-
-     def get_test_executions(
-         self,
-         since: datetime | None = None,
-         success_only: bool = False,
-         limit: int = 100,
-     ) -> list[TestExecutionRecord]:
-         """Get test execution records.
-
-         Args:
-             since: Only return records after this time
-             success_only: Only return successful test runs
-             limit: Maximum records to return
-
-         Returns:
-             List of TestExecutionRecord
-
-         """
-         records: list[TestExecutionRecord] = []
-         if not self.test_executions_file.exists():
-             return records
-
-         with open(self.test_executions_file) as f:
-             for line in f:
-                 if not line.strip():
-                     continue
-                 try:
-                     data = json.loads(line)
-                     record = TestExecutionRecord.from_dict(data)
-
-                     # Apply filters
-                     if since:
-                         record_time = _parse_timestamp(record.timestamp)
-                         if record_time < since:
-                             continue
-
-                     if success_only and not record.success:
-                         continue
-
-                     records.append(record)
-
-                     if len(records) >= limit:
-                         break
-                 except (json.JSONDecodeError, KeyError):
-                     continue
-
-         return records
-
-     def get_coverage_history(
-         self,
-         since: datetime | None = None,
-         limit: int = 100,
-     ) -> list[CoverageRecord]:
-         """Get coverage history records.
-
-         Args:
-             since: Only return records after this time
-             limit: Maximum records to return
-
-         Returns:
-             List of CoverageRecord
-
-         """
-         records: list[CoverageRecord] = []
-         if not self.coverage_history_file.exists():
-             return records
-
-         with open(self.coverage_history_file) as f:
-             for line in f:
-                 if not line.strip():
-                     continue
-                 try:
-                     data = json.loads(line)
-                     record = CoverageRecord.from_dict(data)
-
-                     # Apply filters
-                     if since:
-                         record_time = _parse_timestamp(record.timestamp)
-                         if record_time < since:
-                             continue
-
-                     records.append(record)
-
-                     if len(records) >= limit:
-                         break
-                 except (json.JSONDecodeError, KeyError):
-                     continue
-
-         return records
-
-     def get_agent_assignments(
-         self,
-         since: datetime | None = None,
-         automated_only: bool = True,
-         limit: int = 1000,
-     ) -> list[AgentAssignmentRecord]:
-         """Get agent assignment records.
-
-         Args:
-             since: Only return records after this time
-             automated_only: Only return assignments eligible for Tier 1 automation
-             limit: Maximum records to return
-
-         Returns:
-             List of AgentAssignmentRecord
-
-         """
-         records: list[AgentAssignmentRecord] = []
-         if not self.agent_assignments_file.exists():
-             return records
-
-         with open(self.agent_assignments_file) as f:
-             for line in f:
-                 if not line.strip():
-                     continue
-                 try:
-                     data = json.loads(line)
-                     record = AgentAssignmentRecord.from_dict(data)
-
-                     # Apply filters
-                     if since:
-                         record_time = _parse_timestamp(record.timestamp)
-                         if record_time < since:
-                             continue
-
-                     if automated_only and not record.automated_eligible:
-                         continue
-
-                     records.append(record)
-
-                     if len(records) >= limit:
-                         break
-                 except (json.JSONDecodeError, KeyError):
-                     continue
-
-         return records
-
-     # Per-file test tracking methods
-
-     def log_file_test(self, record: "FileTestRecord") -> None:
-         """Log a per-file test execution record.
-
-         Args:
-             record: FileTestRecord to log
-         """
-         with open(self.file_tests_file, "a") as f:
-             f.write(json.dumps(record.to_dict()) + "\n")
-
-     def get_file_tests(
-         self,
-         file_path: str | None = None,
-         since: datetime | None = None,
-         result_filter: str | None = None,
-         limit: int = 1000,
-     ) -> list["FileTestRecord"]:
-         """Get per-file test records with optional filters.
-
-         Args:
-             file_path: Filter by specific file path
-             since: Only return records after this time
-             result_filter: Filter by result (passed, failed, error, skipped, no_tests)
-             limit: Maximum records to return
-
-         Returns:
-             List of FileTestRecord
-         """
-         records: list[FileTestRecord] = []
-         if not self.file_tests_file.exists():
-             return records
-
-         with open(self.file_tests_file) as f:
-             for line in f:
-                 if not line.strip():
-                     continue
-                 try:
-                     data = json.loads(line)
-                     record = FileTestRecord.from_dict(data)
-
-                     # Apply filters
-                     if file_path and record.file_path != file_path:
-                         continue
-
-                     if since:
-                         record_time = _parse_timestamp(record.timestamp)
-                         if record_time < since:
-                             continue
-
-                     if result_filter and record.last_test_result != result_filter:
-                         continue
-
-                     records.append(record)
-
-                     if len(records) >= limit:
-                         break
-                 except (json.JSONDecodeError, KeyError):
-                     continue
-
-         return records
-
-     def get_latest_file_test(self, file_path: str) -> "FileTestRecord | None":
-         """Get the most recent test record for a specific file.
-
-         Args:
-             file_path: Path to the source file
-
-         Returns:
-             Most recent FileTestRecord or None if not found
-         """
-         records = self.get_file_tests(file_path=file_path, limit=10000)
-         if not records:
-             return None
-
-         # Return the most recent record (last one since we read in chronological order)
-         return records[-1]
-
-     def get_files_needing_tests(
-         self,
-         stale_only: bool = False,
-         failed_only: bool = False,
-     ) -> list["FileTestRecord"]:
-         """Get files that need test attention.
-
-         Args:
-             stale_only: Only return files with stale tests
-             failed_only: Only return files with failed tests
-
-         Returns:
-             List of FileTestRecord for files needing attention
-         """
-         all_records = self.get_file_tests(limit=100000)
-
-         # Get latest record per file
-         latest_by_file: dict[str, FileTestRecord] = {}
-         for record in all_records:
-             existing = latest_by_file.get(record.file_path)
-             if existing is None:
-                 latest_by_file[record.file_path] = record
-             else:
-                 # Keep the more recent one
-                 if record.timestamp > existing.timestamp:
-                     latest_by_file[record.file_path] = record
-
-         # Filter based on criteria
-         results = []
-         for record in latest_by_file.values():
-             if stale_only and not record.is_stale:
-                 continue
-             if failed_only and record.last_test_result not in ("failed", "error"):
-                 continue
-             if not stale_only and not failed_only:
-                 # Return all files needing attention (stale OR failed OR no_tests)
-                 if (
-                     record.last_test_result not in ("failed", "error", "no_tests")
-                     and not record.is_stale
-                 ):
-                     continue
-             results.append(record)
-
-         return results
-
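End to end, the store is append-only JSONL, one file per record type, with filtering done by streaming each line back through from_dict(). A minimal usage sketch (record values illustrative):

    from datetime import datetime

    store = TelemetryStore(storage_dir=".empathy")
    store.log_call(
        LLMCallRecord(call_id="call-001", timestamp="2025-01-01T12:00:00Z")
    )  # appends one JSON line to .empathy/llm_calls.jsonl
    recent = store.get_calls(since=datetime(2025, 1, 1), limit=100)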
-
- class TelemetryAnalytics:
-     """Analytics helpers for telemetry data.
-
-     Provides insights into cost optimization, provider usage, and performance.
-     """
-
-     def __init__(self, store: TelemetryStore | None = None):
-         """Initialize analytics.
-
-         Args:
-             store: TelemetryStore to analyze (creates default if None)
-
-         """
-         self.store = store or TelemetryStore()
-
-     def top_expensive_workflows(
-         self,
-         n: int = 10,
-         since: datetime | None = None,
-     ) -> list[dict[str, Any]]:
-         """Get the most expensive workflows.
-
-         Args:
-             n: Number of workflows to return
-             since: Only consider workflows after this time
-
-         Returns:
-             List of dicts with workflow_name, total_cost, run_count
-
-         """
-         workflows = self.store.get_workflows(since=since, limit=10000)
-
-         # Aggregate by workflow name
-         costs: dict[str, dict[str, Any]] = {}
-         for wf in workflows:
-             if wf.workflow_name not in costs:
-                 costs[wf.workflow_name] = {
-                     "workflow_name": wf.workflow_name,
-                     "total_cost": 0.0,
-                     "run_count": 0,
-                     "total_savings": 0.0,
-                     "avg_duration_ms": 0,
-                 }
-             costs[wf.workflow_name]["total_cost"] += wf.total_cost
-             costs[wf.workflow_name]["run_count"] += 1
-             costs[wf.workflow_name]["total_savings"] += wf.savings
-
-         # Calculate averages and sort
-         result = list(costs.values())
-         for item in result:
-             if item["run_count"] > 0:
-                 item["avg_cost"] = item["total_cost"] / item["run_count"]
-
-         result.sort(key=lambda x: x["total_cost"], reverse=True)
-         return result[:n]
-
-     def provider_usage_summary(
-         self,
-         since: datetime | None = None,
-     ) -> dict[str, dict[str, Any]]:
-         """Get usage summary by provider.
-
-         Args:
-             since: Only consider calls after this time
-
-         Returns:
-             Dict mapping provider to usage stats
-
-         """
-         calls = self.store.get_calls(since=since, limit=100000)
-
-         summary: dict[str, dict[str, Any]] = {}
-         for call in calls:
-             if call.provider not in summary:
-                 summary[call.provider] = {
-                     "call_count": 0,
-                     "total_tokens": 0,
-                     "total_cost": 0.0,
-                     "error_count": 0,
-                     "avg_latency_ms": 0,
-                     "by_tier": {"cheap": 0, "capable": 0, "premium": 0},
-                 }
-
-             s = summary[call.provider]
-             s["call_count"] += 1
-             s["total_tokens"] += call.input_tokens + call.output_tokens
-             s["total_cost"] += call.estimated_cost
-             if not call.success:
-                 s["error_count"] += 1
-             if call.tier in s["by_tier"]:
-                 s["by_tier"][call.tier] += 1
-
-         # Calculate averages
-         for _provider, stats in summary.items():
-             if stats["call_count"] > 0:
-                 stats["avg_cost"] = stats["total_cost"] / stats["call_count"]
-
-         return summary
-
-     def tier_distribution(
-         self,
-         since: datetime | None = None,
-     ) -> dict[str, dict[str, Any]]:
-         """Get call distribution by tier.
-
-         Args:
-             since: Only consider calls after this time
-
-         Returns:
-             Dict mapping tier to stats
-
-         """
-         calls = self.store.get_calls(since=since, limit=100000)
-
-         dist: dict[str, dict[str, Any]] = {
-             "cheap": {"count": 0, "cost": 0.0, "tokens": 0},
-             "capable": {"count": 0, "cost": 0.0, "tokens": 0},
-             "premium": {"count": 0, "cost": 0.0, "tokens": 0},
-         }
-
-         for call in calls:
-             if call.tier in dist:
-                 dist[call.tier]["count"] += 1
-                 dist[call.tier]["cost"] += call.estimated_cost
-                 dist[call.tier]["tokens"] += call.input_tokens + call.output_tokens
-
-         total_calls = sum(d["count"] for d in dist.values())
-         for _tier, stats in dist.items():
-             stats["percent"] = (stats["count"] / total_calls * 100) if total_calls > 0 else 0
-
-         return dist
-
-     def fallback_stats(
-         self,
-         since: datetime | None = None,
-     ) -> dict[str, Any]:
-         """Get fallback usage statistics.
-
-         Args:
-             since: Only consider calls after this time
-
-         Returns:
-             Dict with fallback stats
-
-         """
-         calls = self.store.get_calls(since=since, limit=100000)
-
-         total = len(calls)
-         fallback_count = sum(1 for c in calls if c.fallback_used)
-         error_count = sum(1 for c in calls if not c.success)
-
-         # Count by original provider
-         by_provider: dict[str, int] = {}
-         for call in calls:
-             if call.fallback_used and call.original_provider:
-                 by_provider[call.original_provider] = by_provider.get(call.original_provider, 0) + 1
-
-         return {
-             "total_calls": total,
-             "fallback_count": fallback_count,
-             "fallback_percent": (fallback_count / total * 100) if total > 0 else 0,
-             "error_count": error_count,
-             "error_rate": (error_count / total * 100) if total > 0 else 0,
-             "by_original_provider": by_provider,
-         }
-
-     def sonnet_opus_fallback_analysis(
-         self,
-         since: datetime | None = None,
-     ) -> dict[str, Any]:
-         """Analyze Sonnet 4.5 → Opus 4.5 fallback performance and cost savings.
-
-         Tracks:
-         - How often Sonnet 4.5 succeeds vs needs Opus fallback
-         - Cost savings from using Sonnet instead of always using Opus
-         - Success rates by model
-
-         Args:
-             since: Only consider calls after this time
-
-         Returns:
-             Dict with fallback analysis and cost savings
-         """
-         calls = self.store.get_calls(since=since, limit=100000)
-
-         # Filter for Anthropic calls (Sonnet/Opus)
-         anthropic_calls = [
-             c
-             for c in calls
-             if c.provider == "anthropic"
-             and c.model_id in ["claude-sonnet-4-5", "claude-opus-4-5-20251101"]
-         ]
-
-         if not anthropic_calls:
-             return {
-                 "total_calls": 0,
-                 "sonnet_attempts": 0,
-                 "sonnet_successes": 0,
-                 "opus_fallbacks": 0,
-                 "success_rate_sonnet": 0.0,
-                 "fallback_rate": 0.0,
-                 "actual_cost": 0.0,
-                 "always_opus_cost": 0.0,
-                 "savings": 0.0,
-                 "savings_percent": 0.0,
-             }
-
-         total = len(anthropic_calls)
-
-         # Count Sonnet attempts and successes
-         sonnet_calls = [c for c in anthropic_calls if c.model_id == "claude-sonnet-4-5"]
-         sonnet_successes = sum(1 for c in sonnet_calls if c.success)
-
-         # Count Opus fallbacks (calls with fallback_used and ended up on Opus)
-         opus_fallbacks = sum(
-             1
-             for c in anthropic_calls
-             if c.model_id == "claude-opus-4-5-20251101" and c.fallback_used
-         )
-
-         # Calculate costs
-         actual_cost = sum(c.estimated_cost for c in anthropic_calls)
-
-         # Calculate what it would cost if everything used Opus
-         opus_input_cost = 15.00 / 1_000_000  # per token
-         opus_output_cost = 75.00 / 1_000_000  # per token
-         always_opus_cost = sum(
-             (c.input_tokens * opus_input_cost) + (c.output_tokens * opus_output_cost)
-             for c in anthropic_calls
-         )
-
-         savings = always_opus_cost - actual_cost
-         savings_percent = (savings / always_opus_cost * 100) if always_opus_cost > 0 else 0
-
-         return {
-             "total_calls": total,
-             "sonnet_attempts": len(sonnet_calls),
-             "sonnet_successes": sonnet_successes,
-             "opus_fallbacks": opus_fallbacks,
-             "success_rate_sonnet": (
-                 (sonnet_successes / len(sonnet_calls) * 100) if sonnet_calls else 0.0
-             ),
-             "fallback_rate": (opus_fallbacks / total * 100) if total > 0 else 0.0,
-             "actual_cost": actual_cost,
-             "always_opus_cost": always_opus_cost,
-             "savings": savings,
-             "savings_percent": savings_percent,
-             "avg_cost_per_call": actual_cost / total if total > 0 else 0.0,
-             "avg_opus_cost_per_call": always_opus_cost / total if total > 0 else 0.0,
-         }
-
-     def cost_savings_report(
-         self,
-         since: datetime | None = None,
-     ) -> dict[str, Any]:
-         """Generate cost savings report.
-
-         Args:
-             since: Only consider workflows after this time
-
-         Returns:
-             Dict with savings analysis
-
-         """
-         workflows = self.store.get_workflows(since=since, limit=10000)
-
-         total_cost = sum(wf.total_cost for wf in workflows)
-         total_baseline = sum(wf.baseline_cost for wf in workflows)
-         total_savings = sum(wf.savings for wf in workflows)
-
-         return {
-             "workflow_count": len(workflows),
-             "total_actual_cost": total_cost,
-             "total_baseline_cost": total_baseline,
-             "total_savings": total_savings,
-             "savings_percent": (
-                 (total_savings / total_baseline * 100) if total_baseline > 0 else 0
-             ),
-             "avg_cost_per_workflow": total_cost / len(workflows) if workflows else 0,
-         }
-
-     # Tier 1 automation monitoring analytics
-
-     def task_routing_accuracy(
-         self,
-         since: datetime | None = None,
-     ) -> dict[str, Any]:
-         """Analyze task routing accuracy.
-
-         Args:
-             since: Only consider routings after this time
-
-         Returns:
-             Dict with routing accuracy metrics by task type and strategy
-
-         """
-         routings = self.store.get_task_routings(since=since, limit=10000)
-
-         if not routings:
-             return {
-                 "total_tasks": 0,
-                 "successful_routing": 0,
-                 "accuracy_rate": 0.0,
-                 "avg_confidence": 0.0,
-                 "by_task_type": {},
-                 "by_strategy": {},
-             }
-
-         total = len(routings)
-         successful = sum(1 for r in routings if r.success)
-         total_confidence = sum(r.confidence_score for r in routings)
-
-         # Aggregate by task type
-         by_type: dict[str, dict[str, int | float]] = {}
-         for r in routings:
-             if r.task_type not in by_type:
-                 by_type[r.task_type] = {"total": 0, "success": 0}
-             by_type[r.task_type]["total"] += 1
-             if r.success:
-                 by_type[r.task_type]["success"] += 1
-
-         # Calculate rates
-         for _task_type, stats in by_type.items():
-             stats["rate"] = stats["success"] / stats["total"] if stats["total"] > 0 else 0.0
-
-         # Aggregate by strategy
-         by_strategy: dict[str, dict[str, int]] = {}
-         for r in routings:
-             if r.routing_strategy not in by_strategy:
-                 by_strategy[r.routing_strategy] = {"total": 0, "success": 0}
-             by_strategy[r.routing_strategy]["total"] += 1
-             if r.success:
-                 by_strategy[r.routing_strategy]["success"] += 1
-
-         return {
-             "total_tasks": total,
-             "successful_routing": successful,
-             "accuracy_rate": successful / total if total > 0 else 0.0,
-             "avg_confidence": total_confidence / total if total > 0 else 0.0,
-             "by_task_type": by_type,
-             "by_strategy": by_strategy,
-         }
-
-     def test_execution_trends(
-         self,
-         since: datetime | None = None,
-     ) -> dict[str, Any]:
-         """Analyze test execution trends.
-
-         Args:
-             since: Only consider executions after this time
-
-         Returns:
-             Dict with test execution metrics and trends
-
-         """
-         executions = self.store.get_test_executions(since=since, limit=1000)
-
-         if not executions:
-             return {
-                 "total_executions": 0,
-                 "success_rate": 0.0,
-                 "avg_duration_seconds": 0.0,
-                 "total_tests_run": 0,
-                 "total_failures": 0,
-                 "coverage_trend": "stable",
-                 "most_failing_tests": [],
-             }
-
-         total_execs = len(executions)
-         successful_execs = sum(1 for e in executions if e.success)
-         total_duration = sum(e.duration_seconds for e in executions)
-         total_tests = sum(e.total_tests for e in executions)
-         total_failures = sum(e.failed for e in executions)
-
-         # Find most failing tests
-         failure_counts: dict[str, int] = {}
-         for exec_rec in executions:
-             for test in exec_rec.failed_tests:
-                 test_name = test.get("name", "unknown")
-                 failure_counts[test_name] = failure_counts.get(test_name, 0) + 1
-
-         most_failing = [
-             {"name": name, "failures": count}
-             for name, count in heapq.nlargest(10, failure_counts.items(), key=lambda x: x[1])
-         ]
-
-         return {
-             "total_executions": total_execs,
-             "success_rate": successful_execs / total_execs if total_execs > 0 else 0.0,
-             "avg_duration_seconds": total_duration / total_execs if total_execs > 0 else 0.0,
-             "total_tests_run": total_tests,
-             "total_failures": total_failures,
-             "coverage_trend": "stable",  # Will be computed from coverage_progress
-             "most_failing_tests": most_failing,
-         }
-
-     def coverage_progress(
-         self,
-         since: datetime | None = None,
-     ) -> dict[str, Any]:
-         """Track coverage progress over time.
-
-         Args:
-             since: Only consider coverage records after this time
-
-         Returns:
-             Dict with coverage metrics and trends
-
-         """
-         records = self.store.get_coverage_history(since=since, limit=1000)
-
-         if not records:
-             return {
-                 "current_coverage": 0.0,
-                 "previous_coverage": 0.0,
-                 "change": 0.0,
-                 "trend": "no_data",
-                 "coverage_history": [],
-                 "files_improved": 0,
-                 "files_declined": 0,
-                 "critical_gaps_count": 0,
-             }
-
-         # Latest and first records
-         latest = records[-1]
-         first = records[0]
-         current_coverage = latest.overall_percentage
-
-         # Calculate trend by comparing first to last
-         if len(records) == 1:
-             # Single record - no trend analysis possible
-             prev_coverage = 0.0
-             change = 0.0
-             trend = "stable"
-         else:
-             # Multiple records - compare first to last
-             prev_coverage = first.overall_percentage
-             change = current_coverage - prev_coverage
-
-             # Determine trend based on change
-             if change > 1.0:
-                 trend = "improving"
-             elif change < -1.0:
-                 trend = "declining"
-             else:
-                 trend = "stable"
-
-         # Build coverage history from records
-         coverage_history = [
-             {
-                 "timestamp": r.timestamp,
-                 "coverage": r.overall_percentage,
-                 "trend": r.trend,
-             }
-             for r in records
-         ]
-
-         return {
-             "current_coverage": current_coverage,
-             "previous_coverage": prev_coverage,
-             "change": change,
-             "trend": trend,
-             "coverage_history": coverage_history,
-             "files_improved": 0,  # Would need file-level history
-             "files_declined": 0,  # Would need file-level history
-             "critical_gaps_count": len(latest.critical_gaps),
-         }
-
-     def agent_performance(
-         self,
-         since: datetime | None = None,
-     ) -> dict[str, Any]:
-         """Analyze agent/workflow performance.
-
-         Args:
-             since: Only consider assignments after this time
-
-         Returns:
-             Dict with agent performance metrics
-
-         """
-         assignments = self.store.get_agent_assignments(
-             since=since, automated_only=False, limit=10000
-         )
-
-         if not assignments:
-             return {
-                 "total_assignments": 0,
-                 "by_agent": {},
-                 "automation_rate": 0.0,
-                 "human_review_rate": 0.0,
-             }
-
-         # Aggregate by agent
-         by_agent: dict[str, dict[str, Any]] = {}
-         total_assignments = len(assignments)
-         total_automated = 0
-         total_human_review = 0
-
-         for assignment in assignments:
-             agent = assignment.assigned_agent
-             if agent not in by_agent:
-                 by_agent[agent] = {
-                     "assignments": 0,
-                     "completed": 0,
-                     "successful": 0,
-                     "success_rate": 0.0,
-                     "avg_duration_hours": 0.0,
-                     "quality_score_avg": 0.0,
-                     "total_duration": 0.0,
-                     "quality_scores": [],
-                 }
-
-             stats = by_agent[agent]
-             stats["assignments"] += 1
-             if assignment.status == "completed":
-                 stats["completed"] += 1
-                 if assignment.actual_duration_hours is not None:
-                     stats["total_duration"] += assignment.actual_duration_hours
-
-             # Track successful assignments (not just completed)
-             if assignment.success:
-                 stats["successful"] += 1
-
-             if assignment.automated_eligible:
-                 total_automated += 1
-             if assignment.human_review_required:
-                 total_human_review += 1
-
-         # Calculate averages
-         for _agent, stats in by_agent.items():
-             if stats["assignments"] > 0:
-                 stats["success_rate"] = stats["successful"] / stats["assignments"]
-             if stats["completed"] > 0:
-                 stats["avg_duration_hours"] = stats["total_duration"] / stats["completed"]
-
-             # Remove helper fields
-             del stats["total_duration"]
-             del stats["quality_scores"]
-             del stats["successful"]  # Remove helper field, keep success_rate
-
-         return {
-             "total_assignments": total_assignments,
-             "by_agent": by_agent,
-             "automation_rate": (
-                 total_automated / total_assignments if total_assignments > 0 else 0.0
-             ),
-             "human_review_rate": (
-                 total_human_review / total_assignments if total_assignments > 0 else 0.0
-             ),
-         }
-
-     def tier1_summary(
-         self,
-         since: datetime | None = None,
-     ) -> dict[str, Any]:
-         """Comprehensive Tier 1 automation summary.
-
-         Args:
-             since: Only consider records after this time
-
-         Returns:
-             Dict combining all Tier 1 metrics
-
-         """
-         return {
-             "task_routing": self.task_routing_accuracy(since),
-             "test_execution": self.test_execution_trends(since),
-             "coverage": self.coverage_progress(since),
-             "agent_performance": self.agent_performance(since),
-             "cost_savings": self.cost_savings_report(since),
-         }
-
-
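The analytics layer is read-only over the same store, and tier1_summary() simply fans out to the individual reports. A usage sketch:

    from datetime import datetime

    analytics = TelemetryAnalytics()  # wraps the default TelemetryStore
    summary = analytics.tier1_summary(since=datetime(2025, 1, 1))
    print(summary["cost_savings"]["savings_percent"])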
- # Singleton for global telemetry
- _telemetry_store: TelemetryStore | None = None
-
-
- def get_telemetry_store(storage_dir: str = ".empathy") -> TelemetryStore:
-     """Get or create the global telemetry store."""
-     global _telemetry_store
-     if _telemetry_store is None:
-         _telemetry_store = TelemetryStore(storage_dir)
-     return _telemetry_store
-
-
- def log_llm_call(record: LLMCallRecord) -> None:
-     """Convenience function to log an LLM call."""
-     get_telemetry_store().log_call(record)
-
-
- def log_workflow_run(record: WorkflowRunRecord) -> None:
-     """Convenience function to log a workflow run."""
-     get_telemetry_store().log_workflow(record)
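One caveat worth knowing: the module-level helpers write through a process-wide singleton, and get_telemetry_store() ignores storage_dir on every call after the first, so the first caller fixes the storage location for the life of the process:

    log_llm_call(rec)  # equivalent to get_telemetry_store().log_call(rec)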