language-operator 0.1.63 → 0.1.66
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.plan.md +127 -0
- data/.rspec +3 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +4 -1
- data/Makefile +34 -80
- data/README.md +20 -1
- data/components/agent/Gemfile +1 -1
- data/docs/cheat-sheet.md +173 -0
- data/docs/observability.md +208 -0
- data/lib/language_operator/agent/base.rb +10 -1
- data/lib/language_operator/agent/event_config.rb +172 -0
- data/lib/language_operator/agent/safety/ast_validator.rb +1 -1
- data/lib/language_operator/agent/safety/safe_executor.rb +5 -1
- data/lib/language_operator/agent/task_executor.rb +97 -7
- data/lib/language_operator/agent/telemetry.rb +25 -3
- data/lib/language_operator/agent/web_server.rb +6 -9
- data/lib/language_operator/agent.rb +24 -14
- data/lib/language_operator/cli/commands/agent/base.rb +155 -64
- data/lib/language_operator/cli/commands/agent/code_operations.rb +157 -16
- data/lib/language_operator/cli/commands/cluster.rb +2 -2
- data/lib/language_operator/cli/commands/status.rb +2 -2
- data/lib/language_operator/cli/commands/system/synthesize.rb +1 -1
- data/lib/language_operator/cli/errors/suggestions.rb +1 -1
- data/lib/language_operator/cli/formatters/value_formatter.rb +1 -1
- data/lib/language_operator/cli/helpers/ux_helper.rb +3 -4
- data/lib/language_operator/config.rb +3 -3
- data/lib/language_operator/constants/kubernetes_labels.rb +2 -2
- data/lib/language_operator/constants.rb +1 -0
- data/lib/language_operator/dsl/task_definition.rb +18 -7
- data/lib/language_operator/instrumentation/task_tracer.rb +44 -3
- data/lib/language_operator/kubernetes/client.rb +112 -1
- data/lib/language_operator/templates/schema/CHANGELOG.md +28 -0
- data/lib/language_operator/templates/schema/agent_dsl_openapi.yaml +1 -1
- data/lib/language_operator/templates/schema/agent_dsl_schema.json +1 -1
- data/lib/language_operator/type_coercion.rb +22 -8
- data/lib/language_operator/version.rb +1 -1
- data/synth/002/agent.rb +23 -12
- data/synth/002/output.log +88 -15
- data/synth/003/Makefile +17 -4
- data/synth/003/agent.txt +1 -1
- data/synth/004/Makefile +54 -0
- data/synth/004/README.md +281 -0
- data/synth/004/instructions.txt +1 -0
- metadata +11 -6
- data/lib/language_operator/cli/commands/agent/learning.rb +0 -289
- data/synth/003/agent.optimized.rb +0 -66
- data/synth/003/agent.synthesized.rb +0 -41
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
# Observability and Telemetry
|
|
2
|
+
|
|
3
|
+
The Language Operator gem includes comprehensive OpenTelemetry instrumentation to enable observability, debugging, and optimization of agent executions.
|
|
4
|
+
|
|
5
|
+
## OpenTelemetry Integration
|
|
6
|
+
|
|
7
|
+
The gem automatically instruments agent executions with OpenTelemetry spans, following the [OpenTelemetry Semantic Conventions for GenAI](https://opentelemetry.io/docs/specs/semconv/gen-ai/).
|
|
8
|
+
|
|
9
|
+
### Configuration
|
|
10
|
+
|
|
11
|
+
Configure telemetry via environment variables:
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
# Basic telemetry (always enabled)
|
|
15
|
+
OTEL_EXPORTER_OTLP_ENDPOINT=https://your-otel-collector:4317
|
|
16
|
+
|
|
17
|
+
# Data capture controls (optional - defaults to metadata only)
|
|
18
|
+
CAPTURE_TASK_INPUTS=true # Capture full task inputs as JSON
|
|
19
|
+
CAPTURE_TASK_OUTPUTS=true # Capture full task outputs as JSON
|
|
20
|
+
CAPTURE_TOOL_ARGS=true # Capture tool call arguments
|
|
21
|
+
CAPTURE_TOOL_RESULTS=true # Capture tool call results
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
**Security Note:** Data capture is disabled by default to prevent sensitive information leakage. Only enable full data capture in secure environments.
|
|
25
|
+
|
|
26
|
+
## Span Hierarchy
|
|
27
|
+
|
|
28
|
+
The gem creates a hierarchical trace structure that enables the learning system to identify and analyze complete agent executions:
|
|
29
|
+
|
|
30
|
+
```
|
|
31
|
+
agent_executor (parent span - overall agent run)
|
|
32
|
+
└── task_executor.execute_task (child span - task 1)
|
|
33
|
+
└── execute_tool github (grandchild span - tool call 1)
|
|
34
|
+
└── execute_tool slack (grandchild span - tool call 2)
|
|
35
|
+
└── task_executor.execute_task (child span - task 2)
|
|
36
|
+
└── task_executor.execute_task (child span - task 3)
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
### Span Names
|
|
40
|
+
|
|
41
|
+
| Span Name | Purpose | Created By |
|
|
42
|
+
|-----------|---------|------------|
|
|
43
|
+
| `agent_executor` | Overall agent execution | `LanguageOperator::Agent.execute_main_block()` |
|
|
44
|
+
| `task_executor.execute_task` | Individual task execution | `TaskExecutor#execute_task()` |
|
|
45
|
+
| `execute_tool #{tool_name}` | Tool calls from LLM responses | `TaskTracer#record_single_tool_call()` |
|
|
46
|
+
| `execute_tool.#{tool_name}` | Direct tool calls from symbolic tasks | `Client::Base` tool wrapper |
|
|
47
|
+
|
|
48
|
+
## Span Attributes
|
|
49
|
+
|
|
50
|
+
### Agent Executor Span
|
|
51
|
+
|
|
52
|
+
The top-level `agent_executor` span includes:
|
|
53
|
+
|
|
54
|
+
```
|
|
55
|
+
agent.name: "my-agent" # Agent identifier
|
|
56
|
+
agent.task_count: 5 # Number of tasks in agent
|
|
57
|
+
agent.mode: "autonomous" # Execution mode (autonomous/scheduled/interactive)
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### Task Executor Span
|
|
61
|
+
|
|
62
|
+
Each `task_executor.execute_task` span includes:
|
|
63
|
+
|
|
64
|
+
```
|
|
65
|
+
# Core identification (CRITICAL for learning system)
|
|
66
|
+
task.name: "fetch_user_data" # Task identifier
|
|
67
|
+
gen_ai.operation.name: "execute_task" # Operation type
|
|
68
|
+
|
|
69
|
+
# Execution metadata
|
|
70
|
+
task.max_retries: 3 # Retry configuration
|
|
71
|
+
task.timeout: 30000 # Timeout in milliseconds
|
|
72
|
+
task.type: "hybrid" # Task type (neural/symbolic/hybrid)
|
|
73
|
+
task.has_neural: "true" # Has neural implementation
|
|
74
|
+
task.has_symbolic: "false" # Has symbolic implementation
|
|
75
|
+
|
|
76
|
+
# Agent context
|
|
77
|
+
agent.name: "my-agent" # Agent identifier (explicit for learning system)
|
|
78
|
+
|
|
79
|
+
# Data capture (when enabled)
|
|
80
|
+
task.inputs: '{"user_id": 123}' # JSON-encoded inputs (CAPTURE_TASK_INPUTS=true)
|
|
81
|
+
task.outputs: '{"user": {...}}' # JSON-encoded outputs (CAPTURE_TASK_OUTPUTS=true)
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### Tool Call Spans
|
|
85
|
+
|
|
86
|
+
Tool calls create spans with names like `execute_tool #{tool_name}` and include:
|
|
87
|
+
|
|
88
|
+
```
|
|
89
|
+
# GenAI semantic attributes
|
|
90
|
+
gen_ai.operation.name: "execute_tool" # Operation type
|
|
91
|
+
gen_ai.tool.name: "github" # Tool identifier
|
|
92
|
+
gen_ai.tool.call.id: "call_123" # Call ID (if available)
|
|
93
|
+
|
|
94
|
+
# Data capture (when enabled)
|
|
95
|
+
gen_ai.tool.call.arguments: '{"repo": "..."}' # JSON arguments (CAPTURE_TOOL_ARGS=true)
|
|
96
|
+
gen_ai.tool.call.result: '{"status": "ok"}' # JSON result (CAPTURE_TOOL_RESULTS=true)
|
|
97
|
+
|
|
98
|
+
# Size metadata (always captured)
|
|
99
|
+
gen_ai.tool.call.arguments.size: 45 # Arguments size in bytes
|
|
100
|
+
gen_ai.tool.call.result.size: 1024 # Result size in bytes
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Learning System Integration
|
|
104
|
+
|
|
105
|
+
This span naming convention enables the language-operator Kubernetes controller to:
|
|
106
|
+
|
|
107
|
+
1. **Identify Task Executions**: Query traces by `task_executor.execute_task` spans
|
|
108
|
+
2. **Group by Agent**: Filter by `agent.name` attribute
|
|
109
|
+
3. **Analyze Patterns**: Extract execution patterns from span attributes
|
|
110
|
+
4. **Build Optimizations**: Create optimized implementations based on trace analysis
|
|
111
|
+
|
|
112
|
+
### Example Trace Query
|
|
113
|
+
|
|
114
|
+
To find all task executions for an agent in the last hour (SQL-style example; adapt the syntax to your trace backend's query language):
|
|
115
|
+
|
|
116
|
+
```sql
|
|
117
|
+
SELECT * FROM spans
|
|
118
|
+
WHERE name = 'task_executor.execute_task'
|
|
119
|
+
AND attributes['agent.name'] = 'my-agent'
|
|
120
|
+
AND start_time > NOW() - INTERVAL '1 hour'
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## Data Privacy and Security
|
|
124
|
+
|
|
125
|
+
### Default Behavior (Secure)
|
|
126
|
+
|
|
127
|
+
By default, the gem captures:
|
|
128
|
+
- ✅ Task names and metadata
|
|
129
|
+
- ✅ Execution timing and counts
|
|
130
|
+
- ✅ Tool names and call frequencies
|
|
131
|
+
- ✅ Data sizes (bytes)
|
|
132
|
+
- ❌ **NOT** actual data content
|
|
133
|
+
|
|
134
|
+
### Full Data Capture (Optional)
|
|
135
|
+
|
|
136
|
+
When explicitly enabled, the gem additionally captures:
|
|
137
|
+
- ⚠️ Complete task inputs and outputs as JSON
|
|
138
|
+
- ⚠️ Tool call arguments and results
|
|
139
|
+
- ⚠️ LLM prompts and responses
|
|
140
|
+
|
|
141
|
+
**Warning:** Only enable full data capture in development or secure production environments. Captured data may contain sensitive information.
|
|
142
|
+
|
|
143
|
+
### Data Sanitization
|
|
144
|
+
|
|
145
|
+
When full capture is enabled, the gem:
|
|
146
|
+
- Truncates large payloads (>1000 chars for span attributes)
|
|
147
|
+
- Converts complex objects to JSON automatically
|
|
148
|
+
- Respects OpenTelemetry attribute limits
|
|
149
|
+
|
|
150
|
+
## Performance Impact
|
|
151
|
+
|
|
152
|
+
Telemetry overhead is minimal:
|
|
153
|
+
- **Default mode**: <5% performance overhead
|
|
154
|
+
- **Full capture mode**: ~10% performance overhead
|
|
155
|
+
- **Span creation**: <1ms per span
|
|
156
|
+
- **Data serialization**: 1-5ms for complex objects
|
|
157
|
+
|
|
158
|
+
## Debugging with Traces
|
|
159
|
+
|
|
160
|
+
### Common Queries
|
|
161
|
+
|
|
162
|
+
**Find slow tasks:**
|
|
163
|
+
```sql
|
|
164
|
+
SELECT attributes['task.name'], duration_ms
|
|
165
|
+
FROM spans
|
|
166
|
+
WHERE name = 'task_executor.execute_task'
|
|
167
|
+
AND duration_ms > 5000
|
|
168
|
+
ORDER BY duration_ms DESC
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
**Tool usage analysis:**
|
|
172
|
+
```sql
|
|
173
|
+
SELECT attributes['gen_ai.tool.name'], COUNT(*)
|
|
174
|
+
FROM spans
|
|
175
|
+
WHERE name LIKE 'execute_tool%'
|
|
176
|
+
GROUP BY attributes['gen_ai.tool.name']
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
**Agent execution frequency:**
|
|
180
|
+
```sql
|
|
181
|
+
SELECT attributes['agent.name'], COUNT(*) as executions
|
|
182
|
+
FROM spans
|
|
183
|
+
WHERE name = 'agent_executor'
|
|
184
|
+
AND start_time > NOW() - INTERVAL '24 hours'
|
|
185
|
+
GROUP BY attributes['agent.name']
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
### Trace Sampling
|
|
189
|
+
|
|
190
|
+
For high-volume agents, consider trace sampling:
|
|
191
|
+
|
|
192
|
+
```bash
|
|
193
|
+
# Sample 10% of traces
|
|
194
|
+
OTEL_TRACES_SAMPLER=parentbased_traceidratio
|
|
195
|
+
OTEL_TRACES_SAMPLER_ARG=0.1
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
## Related Documentation
|
|
199
|
+
|
|
200
|
+
- [Agent Runtime Architecture](./agent-internals.md) - How agents execute
|
|
201
|
+
- [Best Practices](./best-practices.md) - Production deployment guidance
|
|
202
|
+
- [Understanding Generated Code](./understanding-generated-code.md) - Agent code structure
|
|
203
|
+
|
|
204
|
+
## External Resources
|
|
205
|
+
|
|
206
|
+
- [OpenTelemetry Semantic Conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/)
|
|
207
|
+
- [Language Operator Controller](https://github.com/language-operator/language-operator) - Learning system implementation
|
|
208
|
+
- [OTLP Specification](https://opentelemetry.io/docs/specs/otlp/) - Wire format
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require_relative '../client'
|
|
4
4
|
require_relative '../constants'
|
|
5
|
+
require_relative '../kubernetes/client'
|
|
5
6
|
require_relative 'telemetry'
|
|
6
7
|
require_relative 'instrumentation'
|
|
7
8
|
|
|
@@ -21,7 +22,7 @@ module LanguageOperator
|
|
|
21
22
|
class Base < LanguageOperator::Client::Base
|
|
22
23
|
include Instrumentation
|
|
23
24
|
|
|
24
|
-
attr_reader :workspace_path, :mode
|
|
25
|
+
attr_reader :workspace_path, :mode, :kubernetes_client
|
|
25
26
|
|
|
26
27
|
# Initialize the agent
|
|
27
28
|
#
|
|
@@ -40,6 +41,14 @@ module LanguageOperator
|
|
|
40
41
|
@workspace_path = ENV.fetch('WORKSPACE_PATH', '/workspace')
|
|
41
42
|
@mode = agent_mode_with_default
|
|
42
43
|
@executor = nil
|
|
44
|
+
|
|
45
|
+
# Initialize Kubernetes client for event emission (only in K8s environments)
|
|
46
|
+
@kubernetes_client = begin
|
|
47
|
+
LanguageOperator::Kubernetes::Client.instance if ENV.fetch('KUBERNETES_SERVICE_HOST', nil)
|
|
48
|
+
rescue StandardError => e
|
|
49
|
+
logger.warn('Failed to initialize Kubernetes client', error: e.message)
|
|
50
|
+
nil
|
|
51
|
+
end
|
|
43
52
|
end
|
|
44
53
|
|
|
45
54
|
# Run the agent in its configured mode
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../config'
|
|
4
|
+
|
|
5
|
+
module LanguageOperator
  module Agent
    # Event emission configuration for agent runtime
    #
    # Manages configuration for Kubernetes event emission including:
    # - Event filtering and batching options
    # - Error handling preferences
    # - Performance tuning settings
    #
    # Every public helper accepts an optional, already-loaded config hash so
    # callers can load once and reuse it; when omitted the hash is loaded
    # from the environment on each call.
    #
    # @example Load event configuration
    #   config = EventConfig.load
    #   puts "Events enabled: #{config[:enabled]}"
    #   puts "Max events per minute: #{config[:rate_limit_per_minute]}"
    module EventConfig
      # Load event emission configuration from environment variables
      #
      # Delegates to Config.from_env with three parallel hashes keyed by the
      # same symbols: the environment variable name, the default (as a
      # string), and the type each value is declared as.
      #
      # @return [Hash] Event configuration hash
      def self.load
        Config.from_env(
          {
            # Core event emission settings
            enabled: 'ENABLE_K8S_EVENTS',
            disabled: 'DISABLE_K8S_EVENTS',

            # Event filtering
            emit_success_events: 'EMIT_SUCCESS_EVENTS',
            emit_failure_events: 'EMIT_FAILURE_EVENTS',
            emit_validation_events: 'EMIT_VALIDATION_EVENTS',

            # Performance and rate limiting
            rate_limit_per_minute: 'EVENT_RATE_LIMIT_PER_MINUTE',
            batch_size: 'EVENT_BATCH_SIZE',
            batch_timeout_ms: 'EVENT_BATCH_TIMEOUT_MS',

            # Error handling
            retry_failed_events: 'RETRY_FAILED_EVENTS',
            max_event_retries: 'MAX_EVENT_RETRIES',
            retry_delay_ms: 'EVENT_RETRY_DELAY_MS',

            # Event content control
            include_task_metadata: 'INCLUDE_TASK_METADATA',
            include_error_details: 'INCLUDE_ERROR_DETAILS',
            truncate_long_messages: 'TRUNCATE_LONG_MESSAGES',
            max_message_length: 'MAX_EVENT_MESSAGE_LENGTH'
          },
          defaults: {
            enabled: 'true',
            disabled: 'false',
            emit_success_events: 'true',
            emit_failure_events: 'true',
            emit_validation_events: 'true',
            rate_limit_per_minute: '60',
            batch_size: '1',
            batch_timeout_ms: '1000',
            retry_failed_events: 'true',
            max_event_retries: '3',
            retry_delay_ms: '1000',
            include_task_metadata: 'true',
            include_error_details: 'true',
            truncate_long_messages: 'true',
            max_message_length: '1000'
          },
          types: {
            enabled: :boolean,
            disabled: :boolean,
            emit_success_events: :boolean,
            emit_failure_events: :boolean,
            emit_validation_events: :boolean,
            rate_limit_per_minute: :integer,
            batch_size: :integer,
            batch_timeout_ms: :integer,
            retry_failed_events: :boolean,
            max_event_retries: :integer,
            retry_delay_ms: :integer,
            include_task_metadata: :boolean,
            include_error_details: :boolean,
            truncate_long_messages: :boolean,
            max_message_length: :integer
          }
        )
      end

      # Check if event emission is enabled overall
      #
      # Events are emitted only when all of the following hold:
      # - Running in Kubernetes (KUBERNETES_SERVICE_HOST is set)
      # - The legacy disable flag is off (config[:disabled], from
      #   DISABLE_K8S_EVENTS, default 'false')
      # - The enable flag has not been turned off (config[:enabled], from
      #   ENABLE_K8S_EVENTS, default 'true')
      #
      # @param config [Hash, nil] Configuration hash from load (loaded when nil)
      # @return [Boolean] True if events should be emitted
      def self.enabled?(config = nil)
        config ||= load

        # Must be in Kubernetes environment
        return false unless ENV.fetch('KUBERNETES_SERVICE_HOST', nil)

        # Respect explicit disable flag (legacy)
        return false if config[:disabled]

        # Check enable flag
        config[:enabled]
      end

      # Check if specific event type should be emitted
      #
      # @param event_type [Symbol] Event type (:success, :failure, :validation)
      # @param config [Hash, nil] Configuration hash from load (loaded when nil)
      # @return [Boolean] True if this event type should be emitted; unknown
      #   event types are never emitted
      def self.should_emit?(event_type, config = nil)
        # Load once so the global gate and the per-type flag are read from
        # the same configuration hash.
        config ||= load
        return false unless enabled?(config)

        case event_type
        when :success
          config[:emit_success_events]
        when :failure
          config[:emit_failure_events]
        when :validation
          config[:emit_validation_events]
        else
          false
        end
      end

      # Get rate limiting configuration
      #
      # @param config [Hash, nil] Configuration hash from load (loaded when nil)
      # @return [Hash] Rate limiting settings (:per_minute, :batch_size, :batch_timeout_ms)
      def self.rate_limit_config(config = nil)
        config ||= load
        {
          per_minute: config[:rate_limit_per_minute],
          batch_size: config[:batch_size],
          batch_timeout_ms: config[:batch_timeout_ms]
        }
      end

      # Get retry configuration for failed events
      #
      # @param config [Hash, nil] Configuration hash from load (loaded when nil)
      # @return [Hash] Retry settings (:enabled, :max_retries, :delay_ms)
      def self.retry_config(config = nil)
        config ||= load
        {
          enabled: config[:retry_failed_events],
          max_retries: config[:max_event_retries],
          delay_ms: config[:retry_delay_ms]
        }
      end

      # Get content configuration for event messages
      #
      # @param config [Hash, nil] Configuration hash from load (loaded when nil)
      # @return [Hash] Content settings (:include_task_metadata,
      #   :include_error_details, :truncate_long_messages, :max_message_length)
      def self.content_config(config = nil)
        config ||= load
        {
          include_task_metadata: config[:include_task_metadata],
          include_error_details: config[:include_error_details],
          truncate_long_messages: config[:truncate_long_messages],
          max_message_length: config[:max_message_length]
        }
      end
    end
  end
end
|
|
@@ -36,7 +36,8 @@ module LanguageOperator
|
|
|
36
36
|
|
|
37
37
|
# Step 3: Execute using instance_eval with smart constant injection
|
|
38
38
|
# Only inject constants that won't conflict with user-defined ones
|
|
39
|
-
safe_constants = %w[Numeric Integer Float String Array Hash TrueClass FalseClass Time Date
|
|
39
|
+
safe_constants = %w[Numeric Integer Float String Array Hash TrueClass FalseClass Time Date
|
|
40
|
+
ArgumentError TypeError RuntimeError StandardError]
|
|
40
41
|
|
|
41
42
|
# Find which constants user code defines to avoid redefinition warnings
|
|
42
43
|
user_defined_constants = safe_constants.select { |const| code.include?("#{const} =") }
|
|
@@ -129,6 +130,9 @@ module LanguageOperator
|
|
|
129
130
|
when :TrueClass, :FalseClass, :NilClass
|
|
130
131
|
# Allow boolean and nil types
|
|
131
132
|
::Object.const_get(name)
|
|
133
|
+
when :ArgumentError, :TypeError, :RuntimeError, :StandardError
|
|
134
|
+
# Allow standard Ruby exception classes for error handling
|
|
135
|
+
::Object.const_get(name)
|
|
132
136
|
else
|
|
133
137
|
# Security-by-default: explicitly deny access to any other constants
|
|
134
138
|
# This prevents sandbox bypass through const_missing fallback
|
|
@@ -106,15 +106,11 @@ module LanguageOperator
|
|
|
106
106
|
def execute_task(task_name, inputs: {}, timeout: nil, max_retries: nil)
|
|
107
107
|
execution_start = Time.now
|
|
108
108
|
max_retries ||= @config[:max_retries]
|
|
109
|
-
|
|
109
|
+
|
|
110
110
|
# Reset JSON parsing retry flag for this task
|
|
111
111
|
@parsing_retry_attempted = false
|
|
112
112
|
|
|
113
|
-
with_span('task_executor.execute_task', attributes:
|
|
114
|
-
'task.name' => task_name.to_s,
|
|
115
|
-
'task.inputs' => inputs.keys.map(&:to_s).join(','),
|
|
116
|
-
'task.max_retries' => max_retries
|
|
117
|
-
}) do
|
|
113
|
+
with_span('task_executor.execute_task', attributes: build_task_execution_attributes(task_name, inputs, max_retries)) do
|
|
118
114
|
# Fast task lookup using pre-built cache
|
|
119
115
|
task_name_sym = task_name.to_sym
|
|
120
116
|
task_info = @task_cache[task_name_sym]
|
|
@@ -140,15 +136,31 @@ module LanguageOperator
|
|
|
140
136
|
OpenTelemetry::Trace.current_span&.set_attribute('task.timeout', timeout)
|
|
141
137
|
|
|
142
138
|
# Execute with retry logic
|
|
143
|
-
execute_with_retry(task, task_name, inputs, timeout, max_retries, execution_start)
|
|
139
|
+
result = execute_with_retry(task, task_name, inputs, timeout, max_retries, execution_start)
|
|
140
|
+
|
|
141
|
+
# Add task outputs to span for learning system (if enabled)
|
|
142
|
+
current_span = OpenTelemetry::Trace.current_span
|
|
143
|
+
current_span&.set_attribute('task.outputs', result.to_json) if current_span && capture_enabled?(:outputs)
|
|
144
|
+
|
|
145
|
+
# Emit Kubernetes event for successful task completion
|
|
146
|
+
emit_task_execution_event(task_name, success: true, execution_start: execution_start)
|
|
147
|
+
|
|
148
|
+
result
|
|
144
149
|
end
|
|
145
150
|
rescue ArgumentError => e
|
|
146
151
|
# Validation errors should not be retried - re-raise immediately
|
|
147
152
|
log_task_error(task_name, e, :validation, execution_start)
|
|
153
|
+
emit_task_execution_event(task_name, success: false, execution_start: execution_start, error: e, event_type: :validation)
|
|
148
154
|
raise TaskValidationError.new(task_name, e.message, e)
|
|
155
|
+
rescue TaskValidationError => e
|
|
156
|
+
# TaskValidationError from validate_inputs should be logged as :validation
|
|
157
|
+
log_task_error(task_name, e, :validation, execution_start)
|
|
158
|
+
emit_task_execution_event(task_name, success: false, execution_start: execution_start, error: e, event_type: :validation)
|
|
159
|
+
raise e
|
|
149
160
|
rescue StandardError => e
|
|
150
161
|
# Catch any unexpected errors that escaped retry logic
|
|
151
162
|
log_task_error(task_name, e, :system, execution_start)
|
|
163
|
+
emit_task_execution_event(task_name, success: false, execution_start: execution_start, error: e)
|
|
152
164
|
raise create_appropriate_error(task_name, e)
|
|
153
165
|
end
|
|
154
166
|
|
|
@@ -371,6 +383,39 @@ module LanguageOperator
|
|
|
371
383
|
'Agent::TaskExecutor'
|
|
372
384
|
end
|
|
373
385
|
|
|
386
|
+
# Emit Kubernetes event for task execution
#
# Best-effort: the method returns without doing anything when the agent
# exposes no Kubernetes client (or the client is nil), and any error raised
# while emitting is logged as a warning instead of propagating to the task.
#
# @param task_name [Symbol, String] Task name
# @param success [Boolean] Whether task succeeded
# @param execution_start [Time] Task execution start time
# @param error [Exception, nil] Error if task failed
# @param event_type [Symbol, nil] Event type override (:success, :failure, :validation);
#   accepted for call-site compatibility but not currently read by this method
# @return [void]
def emit_task_execution_event(task_name, success:, execution_start:, error: nil, event_type: nil)
  return unless @agent.respond_to?(:kubernetes_client)

  # The reader can exist yet hold nil; without this guard the call below
  # raises NoMethodError and the rescue logs a warning for every task.
  client = @agent.kubernetes_client
  return unless client

  duration_ms = ((Time.now - execution_start) * 1000).round(2)

  metadata = {
    'task_type' => determine_task_type(@tasks[task_name.to_sym])
  }

  if error
    metadata['error_type'] = error.class.name
    metadata['error_category'] = categorize_error(error).to_s
  end

  client.emit_execution_event(
    task_name.to_s,
    success: success,
    duration_ms: duration_ms,
    metadata: metadata
  )
rescue StandardError => e
  # Event emission must never fail the task itself.
  logger.warn('Failed to emit task execution event',
              task: task_name,
              error: e.message)
end
|
|
418
|
+
|
|
374
419
|
# Summarize hash values for logging (truncate long strings)
|
|
375
420
|
# Optimized for performance with lazy computation
|
|
376
421
|
#
|
|
@@ -620,6 +665,8 @@ module LanguageOperator
|
|
|
620
665
|
# @param task [TaskDefinition] The task definition
|
|
621
666
|
# @return [String] Task type
|
|
622
667
|
def determine_task_type(task)
|
|
668
|
+
return nil unless task
|
|
669
|
+
|
|
623
670
|
if task.neural? && task.symbolic?
|
|
624
671
|
'hybrid'
|
|
625
672
|
elsif task.neural?
|
|
@@ -964,6 +1011,49 @@ module LanguageOperator
|
|
|
964
1011
|
end
|
|
965
1012
|
cache
|
|
966
1013
|
end
|
|
1014
|
+
|
|
1015
|
+
# Build semantic attributes for task execution span
#
# Includes attributes required for learning status tracking:
# - task.name: Task identifier for learning controller
# - agent.name: Agent identifier (explicit for learning system)
# - gen_ai.operation.name: Semantic operation name
#
# Building these attributes is telemetry-only work, so a failure to
# JSON-encode the inputs falls back to the key list rather than raising.
#
# @param task_name [Symbol] Name of the task being executed
# @param inputs [Hash] Task input parameters
# @param max_retries [Integer] Maximum retry attempts
# @return [Hash] Span attributes
def build_task_execution_attributes(task_name, inputs, max_retries)
  attributes = {
    # Core task identification (CRITICAL for learning system)
    'task.name' => task_name.to_s,
    'task.max_retries' => max_retries,

    # Semantic operation name for better trace organization
    'gen_ai.operation.name' => 'execute_task'
  }

  # Add task inputs - JSON-encoded if capture enabled, else just keys
  input_keys = inputs.keys.map(&:to_s).join(',')
  attributes['task.inputs'] =
    if capture_enabled?(:inputs)
      begin
        inputs.to_json
      rescue StandardError
        # e.g. NaN/Infinity floats cannot be encoded; keep the metadata-only
        # form so optional capture never aborts the task.
        input_keys
      end
    else
      input_keys
    end

  # Explicitly add agent name if available (redundant with resource attribute but ensures visibility)
  if (agent_name = ENV.fetch('AGENT_NAME', nil))
    attributes['agent.name'] = agent_name
  end

  # Add task type information if available
  if (task_info = @task_cache[task_name.to_sym])
    attributes['task.type'] = task_info[:type]
    attributes['task.has_neural'] = task_info[:neural].to_s
    attributes['task.has_symbolic'] = task_info[:symbolic].to_s
  end

  attributes
end
|
|
967
1057
|
end
|
|
968
1058
|
end
|
|
969
1059
|
end
|
|
@@ -70,6 +70,11 @@ module LanguageOperator
|
|
|
70
70
|
|
|
71
71
|
# Build resource attributes from environment variables
|
|
72
72
|
#
|
|
73
|
+
# Includes semantic attributes required for learning status tracking:
|
|
74
|
+
# - agent.name: Required for learning controller to identify agent executions
|
|
75
|
+
# - agent.mode: Agent operating mode (autonomous, scheduled, reactive)
|
|
76
|
+
# - service.version: Agent runtime version for observability
|
|
77
|
+
#
|
|
73
78
|
# @return [Hash] Resource attributes
|
|
74
79
|
def build_resource_attributes
|
|
75
80
|
attributes = {}
|
|
@@ -83,9 +88,26 @@ module LanguageOperator
|
|
|
83
88
|
# Kubernetes pod name
|
|
84
89
|
attributes['k8s.pod.name'] = ENV['HOSTNAME'] if ENV['HOSTNAME']
|
|
85
90
|
|
|
86
|
-
# Agent-specific attributes
|
|
87
|
-
|
|
88
|
-
|
|
91
|
+
# Agent-specific attributes (CRITICAL for learning system)
|
|
92
|
+
if (agent_name = ENV.fetch('AGENT_NAME', nil))
|
|
93
|
+
attributes['agent.name'] = agent_name
|
|
94
|
+
# Also set as service.name for better trace organization
|
|
95
|
+
attributes['service.name'] = "language-operator-agent-#{agent_name}"
|
|
96
|
+
else
|
|
97
|
+
warn 'AGENT_NAME environment variable not set - learning status tracking may not work correctly'
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
if (agent_mode = ENV.fetch('AGENT_MODE', nil))
|
|
101
|
+
attributes['agent.mode'] = agent_mode
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
# Agent runtime version for observability
|
|
105
|
+
attributes['service.version'] = LanguageOperator::VERSION if defined?(LanguageOperator::VERSION)
|
|
106
|
+
|
|
107
|
+
# Agent cluster context
|
|
108
|
+
if (cluster_name = ENV.fetch('AGENT_CLUSTER', nil))
|
|
109
|
+
attributes['agent.cluster'] = cluster_name
|
|
110
|
+
end
|
|
89
111
|
|
|
90
112
|
attributes
|
|
91
113
|
end
|
|
@@ -179,16 +179,13 @@ module LanguageOperator
|
|
|
179
179
|
|
|
180
180
|
# Drain and cleanup all executors in the pool
|
|
181
181
|
executors_cleaned = 0
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
end
|
|
182
|
+
|
|
183
|
+
until @executor_pool.empty?
|
|
184
|
+
executor = @executor_pool.pop unless @executor_pool.empty?
|
|
185
|
+
if executor
|
|
186
|
+
executor.cleanup_connections
|
|
187
|
+
executors_cleaned += 1
|
|
189
188
|
end
|
|
190
|
-
rescue ThreadError
|
|
191
|
-
# Pool is empty, we're done
|
|
192
189
|
end
|
|
193
190
|
|
|
194
191
|
puts "Cleaned up #{executors_cleaned} executors from pool"
|