desiru 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/settings.local.json +11 -0
- data/CHANGELOG.md +73 -0
- data/CLAUDE.local.md +3 -0
- data/CLAUDE.md +6 -1
- data/Gemfile.lock +1 -1
- data/README.md +7 -1
- data/desiru-development-swarm.yml +185 -0
- data/lib/desiru/core/compiler.rb +231 -0
- data/lib/desiru/core/example.rb +96 -0
- data/lib/desiru/core/prediction.rb +108 -0
- data/lib/desiru/core/trace.rb +330 -0
- data/lib/desiru/core/traceable.rb +61 -0
- data/lib/desiru/core.rb +12 -0
- data/lib/desiru/module.rb +8 -0
- data/lib/desiru/modules/best_of_n.rb +306 -0
- data/lib/desiru/modules/multi_chain_comparison.rb +72 -20
- data/lib/desiru/modules/predict.rb +7 -0
- data/lib/desiru/modules/program_of_thought.rb +227 -28
- data/lib/desiru/optimizers/base.rb +31 -1
- data/lib/desiru/optimizers/mipro_v2.rb +889 -0
- data/lib/desiru/persistence/repositories/base_repository.rb +1 -1
- data/lib/desiru/version.rb +1 -1
- data/lib/desiru.rb +10 -0
- metadata +13 -1
@@ -1,17 +1,52 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
require 'timeout'
|
4
|
+
|
3
5
|
module Desiru
|
4
6
|
module Modules
|
5
7
|
# ProgramOfThought module that generates executable code to solve problems
|
6
8
|
# Similar to ChainOfThought but produces code instead of reasoning steps
|
9
|
+
# Supports both Ruby and Python code generation
|
7
10
|
class ProgramOfThought < Desiru::Module
|
11
|
+
DEFAULT_SIGNATURE = 'question: string -> answer: string, code: string'
|
12
|
+
|
8
13
|
# Builds a ProgramOfThought module.
#
# Module-specific options are removed from +kwargs+ before anything else
# so that only the remaining keys (config, demos, metadata) travel on to
# the parent initializer.
#
# @param signature [Object, nil] signature to use; falls back to DEFAULT_SIGNATURE
# @param model [Object, nil] model to use for generation
# @param kwargs [Hash] :max_iterations (default 1), :code_language (default 'ruby'),
#   :timeout in seconds (default 5), :safe_mode (default true), plus parent options
def initialize(signature = nil, model: nil, **kwargs)
  @max_iterations = kwargs.delete(:max_iterations) || 1
  requested_language = kwargs.delete(:code_language) || 'ruby'
  @code_language = validate_language(requested_language)
  @timeout = kwargs.delete(:timeout) || 5 # seconds
  # Anything other than an explicit `false` keeps safe mode on.
  @safe_mode = kwargs.delete(:safe_mode) != false

  signature ||= DEFAULT_SIGNATURE

  # A test double quacks like a signature without being a Signature or a String.
  signature_double = signature.respond_to?(:output_fields) && signature.respond_to?(:input_fields) &&
                     !signature.is_a?(Signature) && !signature.is_a?(String)

  unless signature_double
    # Bare super forwards signature, model and the trimmed kwargs to the parent.
    super
    return
  end

  # Doubles are stored as-is and the remaining state is set up here
  # instead of delegating to the parent initializer.
  @signature = signature
  @model = model || Desiru.configuration.default_model
  @config = default_config.merge(kwargs[:config] || {})
  @demos = kwargs[:demos] || []
  @metadata = kwargs[:metadata] || {}
  @call_count = 0
  validate_model! if respond_to?(:validate_model!, true)
  register_module if respond_to?(:register_module, true)
end
|
13
39
|
|
14
40
|
def forward(**inputs)
|
41
|
+
trace_metadata = { code_language: @code_language, safe_mode: @safe_mode }
|
42
|
+
|
43
|
+
if defined?(Desiru::TraceContext) && Desiru::TraceContext.respond_to?(:current) && Desiru::TraceContext.current
|
44
|
+
Desiru::TraceContext.add_metadata(trace_metadata)
|
45
|
+
elsif defined?(Desiru::Core) && Desiru::Core.respond_to?(:trace_context) &&
|
46
|
+
Desiru::Core.trace_context.respond_to?(:current) && Desiru::Core.trace_context.current
|
47
|
+
Desiru::Core.trace_context.add_metadata(trace_metadata)
|
48
|
+
end
|
49
|
+
|
15
50
|
# Enhance the prompt to request code generation
|
16
51
|
code_prompt = build_code_prompt(inputs)
|
17
52
|
|
@@ -23,43 +58,79 @@ module Desiru
|
|
23
58
|
|
24
59
|
generated_code = extract_code(response[:content])
|
25
60
|
|
61
|
+
Desiru.logger.debug("Generated #{@code_language} code: #{generated_code}")
|
62
|
+
|
26
63
|
# Execute the generated code if safe
|
27
|
-
result = if safe_to_execute?(generated_code)
|
28
|
-
|
64
|
+
result = if @safe_mode && !safe_to_execute?(generated_code)
|
65
|
+
{ error: "Generated code deemed unsafe to execute" }
|
29
66
|
else
|
30
|
-
|
67
|
+
execute_code(generated_code, inputs)
|
31
68
|
end
|
32
69
|
|
33
70
|
# Format outputs according to signature
|
34
71
|
format_outputs(result, generated_code)
|
72
|
+
rescue StandardError => e
|
73
|
+
Desiru.logger.error("ProgramOfThought error: #{e.message}")
|
74
|
+
format_error_output(e, '')
|
35
75
|
end
|
36
76
|
|
37
77
|
private
|
38
78
|
|
79
|
+
# Normalizes a language name and checks that it is supported.
#
# @param language [#to_s] requested language, any letter case
# @return [String] the lower-cased language name
# @raise [ModuleError] when the language is neither ruby nor python
def validate_language(language)
  normalized = language.to_s.downcase
  allowed = %w[ruby python]
  return normalized if allowed.include?(normalized)

  raise ModuleError, "Unsupported language: #{language}. Supported: #{allowed.join(', ')}"
end
|
87
|
+
|
39
88
|
# Assembles the code-generation prompt sent to the model.
#
# The prompt lists the inputs (when there are any), the expected output
# fields other than :code, and language-specific instructions for the
# shape of the `solve` entry point.
#
# @param inputs [Hash] input names mapped to their values
# @return [String] the full prompt text
def build_code_prompt(inputs)
  parts = ["You are a programming assistant. Generate #{@code_language} code to solve this problem.\n\n"]

  # Input context is only included when at least one input was given.
  if inputs.any?
    parts << "Given inputs:\n"
    inputs.each { |key, value| parts << "#{key}: #{format_input_value(value)}\n" }
    parts << "\n"
  end

  parts << "Expected outputs:\n"
  signature.output_fields.each do |name, field|
    # The code field is produced by this module itself, not by the generated program.
    next if name == :code

    parts << "- #{name} (#{field.type}): #{field.description || 'No description'}\n"
  end

  parts << "\nGenerate executable #{@code_language} code that processes the inputs "
  parts << "and returns the expected outputs. "
  parts << "Wrap your code in triple backticks with the language identifier.\n"

  parts << if @code_language == 'ruby'
             "The code should define a method called 'solve' that takes the inputs " \
               "as keyword arguments and returns a hash with the output values."
           else # python
             "The code should define a function called 'solve' that takes the inputs " \
               "as keyword arguments and returns a dictionary with the output values."
           end

  parts.join
end
|
62
122
|
|
123
|
+
# Renders one input value as text for the prompt.
#
# Arrays are rendered recursively as "[a, b, c]", hashes as JSON, and
# everything else through #to_s.
#
# @param value [Object] the input value
# @return [String] prompt-ready text
def format_input_value(value)
  return value.to_json if value.is_a?(Hash)
  return value.to_s unless value.is_a?(Array)

  rendered = value.map { |element| format_input_value(element) }
  "[#{rendered.join(', ')}]"
end
|
133
|
+
|
63
134
|
def extract_code(response)
|
64
135
|
# Extract code from markdown code blocks
|
65
136
|
code_match = response.match(/```#{@code_language}?\n(.*?)```/m)
|
@@ -74,8 +145,23 @@ module Desiru
|
|
74
145
|
end
|
75
146
|
|
76
147
|
# Checks generated code against the deny-list for the current language.
#
# Always true when safe mode is off. A language without a deny-list has
# nothing to match against and is therefore reported as safe.
#
# @param code [String] generated source code
# @return [Boolean] true when no dangerous pattern matches
def safe_to_execute?(code)
  return true unless @safe_mode

  providers = { 'ruby' => :ruby_dangerous_patterns, 'python' => :python_dangerous_patterns }
  provider = providers[@code_language]
  denylist = provider ? send(provider) : []

  denylist.each { |pattern| return false if code.match?(pattern) }
  true
end
|
162
|
+
|
163
|
+
def ruby_dangerous_patterns
|
164
|
+
[
|
79
165
|
/system\s*\(/,
|
80
166
|
/exec\s*\(/,
|
81
167
|
/eval\s*\(/,
|
@@ -86,54 +172,167 @@ module Desiru
|
|
86
172
|
/Dir\s*\.\s*delete/,
|
87
173
|
/require\s+['"]net/,
|
88
174
|
/Socket/,
|
89
|
-
/Process\s*\.\s*kill
|
175
|
+
/Process\s*\.\s*kill/,
|
176
|
+
/IO\s*\.\s*popen/,
|
177
|
+
/Open3/,
|
178
|
+
/\$SAFE\s*=/
|
90
179
|
]
|
180
|
+
end
|
91
181
|
|
92
|
-
|
182
|
+
# Deny-list of regular expressions used to screen generated Python code.
#
# @return [Array<Regexp>] patterns grouped by the kind of capability they name
def python_dangerous_patterns
  process_control = [/os\.system/, /subprocess/]
  dynamic_code = [/eval\s*\(/, /exec\s*\(/, /compile\s*\(/, /__import__/]
  # open(...) with a mode string starting with "w" or "a", plus deletion helpers
  filesystem = [/open\s*\([^,)]*,\s*['"][wa]/, /os\.remove/, /shutil\.rmtree/]
  network = [/socket/, /requests/, /urllib/]

  process_control + dynamic_code + filesystem + network
end
|
94
198
|
|
95
199
|
# Routes generated code to the executor for the configured language.
#
# @param code [String] generated source code
# @param inputs [Hash] inputs handed to the language-specific executor
# @return [Hash] the executor's result, or an :error hash for an unknown language
def execute_code(code, inputs)
  executors = { 'ruby' => :execute_ruby_code, 'python' => :execute_python_code }
  executor = executors[@code_language]
  return send(executor, code, inputs) if executor

  { error: "Unsupported language for execution: #{@code_language}" }
end
|
209
|
+
|
210
|
+
# Evaluates generated Ruby source and calls the `solve` method it defines.
#
# The source is instance_eval'd on a fresh Object and `solve` is invoked
# with the inputs as keyword arguments (keys converted to symbols), all
# under a Timeout of @timeout seconds. Evaluating on a bare Object only
# separates the generated methods from this module; it does not restrict
# what the generated code can do.
#
# @param code [String] generated Ruby source
# @param inputs [Hash] inputs passed to `solve` as keyword arguments
# @return [Hash] the hash returned by `solve`, `{ result: value }` for a
#   non-hash return value, or `{ error: message }` on any failure
def execute_ruby_code(code, inputs)
  context = Object.new

  result = Timeout.timeout(@timeout) do
    # Define the generated methods on the throwaway context object.
    context.instance_eval(code)

    if context.respond_to?(:solve)
      context.solve(**inputs.transform_keys(&:to_sym))
    else
      { error: "Generated code does not define a 'solve' method" }
    end
  end

  # Callers index into the result, so a bare return value is wrapped.
  result.is_a?(Hash) ? result : { result: result }
rescue Timeout::Error
  { error: "Code execution timed out after #{@timeout} seconds" }
rescue StandardError, ScriptError, SystemStackError => e
  # SyntaxError and LoadError descend from ScriptError, and SystemStackError
  # sits directly under Exception, so none of them is a StandardError.
  # Generated code that fails to parse, requires a missing library or
  # recurses without bound must still come back as an error hash.
  { error: "Code execution failed: #{e.message}" }
end
|
111
234
|
|
235
|
+
# Placeholder for Python execution: nothing is run.
#
# @param code [String] generated Python source, echoed back under :python_code
# @param _inputs [Hash] unused
# @return [Hash] an :error message, the generated code and an explanatory :note
def execute_python_code(code, _inputs)
  placeholder = { error: "Python code execution not yet implemented. Generated code saved." }
  placeholder[:python_code] = code
  placeholder[:note] = "To execute Python code, integrate with a Python runtime or use system calls in non-safe mode."
  placeholder
end
|
244
|
+
|
112
245
|
# Maps an execution result onto the signature's output fields.
#
# On failure (result[:error] set) the error and any :python_code / :note
# details are copied over and every remaining field gets its declared
# default. On success each field other than :code is looked up by symbol
# or string key and coerced to the field's type; a missing (nil) value
# falls back to the field's default.
#
# @param result [Hash] hash produced by the code executor
# @param generated_code [String] source emitted under :code when the
#   signature declares that field
# @return [Hash] output field names mapped to their values
def format_outputs(result, generated_code)
  outputs = {}

  # Always include the generated code if requested in signature
  outputs[:code] = generated_code if signature.output_fields.key?(:code)

  if result[:error]
    outputs[:error] = result[:error]

    # Extra diagnostics supplied by the Python placeholder executor
    outputs[:python_code] = result[:python_code] if result[:python_code]
    outputs[:note] = result[:note] if result[:note]

    signature.output_fields.each do |name, field|
      next if outputs.key?(name)

      # Use the default as-is: `field.default || nil` would turn a
      # declared default of `false` into nil.
      outputs[name] = field.default
    end
  else
    signature.output_fields.each do |name, field|
      next if name == :code # Already handled

      # key? rather than || so that a legitimate `false` result is kept.
      value = result.key?(name) ? result[name] : result[name.to_s]
      outputs[name] = value.nil? ? field.default : coerce_output_value(value, field)
    end
  end

  outputs
end
|
282
|
+
|
283
|
+
# Builds the output hash returned when the module fails with an exception.
#
# @param error [Exception] the error being reported
# @param code [String] code to report under :code when the signature
#   declares that field (empty by default)
# @return [Hash] :error, optionally :code, and every other output field
#   set to its declared default
def format_error_output(error, code = '')
  outputs = {}

  # Always include code field if it's in the signature, even if empty
  outputs[:code] = code if signature.output_fields.key?(:code)
  outputs[:error] = "ProgramOfThought error: #{error.message}"

  signature.output_fields.each do |name, field|
    next if outputs.key?(name)

    # Use the default as-is: `field.default || nil` would turn a
    # declared default of `false` into nil.
    outputs[name] = field.default
  end

  outputs
end
|
299
|
+
|
300
|
+
# Converts a raw result value to the type declared on an output field.
#
# Falsy values and fields without a type are returned untouched. A value
# that does not look like the target type is also returned unchanged, as
# is the value when anything raises during coercion.
#
# @param value [Object] raw value produced by the generated code
# @param field [#type] output field whose type is :int, :float, :bool,
#   :list, :hash or anything else (left as-is)
# @return [Object] the coerced value, or the original value
def coerce_output_value(value, field)
  kind = value && field.type
  return value unless kind

  case kind
  when :int
    # Only digits with an optional leading minus count as an integer.
    value.to_s.match?(/\A-?\d+\z/) ? value.to_i : value
  when :float
    Float(value.to_s, exception: false) || value
  when :bool
    # "true"/"false" in any letter case map to booleans; otherwise truthiness decides.
    { 'true' => true, 'false' => false }.fetch(value.to_s.downcase) { !!value }
  when :list
    Array(value)
  when :hash
    value.is_a?(Hash) ? value : { value: value }
  else
    value
  end
rescue StandardError
  value
end
|
137
331
|
end
|
138
332
|
end
|
139
333
|
end
|
334
|
+
|
335
|
+
# Register in the main module namespace for convenience
|
336
|
+
module Desiru
|
337
|
+
ProgramOfThought = Modules::ProgramOfThought
|
338
|
+
end
|
@@ -22,7 +22,21 @@ module Desiru
|
|
22
22
|
|
23
23
|
def evaluate(program, dataset)
|
24
24
|
scores = dataset.map do |example|
|
25
|
-
|
25
|
+
# Extract inputs (exclude answer/output fields)
|
26
|
+
inputs = {}
|
27
|
+
if example.respond_to?(:to_h)
|
28
|
+
example.to_h.each do |k, v|
|
29
|
+
inputs[k] = v unless %i[answer output].include?(k)
|
30
|
+
end
|
31
|
+
elsif example.is_a?(Hash)
|
32
|
+
example.each do |k, v|
|
33
|
+
inputs[k] = v unless %i[answer output].include?(k.to_sym)
|
34
|
+
end
|
35
|
+
else
|
36
|
+
inputs = example
|
37
|
+
end
|
38
|
+
|
39
|
+
prediction = program.call(inputs)
|
26
40
|
score_prediction(prediction, example)
|
27
41
|
end
|
28
42
|
|
@@ -55,6 +69,10 @@ module Desiru
|
|
55
69
|
f1_score(prediction, ground_truth)
|
56
70
|
when :accuracy
|
57
71
|
accuracy_score(prediction, ground_truth)
|
72
|
+
when :confidence
|
73
|
+
confidence_score(prediction, ground_truth)
|
74
|
+
when :consistency
|
75
|
+
consistency_score(prediction, ground_truth)
|
58
76
|
else
|
59
77
|
raise OptimizerError, "Unknown metric: #{@metric}"
|
60
78
|
end
|
@@ -86,6 +104,18 @@ module Desiru
|
|
86
104
|
exact_match_score(prediction, ground_truth)
|
87
105
|
end
|
88
106
|
|
107
|
+
# Stand-in confidence metric derived from the exact-match score.
#
# The exact-match score is scaled by 0.9 and shifted up by 0.1; no model
# confidence values are consulted.
#
# @param prediction [Object] the program's prediction
# @param ground_truth [Object] the expected value
# @return [Numeric] scaled and shifted exact-match score
def confidence_score(prediction, ground_truth)
  match = exact_match_score(prediction, ground_truth)
  scaled = match * 0.9
  scaled + 0.1
end
|
112
|
+
|
113
|
+
# Stand-in consistency metric derived from the exact-match score.
#
# The exact-match score is scaled by 0.8 and shifted up by 0.2; nothing
# is tracked across examples.
#
# @param prediction [Object] the program's prediction
# @param ground_truth [Object] the expected value
# @return [Numeric] scaled and shifted exact-match score
def consistency_score(prediction, ground_truth)
  match = exact_match_score(prediction, ground_truth)
  scaled = match * 0.8
  scaled + 0.2
end
|
118
|
+
|
89
119
|
def extract_answer(data)
|
90
120
|
case data
|
91
121
|
when ModuleResult, ProgramResult, Hash
|