dspy 0.28.1 → 0.28.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,177 @@
+ # frozen_string_literal: true
+
+ require 'sorbet-runtime'
+ require_relative '../signature'
+ require_relative '../predict'
+
+ module DSPy
+   module Propose
+     # Dataset Summary Generator for creating concise dataset descriptions
+     # Used by GroundedProposer for data-aware instruction generation
+     module DatasetSummaryGenerator
+       extend T::Sig
+
+       # Signature for summarizing observations into a brief summary
+       class ObservationSummarizer < DSPy::Signature
+         description "Given a series of observations I have made about my dataset, please summarize them into a brief 2-3 sentence summary which highlights only the most important details."
+
+         input do
+           const :observations, String, description: "Observations I have made about my dataset"
+         end
+
+         output do
+           const :summary, String, description: "Two to Three sentence summary of only the most significant highlights of my observations"
+         end
+       end
+
+       # Signature for generating initial dataset observations
+       class DatasetDescriptor < DSPy::Signature
+         description "Given several examples from a dataset please write observations about trends that hold for most or all of the samples. " \
+           "Some areas you may consider in your observations: topics, content, syntax, conciceness, etc. " \
+           "It will be useful to make an educated guess as to the nature of the task this dataset will enable. Don't be afraid to be creative"
+
+         input do
+           const :examples, String, description: "Sample data points from the dataset"
+         end
+
+         output do
+           const :observations, String, description: "Somethings that holds true for most or all of the data you observed"
+         end
+       end
+
+       # Signature for refining observations with prior context
+       class DatasetDescriptorWithPriorObservations < DSPy::Signature
+         description "Given several examples from a dataset please write observations about trends that hold for most or all of the samples. " \
+           "I will also provide you with a few observations I have already made. Please add your own observations or if you feel the observations are comprehensive say 'COMPLETE' " \
+           "Some areas you may consider in your observations: topics, content, syntax, conciceness, etc. " \
+           "It will be useful to make an educated guess as to the nature of the task this dataset will enable. Don't be afraid to be creative"
+
+         input do
+           const :examples, String, description: "Sample data points from the dataset"
+           const :prior_observations, String, description: "Some prior observations I made about the data"
+         end
+
+         output do
+           const :observations, String, description: "Somethings that holds true for most or all of the data you observed or COMPLETE if you have nothing to add"
+         end
+       end
+
+       # Helper function to ensure consistent ordering of input keys in string representations
+       # This helps with caching and consistent LLM prompts
+       sig { params(unordered_repr: String).returns(String) }
+       def self.order_input_keys_in_string(unordered_repr)
+         # Regex pattern to match the input keys structure
+         pattern = /input_keys=\{([^}]+)\}/
+
+         # Function to reorder keys
+         unordered_repr.gsub(pattern) do |match|
+           keys_str = Regexp.last_match(1)
+           # Split the keys, strip extra spaces, and sort them
+           keys = keys_str.split(',').map(&:strip).sort
+           # Format the sorted keys back into the expected structure
+           "input_keys={#{keys.join(', ')}}"
+         end
+       end
+
+       # Strip common prefixes from LLM outputs (e.g., "Answer:", "Output:")
+       sig { params(text: String).returns(String) }
+       def self.strip_prefix(text)
+         # Pattern matches up to 4 words followed by a colon
+         pattern = /^[\*\s]*(([\w'\-]+\s+){0,4}[\w'\-]+):\s*/
+         modified_text = text.gsub(pattern, '')
+         modified_text.strip.gsub(/^["']|["']$/, '')
+       end
+
+       # Generate a concise 2-3 sentence summary of a training dataset
+       # Used for data-aware instruction proposal in MIPROv2
+       #
+       # @param trainset [Array<DSPy::Example>] Training examples to summarize
+       # @param view_data_batch_size [Integer] Number of examples to process per batch
+       # @param prompt_model [DSPy::LM, nil] Language model to use (defaults to DSPy.lm)
+       # @param verbose [Boolean] Whether to print progress information
+       # @return [String] 2-3 sentence summary of the dataset characteristics
+       #
+       # @example Basic usage
+       #   summary = DatasetSummaryGenerator.create_dataset_summary(
+       #     trainset,
+       #     view_data_batch_size: 10,
+       #     prompt_model: DSPy::LM.new('gpt-4o-mini')
+       #   )
+       #
+       sig do
+         params(
+           trainset: T::Array[DSPy::Example],
+           view_data_batch_size: Integer,
+           prompt_model: T.nilable(DSPy::LM),
+           verbose: T::Boolean
+         ).returns(String)
+       end
+       def self.create_dataset_summary(trainset, view_data_batch_size, prompt_model, verbose: false)
+         if verbose
+           puts "\nBootstrapping dataset summary (this will be used to generate instructions)..."
+         end
+
+         # Use provided model or fall back to global LM
+         lm = prompt_model || DSPy.lm
+         raise ArgumentError, "No language model configured. Set prompt_model or DSPy.lm" unless lm
+
+         # Use provided LM in a block context
+         DSPy.with_lm(lm) do
+           # Initial observation from first batch
+           upper_lim = [trainset.length, view_data_batch_size].min
+           examples_repr = order_input_keys_in_string(trainset[0...upper_lim].inspect)
+
+           predictor = DSPy::Predict.new(DatasetDescriptor)
+           observation = predictor.call(examples: examples_repr)
+           observations = observation.observations
+
+           # Iteratively refine observations with additional batches
+           skips = 0
+           max_calls = 10
+           calls = 0
+
+           begin
+             (view_data_batch_size...trainset.length).step(view_data_batch_size) do |b|
+               calls += 1
+               break if calls >= max_calls
+
+               puts "Processing batch starting at index #{b}" if verbose
+
+               upper_lim = [trainset.length, b + view_data_batch_size].min
+               examples_repr = order_input_keys_in_string(trainset[b...upper_lim].inspect)
+
+               predictor = DSPy::Predict.new(DatasetDescriptorWithPriorObservations)
+               output = predictor.call(
+                 prior_observations: observations,
+                 examples: examples_repr
+               )
+
+               # Check if LLM indicates observations are complete
+               if output.observations.length >= 8 && output.observations[0...8].upcase == "COMPLETE"
+                 skips += 1
+                 break if skips >= 5
+                 next
+               end
+
+               observations += output.observations
+             end
+           rescue => e
+             if verbose
+               puts "Error during observation refinement: #{e.message}. Using observations from past round for summary."
+             end
+           end
+
+           # Generate final summary from accumulated observations
+           predictor = DSPy::Predict.new(ObservationSummarizer)
+           summary = predictor.call(observations: observations)
+
+           if verbose
+             puts "\nGenerated summary: #{strip_prefix(summary.summary)}\n"
+           end
+
+           strip_prefix(summary.summary)
+         end
+       end
+     end
+   end
+ end
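
For orientation, here is a minimal usage sketch of the new module. It is not part of the diff: `trainset` is assumed to be an Array of DSPy::Example built elsewhere, the model id is illustrative, and note that the method takes `view_data_batch_size` and `prompt_model` positionally (the @example comment above shows them as keywords, but the definition is positional).

    # Hedged sketch, not from the package source.
    lm = DSPy::LM.new('gpt-4o-mini')   # illustrative model id
    summary = DSPy::Propose::DatasetSummaryGenerator.create_dataset_summary(
      trainset,   # Array<DSPy::Example>, assumed to exist
      10,         # view_data_batch_size: examples per batch
      lm,         # prompt_model; nil falls back to DSPy.lm
      verbose: true
    )
    puts summary  # => a 2-3 sentence description of the dataset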
@@ -11,40 +11,78 @@ module DSPy
    class GroundedProposer
      extend T::Sig

-     # Configuration for instruction proposal
+     # Python-compatible TIPS dictionary for instruction generation
+     TIPS = {
+       "none" => "",
+       "creative" => "Don't be afraid to be creative when creating the new instruction!",
+       "simple" => "Keep the instruction clear and concise.",
+       "description" => "Make sure your instruction is very informative and descriptive.",
+       "high_stakes" => "The instruction should include a high stakes scenario in which the LM must solve the task!",
+       "persona" => 'Include a persona that is relevant to the task in the instruction (ie. "You are a ...")'
+     }.freeze
+
+     # Configuration for instruction proposal (Python-compatible)
      class Config
        extend T::Sig

+       # Core parameters
        sig { returns(Integer) }
        attr_accessor :num_instruction_candidates

+       # Python-compatible awareness flags (match Python defaults exactly)
+       sig { returns(T::Boolean) }
+       attr_accessor :program_aware
+
+       sig { returns(T::Boolean) }
+       attr_accessor :use_dataset_summary
+
+       sig { returns(T::Boolean) }
+       attr_accessor :use_task_demos
+
+       sig { returns(T::Boolean) }
+       attr_accessor :use_tip
+
+       sig { returns(T::Boolean) }
+       attr_accessor :use_instruct_history
+
+       # Additional parameters
        sig { returns(Integer) }
-       attr_accessor :max_examples_for_analysis
+       attr_accessor :view_data_batch_size

        sig { returns(Integer) }
-       attr_accessor :max_instruction_length
+       attr_accessor :num_demos_in_context

        sig { returns(T::Boolean) }
-       attr_accessor :use_task_description
+       attr_accessor :set_tip_randomly

        sig { returns(T::Boolean) }
-       attr_accessor :use_input_output_analysis
+       attr_accessor :set_history_randomly

-       sig { returns(T::Boolean) }
-       attr_accessor :use_few_shot_examples
+       sig { returns(Float) }
+       attr_accessor :init_temperature

-       sig { returns(String) }
-       attr_accessor :proposal_model
+       sig { returns(T::Boolean) }
+       attr_accessor :verbose

        sig { void }
        def initialize
+         # Core parameters
          @num_instruction_candidates = 5
-         @max_examples_for_analysis = 10
-         @max_instruction_length = 200
-         @use_task_description = true
-         @use_input_output_analysis = true
-         @use_few_shot_examples = true
-         @proposal_model = "gpt-4o-mini"
+
+         # Python-compatible awareness flags (match Python defaults)
+         @program_aware = true
+         @use_dataset_summary = true
+         @use_task_demos = true
+         @use_tip = true
+         @use_instruct_history = true
+
+         # Additional parameters
+         @view_data_batch_size = 10
+         @num_demos_in_context = 3
+         @set_tip_randomly = true
+         @set_history_randomly = true
+         @init_temperature = 1.0
+         @verbose = false
        end
      end

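As an aside (not part of the diff), the new Config exposes plain attr_accessors, so it can be tweaked before building the proposer. A rough sketch, assuming the defaults shown above and that GroundedProposer lives under DSPy::Propose alongside the dataset summary generator (the enclosing namespace is not visible in this hunk):

    config = DSPy::Propose::GroundedProposer::Config.new
    config.num_instruction_candidates = 8   # more candidates per round
    config.use_tip = false                  # drop the random "tip" section from the context
    config.view_data_batch_size = 25        # larger batches for the dataset summary
    config.verbose = true                   # print progress while summarizing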
@@ -88,11 +126,66 @@ module DSPy
      sig { returns(Config) }
      attr_reader :config

-     sig { params(config: T.nilable(Config)).void }
-     def initialize(config: nil)
+     sig do
+       params(
+         config: T.nilable(Config),
+         program: T.nilable(T.untyped),
+         trainset: T.nilable(T::Array[DSPy::Example])
+       ).void
+     end
+     def initialize(config: nil, program: nil, trainset: nil)
        @config = config || Config.new
+       @program = program
+       @trainset = trainset
+       @dataset_summary = nil
+       @program_code_string = nil
+
+       # Generate dataset summary if data-aware mode enabled (Python: use_dataset_summary)
+       if @config.use_dataset_summary && trainset && !trainset.empty?
+         begin
+           require_relative 'dataset_summary_generator'
+           @dataset_summary = DatasetSummaryGenerator.create_dataset_summary(
+             trainset,
+             @config.view_data_batch_size,
+             DSPy.current_lm,
+             verbose: @config.verbose
+           )
+         rescue => e
+           DSPy.logger.warn("Failed to generate dataset summary: #{e.message}")
+           @dataset_summary = nil
+         end
+       end
+
+       # Extract program source code if program-aware mode enabled
+       if @config.program_aware && program
+         @program_code_string = extract_program_source(program)
+       end
+     end
+
+     private
+
+     # Extract source code from program for program-aware mode
+     sig { params(program: T.untyped).returns(T.nilable(String)) }
+     def extract_program_source(program)
+       # Get the program's class
+       klass = program.is_a?(Class) ? program : program.class
+
+       # Try to get source location
+       source_location = klass.instance_method(:forward).source_location rescue nil
+       return nil unless source_location
+
+       file, line = source_location
+       # Read the source file and extract the class definition
+       # This is a simplified version - could be enhanced with method_source gem
+       code = "Program: #{klass.name}\nSource: #{file}:#{line}"
+       code
+     rescue => e
+       DSPy.logger.warn("Could not extract program source: #{e.message}")
+       nil
      end

+     public
+
      # Generate instruction candidates for a signature and training examples
      sig do
        params(
@@ -116,9 +209,10 @@ module DSPy

        # Generate instruction candidates
        candidates = generate_instruction_candidates(
-         signature_class,
-         analysis,
-         current_instruction
+         signature_class,
+         analysis,
+         current_instruction,
+         few_shot_examples: few_shot_examples
        )

        # Filter and rank candidates
@@ -126,8 +220,8 @@ module DSPy

        metadata = {
          generation_timestamp: Time.now.iso8601,
-         model_used: @config.proposal_model,
-         num_examples_analyzed: [examples.size, @config.max_examples_for_analysis].min,
+         model_used: DSPy.current_lm.model,
+         num_examples_analyzed: [examples.size, @config.view_data_batch_size].min,
          original_instruction: current_instruction
        }

@@ -204,7 +298,7 @@ module DSPy
      # Analyze patterns in training examples
      sig { params(examples: T::Array[T.untyped]).returns(T::Hash[Symbol, T.untyped]) }
      def analyze_example_patterns(examples)
-       analysis_examples = examples.take(@config.max_examples_for_analysis)
+       analysis_examples = examples.take(@config.view_data_batch_size)

        {
          total_examples: examples.size,
@@ -323,12 +417,18 @@ module DSPy
        params(
          signature_class: T.class_of(DSPy::Signature),
          analysis: T::Hash[Symbol, T.untyped],
-         current_instruction: T.nilable(String)
+         current_instruction: T.nilable(String),
+         few_shot_examples: T.nilable(T::Array[T.untyped])
        ).returns(T::Array[String])
      end
-     def generate_instruction_candidates(signature_class, analysis, current_instruction)
+     def generate_instruction_candidates(signature_class, analysis, current_instruction, few_shot_examples: nil)
        # Build context for instruction generation
-       context = build_generation_context(signature_class, analysis, current_instruction)
+       context = build_generation_context(
+         signature_class,
+         analysis,
+         current_instruction,
+         few_shot_examples: few_shot_examples
+       )

        # Create instruction generation signature
        instruction_signature = create_instruction_generation_signature
@@ -346,16 +446,7 @@ module DSPy
          )

          instruction = result.instruction.strip
-
-         # Truncate if too long
-         if instruction.length > @config.max_instruction_length
-           instruction = instruction[0, @config.max_instruction_length].strip
-           # Try to end at a word boundary
-           if instruction.include?(' ')
-             instruction = instruction.rpartition(' ').first + '.'
-           end
-         end
-
+
          candidates << instruction if instruction.length > 0
        rescue => error
          DSPy.logger.warn("Failed to generate instruction candidate #{i + 1}: #{error.message}")
@@ -375,32 +466,56 @@ module DSPy
        params(
          signature_class: T.class_of(DSPy::Signature),
          analysis: T::Hash[Symbol, T.untyped],
-         current_instruction: T.nilable(String)
+         current_instruction: T.nilable(String),
+         few_shot_examples: T.nilable(T::Array[T.untyped])
        ).returns(String)
      end
-     def build_generation_context(signature_class, analysis, current_instruction)
+     def build_generation_context(signature_class, analysis, current_instruction, few_shot_examples: nil)
        context_parts = []
-
-       context_parts << "Task: #{signature_class.description}" if @config.use_task_description
-
-       if @config.use_input_output_analysis
-         # Build detailed field descriptions including enum values
-         input_descriptions = analysis[:input_fields].map { |f| format_field_description(f) }
-         output_descriptions = analysis[:output_fields].map { |f| format_field_description(f) }
-
-         context_parts << "Input fields: #{input_descriptions.join(', ')}"
-         context_parts << "Output fields: #{output_descriptions.join(', ')}"
+
+       # Include dataset summary if enabled and available
+       if @config.use_dataset_summary && @dataset_summary
+         context_parts << "Dataset Summary: #{@dataset_summary}"
        end
-
+
+       # Include program code if enabled and available
+       if @config.program_aware && @program_code_string
+         context_parts << "Program Code:\n#{@program_code_string}"
+       end
+
+       # Always include task description (fundamental to understanding the task)
+       context_parts << "Task: #{signature_class.description}"
+
+       # Always include field analysis (fundamental to understanding inputs/outputs)
+       input_descriptions = analysis[:input_fields].map { |f| format_field_description(f) }
+       output_descriptions = analysis[:output_fields].map { |f| format_field_description(f) }
+
+       context_parts << "Input fields: #{input_descriptions.join(', ')}"
+       context_parts << "Output fields: #{output_descriptions.join(', ')}"
+
+       # Include task demos if enabled and available
+       if @config.use_task_demos && few_shot_examples && !few_shot_examples.empty?
+         demo_strings = few_shot_examples.take(@config.num_demos_in_context).map do |example|
+           format_example_as_demo(example)
+         end
+         context_parts << "Task Demos:\n#{demo_strings.join("\n\n")}"
+       end
+
        if analysis[:common_themes] && analysis[:common_themes].any?
          context_parts << "Task themes: #{analysis[:common_themes].join(', ')}"
        end
-
+
        if current_instruction
          context_parts << "Current instruction: \"#{current_instruction}\""
        end
-
-       context_parts.join("\n")
+
+       # Include tip if enabled
+       if @config.use_tip
+         tip = select_tip
+         context_parts << "Tip: #{tip}" if tip && !tip.empty?
+       end
+
+       context_parts.join("\n\n")
      end

      # Format field description with enum values if applicable
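
To make the new assembly order concrete, the joined context passed to instruction generation ends up shaped roughly as below. This is illustrative only: every value is invented, the field formatting comes from format_field_description (whose body is not shown in this diff), and the optional "Task themes" / "Current instruction" sections are omitted.

    Dataset Summary: <2-3 sentence summary from DatasetSummaryGenerator>

    Program Code:
    Program: MyQAModule
    Source: lib/my_qa_module.rb:12

    Task: <signature description>

    Input fields: question (String)
    Output fields: answer (String)

    Task Demos:
    Inputs: question: "..." | Expected: answer: "..."

    Tip: Keep the instruction clear and concise.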
@@ -414,6 +529,42 @@ module DSPy
        end
      end

+     # Format an example as a demo for context
+     sig { params(example: T.untyped).returns(String) }
+     def format_example_as_demo(example)
+       return example.to_s unless example.respond_to?(:inputs) && example.respond_to?(:expected)
+
+       parts = []
+
+       # Format inputs
+       if example.inputs && !example.inputs.empty?
+         input_strs = example.inputs.map { |k, v| "#{k}: #{v.inspect}" }
+         parts << "Inputs: #{input_strs.join(', ')}"
+       end
+
+       # Format expected outputs
+       if example.expected && !example.expected.empty?
+         output_strs = example.expected.map { |k, v| "#{k}: #{v.inspect}" }
+         parts << "Expected: #{output_strs.join(', ')}"
+       end
+
+       parts.join(" | ")
+     end
+
+     # Select a tip based on configuration
+     sig { returns(T.nilable(String)) }
+     def select_tip
+       if @config.set_tip_randomly
+         # Randomly select a tip (excluding "none")
+         tip_keys = TIPS.keys.reject { |k| k == "none" }
+         selected_key = tip_keys.sample
+         TIPS[selected_key]
+       else
+         # Return empty string when not using random tips
+         ""
+       end
+     end
+
      # Build requirements text for instruction generation
      sig { params(analysis: T::Hash[Symbol, T.untyped]).returns(String) }
      def build_requirements_text(analysis)
@@ -478,25 +629,21 @@ module DSPy
      # Filter out duplicates and empty candidates
      filtered = candidates.uniq.reject(&:empty?)

-     # Simple ranking based on length and content quality
+     # Simple ranking based on content quality (Python-compatible: no length scoring)
      filtered.sort_by do |instruction|
        score = 0
-
-       # Prefer moderate length instructions
-       length_score = [instruction.length, @config.max_instruction_length].min / @config.max_instruction_length.to_f
-       score += length_score * 0.3
-
+
        # Prefer instructions with action words
        action_words = %w[analyze classify generate explain solve determine identify]
        action_score = action_words.count { |word| instruction.downcase.include?(word) }
        score += action_score * 0.4
-
+
        # Prefer instructions that mention reasoning for complex tasks
        if analysis[:complexity_indicators][:requires_reasoning]
          reasoning_score = instruction.downcase.match?(/\b(step|think|reason|explain)\b/) ? 1 : 0
          score += reasoning_score * 0.3
        end
-
+
        -score # Negative for descending sort
      end
    end
@@ -588,7 +735,7 @@ module DSPy
        'proposal.num_candidates' => result.num_candidates,
        'proposal.best_instruction_length' => result.best_instruction.length,
        'proposal.analysis_themes' => result.analysis[:common_themes] || [],
-       'proposal.model_used' => @config.proposal_model
+       'proposal.model_used' => DSPy.current_lm.model
      })
    end
  end
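
Illustrative only (not in the diff): with the new constructor keywords, the proposer can be made data- and program-aware at construction time. The `my_program` and `trainset` variables are placeholders, and the namespace is assumed from the sibling require in this directory.

    proposer = DSPy::Propose::GroundedProposer.new(
      config: config,        # a Config instance (optional; defaults are created otherwise)
      program: my_program,   # placeholder: an object or class exposing #forward
      trainset: trainset     # placeholder: Array of DSPy::Example
    )
    # During initialize the proposer builds @dataset_summary (via
    # DatasetSummaryGenerator) and @program_code_string, both of which
    # feed build_generation_context when instructions are proposed.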
@@ -0,0 +1,26 @@
+ # frozen_string_literal: true
+
+ require 'sorbet-runtime'
+
+ module DSPy
+   module Teleprompt
+     # Bootstrap strategy enum for create_n_fewshot_demo_sets
+     # Provides type-safe alternatives to Python's magic number seeds
+     class BootstrapStrategy < T::Enum
+       enums do
+         # No demonstrations - zero-shot learning (Python seed = -3)
+         ZeroShot = new
+
+         # Labeled examples only - no bootstrap generation (Python seed = -2)
+         LabeledOnly = new
+
+         # Bootstrapped demonstrations without shuffling (Python seed = -1)
+         Unshuffled = new
+
+         # Bootstrapped demonstrations with shuffling and random size (Python seed >= 0)
+         # Requires separate seed parameter for reproducibility
+         Shuffled = new
+       end
+     end
+   end
+ end
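
A brief sketch (not from the package) of how the enum might replace Python's magic seeds in calling code; the seed mapping simply mirrors the comments above, and the helper name is hypothetical.

    # Hypothetical helper: translate the typed strategy back to Python-style seeds.
    def python_seed_for(strategy, seed: 0)
      case strategy
      when DSPy::Teleprompt::BootstrapStrategy::ZeroShot    then -3
      when DSPy::Teleprompt::BootstrapStrategy::LabeledOnly then -2
      when DSPy::Teleprompt::BootstrapStrategy::Unshuffled  then -1
      when DSPy::Teleprompt::BootstrapStrategy::Shuffled    then seed  # seed >= 0
      else
        raise ArgumentError, "unknown strategy: #{strategy.inspect}"
      end
    end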