dspy 0.28.0 → 0.28.2

This diff reflects the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
data/lib/dspy/prompt.rb CHANGED
@@ -22,21 +22,39 @@ module DSPy
     sig { returns(T.nilable(String)) }
     attr_reader :signature_class_name
 
+    # Returns the effective schema format
+    # Precedence: instance variable (if not :json default) > config.lm > :json
+    sig { returns(Symbol) }
+    def schema_format
+      # If @schema_format was explicitly set to something other than :json, respect it
+      return @schema_format if @schema_format && @schema_format != :json
+
+      # Otherwise, read from config if available
+      DSPy.config.lm&.schema_format || @schema_format || :json
+    end
+
+    sig { returns(T.nilable(T.class_of(Signature))) }
+    attr_reader :signature_class
+
     sig do
       params(
         instruction: String,
         input_schema: T::Hash[Symbol, T.untyped],
         output_schema: T::Hash[Symbol, T.untyped],
         few_shot_examples: T::Array[FewShotExample],
-        signature_class_name: T.nilable(String)
+        signature_class_name: T.nilable(String),
+        schema_format: Symbol,
+        signature_class: T.nilable(T.class_of(Signature))
       ).void
     end
-    def initialize(instruction:, input_schema:, output_schema:, few_shot_examples: [], signature_class_name: nil)
+    def initialize(instruction:, input_schema:, output_schema:, few_shot_examples: [], signature_class_name: nil, schema_format: :json, signature_class: nil)
       @instruction = instruction
       @few_shot_examples = few_shot_examples.freeze
       @input_schema = input_schema.freeze
       @output_schema = output_schema.freeze
       @signature_class_name = signature_class_name
+      @schema_format = schema_format
+      @signature_class = signature_class
     end
 
     # Immutable update methods for optimization
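
A quick sketch of how the new schema_format precedence resolves; the values are illustrative, and DSPy.config.lm is whatever LM (if any) has been configured globally. Because :json is also the default, passing schema_format: :json explicitly cannot override an LM-level :baml setting.

    explicit = DSPy::Prompt.new(
      instruction: "Answer the question.",
      input_schema: {},
      output_schema: {},
      schema_format: :baml
    )
    explicit.schema_format  # => :baml (an explicit non-:json value always wins)

    implicit = DSPy::Prompt.new(
      instruction: "Answer the question.",
      input_schema: {},
      output_schema: {}
    )
    implicit.schema_format  # => DSPy.config.lm&.schema_format || :json
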
@@ -47,7 +65,9 @@ module DSPy
         input_schema: @input_schema,
         output_schema: @output_schema,
         few_shot_examples: @few_shot_examples,
-        signature_class_name: @signature_class_name
+        signature_class_name: @signature_class_name,
+        schema_format: @schema_format,
+        signature_class: @signature_class
       )
     end
 
@@ -58,7 +78,9 @@ module DSPy
         input_schema: @input_schema,
         output_schema: @output_schema,
         few_shot_examples: new_examples,
-        signature_class_name: @signature_class_name
+        signature_class_name: @signature_class_name,
+        schema_format: @schema_format,
+        signature_class: @signature_class
       )
     end
 
@@ -72,16 +94,29 @@ module DSPy
     sig { returns(String) }
     def render_system_prompt
       sections = []
-
-      sections << "Your input schema fields are:"
-      sections << "```json"
-      sections << JSON.pretty_generate(@input_schema)
-      sections << "```"
-
-      sections << "Your output schema fields are:"
-      sections << "```json"
-      sections << JSON.pretty_generate(@output_schema)
-      sections << "```"
+
+      case schema_format
+      when :baml
+        sections << "Your input schema fields are:"
+        sections << "```baml"
+        sections << render_baml_schema(@input_schema, :input)
+        sections << "```"
+
+        sections << "Your output schema fields are:"
+        sections << "```baml"
+        sections << render_baml_schema(@output_schema, :output)
+        sections << "```"
+      else # :json (default)
+        sections << "Your input schema fields are:"
+        sections << "```json"
+        sections << JSON.pretty_generate(@input_schema)
+        sections << "```"
+
+        sections << "Your output schema fields are:"
+        sections << "```json"
+        sections << JSON.pretty_generate(@output_schema)
+        sections << "```"
+      end
 
       sections << ""
       sections << "All interactions will be structured in the following way, with the appropriate values filled in."
@@ -148,32 +183,36 @@ module DSPy
         few_shot_examples: @few_shot_examples.map(&:to_h),
         input_schema: @input_schema,
         output_schema: @output_schema,
-        signature_class_name: @signature_class_name
+        signature_class_name: @signature_class_name,
+        schema_format: @schema_format
       }
     end
 
     sig { params(hash: T::Hash[Symbol, T.untyped]).returns(Prompt) }
     def self.from_h(hash)
       examples = (hash[:few_shot_examples] || []).map { |ex| FewShotExample.from_h(ex) }
-
+
       new(
         instruction: hash[:instruction] || "",
         input_schema: hash[:input_schema] || {},
         output_schema: hash[:output_schema] || {},
         few_shot_examples: examples,
-        signature_class_name: hash[:signature_class_name]
+        signature_class_name: hash[:signature_class_name],
+        schema_format: hash[:schema_format] || :json
       )
     end
 
     # Create prompt from signature class
-    sig { params(signature_class: T.class_of(Signature)).returns(Prompt) }
-    def self.from_signature(signature_class)
+    sig { params(signature_class: T.class_of(Signature), schema_format: Symbol).returns(Prompt) }
+    def self.from_signature(signature_class, schema_format: :json)
       new(
         instruction: signature_class.description || "Complete this task.",
         input_schema: signature_class.input_json_schema,
         output_schema: signature_class.output_json_schema,
         few_shot_examples: [],
-        signature_class_name: signature_class.name
+        signature_class_name: signature_class.name,
+        schema_format: schema_format,
+        signature_class: signature_class
       )
     end
 
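
A hedged round-trip sketch based on the serialization changes above; QA stands in for any DSPy::Signature subclass and is not part of the diff:

    prompt = DSPy::Prompt.from_signature(QA, schema_format: :baml)

    restored = DSPy::Prompt.from_h(prompt.to_h)
    restored.schema_format    # => :baml (persisted via the new :schema_format key)
    restored.signature_class  # => nil (the class reference itself is not serialized)
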
@@ -221,6 +260,37 @@ module DSPy
 
     private
 
+    # Render BAML schema for input or output
+    sig { params(schema: T::Hash[Symbol, T.untyped], type: Symbol).returns(String) }
+    def render_baml_schema(schema, type)
+      # If we have a signature_class, use sorbet-baml's to_baml method with custom name
+      if @signature_class
+        begin
+          require 'sorbet_baml'
+
+          struct_class = type == :input ? @signature_class.input_struct_class : @signature_class.output_struct_class
+          if struct_class
+            # Generate a proper class name from signature class name
+            base_name = @signature_class_name || @signature_class.name || "Schema"
+            class_name = type == :input ? "#{base_name}Input" : "#{base_name}Output"
+
+            # Get raw BAML and replace the ugly class name
+            raw_baml = struct_class.to_baml
+            # Replace the class definition line with a proper name
+            return raw_baml.sub(/^class #<Class:0x[0-9a-f]+>/, "class #{class_name}")
+          end
+        rescue LoadError
+          # Fall back to manual BAML generation if sorbet_baml is not available
+        end
+      end
+
+      # Fallback: generate BAML manually from schema
+      # This is a simple implementation that handles basic types
+      # For production use, sorbet-baml should be available
+      "# BAML schema generation requires sorbet-baml gem\n" \
+      "# Please install: gem install sorbet-baml"
+    end
+
     # Recursively serialize complex objects for JSON representation
     sig { params(obj: T.untyped).returns(T.untyped) }
     def serialize_for_json(obj)
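
A rough illustration of the class-name cleanup in render_baml_schema; the BAML body shown here is made up, the point is the substitution applied to sorbet-baml output for anonymous struct classes:

    raw_baml = "class #<Class:0x00007f8a1c> {\n  question string\n}"
    raw_baml.sub(/^class #<Class:0x[0-9a-f]+>/, "class QAInput")
    # => "class QAInput {\n  question string\n}"
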
@@ -0,0 +1,177 @@
+# frozen_string_literal: true
+
+require 'sorbet-runtime'
+require_relative '../signature'
+require_relative '../predict'
+
+module DSPy
+  module Propose
+    # Dataset Summary Generator for creating concise dataset descriptions
+    # Used by GroundedProposer for data-aware instruction generation
+    module DatasetSummaryGenerator
+      extend T::Sig
+
+      # Signature for summarizing observations into a brief summary
+      class ObservationSummarizer < DSPy::Signature
+        description "Given a series of observations I have made about my dataset, please summarize them into a brief 2-3 sentence summary which highlights only the most important details."
+
+        input do
+          const :observations, String, description: "Observations I have made about my dataset"
+        end
+
+        output do
+          const :summary, String, description: "Two to Three sentence summary of only the most significant highlights of my observations"
+        end
+      end
+
+      # Signature for generating initial dataset observations
+      class DatasetDescriptor < DSPy::Signature
+        description "Given several examples from a dataset please write observations about trends that hold for most or all of the samples. " \
+                    "Some areas you may consider in your observations: topics, content, syntax, conciceness, etc. " \
+                    "It will be useful to make an educated guess as to the nature of the task this dataset will enable. Don't be afraid to be creative"
+
+        input do
+          const :examples, String, description: "Sample data points from the dataset"
+        end
+
+        output do
+          const :observations, String, description: "Somethings that holds true for most or all of the data you observed"
+        end
+      end
+
+      # Signature for refining observations with prior context
+      class DatasetDescriptorWithPriorObservations < DSPy::Signature
+        description "Given several examples from a dataset please write observations about trends that hold for most or all of the samples. " \
+                    "I will also provide you with a few observations I have already made. Please add your own observations or if you feel the observations are comprehensive say 'COMPLETE' " \
+                    "Some areas you may consider in your observations: topics, content, syntax, conciceness, etc. " \
+                    "It will be useful to make an educated guess as to the nature of the task this dataset will enable. Don't be afraid to be creative"
+
+        input do
+          const :examples, String, description: "Sample data points from the dataset"
+          const :prior_observations, String, description: "Some prior observations I made about the data"
+        end
+
+        output do
+          const :observations, String, description: "Somethings that holds true for most or all of the data you observed or COMPLETE if you have nothing to add"
+        end
+      end
+
+      # Helper function to ensure consistent ordering of input keys in string representations
+      # This helps with caching and consistent LLM prompts
+      sig { params(unordered_repr: String).returns(String) }
+      def self.order_input_keys_in_string(unordered_repr)
+        # Regex pattern to match the input keys structure
+        pattern = /input_keys=\{([^}]+)\}/
+
+        # Function to reorder keys
+        unordered_repr.gsub(pattern) do |match|
+          keys_str = Regexp.last_match(1)
+          # Split the keys, strip extra spaces, and sort them
+          keys = keys_str.split(',').map(&:strip).sort
+          # Format the sorted keys back into the expected structure
+          "input_keys={#{keys.join(', ')}}"
+        end
+      end
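
For reference, the reordering applied to a hypothetical inspect string (illustrative input only):

    repr = 'Example(input_keys={question, context}, ...)'
    DSPy::Propose::DatasetSummaryGenerator.order_input_keys_in_string(repr)
    # => "Example(input_keys={context, question}, ...)"
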
+
+      # Strip common prefixes from LLM outputs (e.g., "Answer:", "Output:")
+      sig { params(text: String).returns(String) }
+      def self.strip_prefix(text)
+        # Pattern matches up to 4 words followed by a colon
+        pattern = /^[\*\s]*(([\w'\-]+\s+){0,4}[\w'\-]+):\s*/
+        modified_text = text.gsub(pattern, '')
+        modified_text.strip.gsub(/^["']|["']$/, '')
+      end
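
And the prefix stripping, again on a made-up model output:

    DSPy::Propose::DatasetSummaryGenerator.strip_prefix('Summary: "A dataset of trivia questions."')
    # => "A dataset of trivia questions."
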
+
+      # Generate a concise 2-3 sentence summary of a training dataset
+      # Used for data-aware instruction proposal in MIPROv2
+      #
+      # @param trainset [Array<DSPy::Example>] Training examples to summarize
+      # @param view_data_batch_size [Integer] Number of examples to process per batch
+      # @param prompt_model [DSPy::LM, nil] Language model to use (defaults to DSPy.lm)
+      # @param verbose [Boolean] Whether to print progress information
+      # @return [String] 2-3 sentence summary of the dataset characteristics
+      #
+      # @example Basic usage
+      #   summary = DatasetSummaryGenerator.create_dataset_summary(
+      #     trainset,
+      #     view_data_batch_size: 10,
+      #     prompt_model: DSPy::LM.new('gpt-4o-mini')
+      #   )
+      #
+      sig do
+        params(
+          trainset: T::Array[DSPy::Example],
+          view_data_batch_size: Integer,
+          prompt_model: T.nilable(DSPy::LM),
+          verbose: T::Boolean
+        ).returns(String)
+      end
+      def self.create_dataset_summary(trainset, view_data_batch_size, prompt_model, verbose: false)
+        if verbose
+          puts "\nBootstrapping dataset summary (this will be used to generate instructions)..."
+        end
+
+        # Use provided model or fall back to global LM
+        lm = prompt_model || DSPy.lm
+        raise ArgumentError, "No language model configured. Set prompt_model or DSPy.lm" unless lm
+
+        # Use provided LM in a block context
+        DSPy.with_lm(lm) do
+          # Initial observation from first batch
+          upper_lim = [trainset.length, view_data_batch_size].min
+          examples_repr = order_input_keys_in_string(trainset[0...upper_lim].inspect)
+
+          predictor = DSPy::Predict.new(DatasetDescriptor)
+          observation = predictor.call(examples: examples_repr)
+          observations = observation.observations
+
+          # Iteratively refine observations with additional batches
+          skips = 0
+          max_calls = 10
+          calls = 0
+
+          begin
+            (view_data_batch_size...trainset.length).step(view_data_batch_size) do |b|
+              calls += 1
+              break if calls >= max_calls
+
+              puts "Processing batch starting at index #{b}" if verbose
+
+              upper_lim = [trainset.length, b + view_data_batch_size].min
+              examples_repr = order_input_keys_in_string(trainset[b...upper_lim].inspect)
+
+              predictor = DSPy::Predict.new(DatasetDescriptorWithPriorObservations)
+              output = predictor.call(
+                prior_observations: observations,
+                examples: examples_repr
+              )
+
+              # Check if LLM indicates observations are complete
+              if output.observations.length >= 8 && output.observations[0...8].upcase == "COMPLETE"
+                skips += 1
+                break if skips >= 5
+                next
+              end
+
+              observations += output.observations
+            end
+          rescue => e
+            if verbose
+              puts "Error during observation refinement: #{e.message}. Using observations from past round for summary."
+            end
+          end
+
+          # Generate final summary from accumulated observations
+          predictor = DSPy::Predict.new(ObservationSummarizer)
+          summary = predictor.call(observations: observations)
+
+          if verbose
+            puts "\nGenerated summary: #{strip_prefix(summary.summary)}\n"
+          end
+
+          strip_prefix(summary.summary)
+        end
+      end
+    end
+  end
+end
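
Putting it together, a hedged end-to-end sketch: the model identifier follows the @example above, trainset is assumed to be an Array of DSPy::Example built elsewhere, and note that batch size and model are positional parameters while verbose is a keyword.

    lm = DSPy::LM.new('gpt-4o-mini')  # illustrative model id, per the @example in the file
    summary = DSPy::Propose::DatasetSummaryGenerator.create_dataset_summary(
      trainset,      # Array of DSPy::Example, built elsewhere
      10,            # view_data_batch_size (positional)
      lm,            # prompt_model (positional)
      verbose: true
    )
    puts summary  # a 2-3 sentence description used for data-aware instruction proposal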