dspy 0.34.1 → 0.34.3

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (47)
  1. checksums.yaml +4 -4
  2. data/README.md +139 -216
  3. data/lib/dspy/chain_of_thought.rb +3 -2
  4. data/lib/dspy/context.rb +57 -30
  5. data/lib/dspy/evals/version.rb +1 -1
  6. data/lib/dspy/evals.rb +42 -31
  7. data/lib/dspy/events.rb +2 -3
  8. data/lib/dspy/example.rb +1 -1
  9. data/lib/dspy/lm/adapter.rb +39 -0
  10. data/lib/dspy/lm/json_strategy.rb +37 -2
  11. data/lib/dspy/lm/message.rb +1 -1
  12. data/lib/dspy/lm/response.rb +1 -1
  13. data/lib/dspy/lm/usage.rb +4 -4
  14. data/lib/dspy/lm.rb +27 -79
  15. data/lib/dspy/mixins/type_coercion.rb +189 -30
  16. data/lib/dspy/module.rb +70 -25
  17. data/lib/dspy/predict.rb +32 -5
  18. data/lib/dspy/prediction.rb +15 -57
  19. data/lib/dspy/prompt.rb +50 -30
  20. data/lib/dspy/propose/dataset_summary_generator.rb +1 -1
  21. data/lib/dspy/propose/grounded_proposer.rb +3 -3
  22. data/lib/dspy/re_act.rb +0 -162
  23. data/lib/dspy/registry/signature_registry.rb +3 -3
  24. data/lib/dspy/ruby_llm/lm/adapters/ruby_llm_adapter.rb +1 -27
  25. data/lib/dspy/schema/sorbet_json_schema.rb +7 -6
  26. data/lib/dspy/schema/version.rb +1 -1
  27. data/lib/dspy/schema_adapters.rb +1 -1
  28. data/lib/dspy/storage/program_storage.rb +2 -2
  29. data/lib/dspy/structured_outputs_prompt.rb +3 -3
  30. data/lib/dspy/teleprompt/utils.rb +2 -2
  31. data/lib/dspy/tools/github_cli_toolset.rb +7 -7
  32. data/lib/dspy/tools/text_processing_toolset.rb +2 -2
  33. data/lib/dspy/tools/toolset.rb +1 -1
  34. data/lib/dspy/version.rb +1 -1
  35. data/lib/dspy.rb +1 -4
  36. metadata +1 -26
  37. data/lib/dspy/events/subscriber_mixin.rb +0 -79
  38. data/lib/dspy/events/subscribers.rb +0 -43
  39. data/lib/dspy/memory/embedding_engine.rb +0 -68
  40. data/lib/dspy/memory/in_memory_store.rb +0 -216
  41. data/lib/dspy/memory/local_embedding_engine.rb +0 -244
  42. data/lib/dspy/memory/memory_compactor.rb +0 -298
  43. data/lib/dspy/memory/memory_manager.rb +0 -266
  44. data/lib/dspy/memory/memory_record.rb +0 -163
  45. data/lib/dspy/memory/memory_store.rb +0 -90
  46. data/lib/dspy/memory.rb +0 -30
  47. data/lib/dspy/tools/memory_toolset.rb +0 -117
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: dspy
 version: !ruby/object:Gem::Version
-  version: 0.34.1
+  version: 0.34.3
 platform: ruby
 authors:
 - Vicente Reig Rincón de Arellano
@@ -135,20 +135,6 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: 1.0.0
-- !ruby/object:Gem::Dependency
-  name: informers
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: '1.2'
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: '1.2'
 description: The Ruby framework for programming with large language models. DSPy.rb
   brings structured LLM programming to Ruby developers. Instead of wrestling with
   prompt strings and parsing responses, you define typed signatures using idiomatic
@@ -170,8 +156,6 @@ files:
 - lib/dspy/evals.rb
 - lib/dspy/evals/version.rb
 - lib/dspy/events.rb
-- lib/dspy/events/subscriber_mixin.rb
-- lib/dspy/events/subscribers.rb
 - lib/dspy/events/types.rb
 - lib/dspy/example.rb
 - lib/dspy/ext/struct_descriptions.rb
@@ -189,14 +173,6 @@ files:
 - lib/dspy/lm/response.rb
 - lib/dspy/lm/usage.rb
 - lib/dspy/lm/vision_models.rb
-- lib/dspy/memory.rb
-- lib/dspy/memory/embedding_engine.rb
-- lib/dspy/memory/in_memory_store.rb
-- lib/dspy/memory/local_embedding_engine.rb
-- lib/dspy/memory/memory_compactor.rb
-- lib/dspy/memory/memory_manager.rb
-- lib/dspy/memory/memory_record.rb
-- lib/dspy/memory/memory_store.rb
 - lib/dspy/mixins/instruction_updatable.rb
 - lib/dspy/mixins/struct_builder.rb
 - lib/dspy/mixins/type_coercion.rb
@@ -238,7 +214,6 @@ files:
 - lib/dspy/tools.rb
 - lib/dspy/tools/base.rb
 - lib/dspy/tools/github_cli_toolset.rb
-- lib/dspy/tools/memory_toolset.rb
 - lib/dspy/tools/schema.rb
 - lib/dspy/tools/text_processing_toolset.rb
 - lib/dspy/tools/toolset.rb
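The dependency block removed above is the gem-metadata YAML that RubyGems generates from a gemspec declaration, so 0.34.3 no longer pulls in informers at install time. In gemspec terms, the removed entry corresponds roughly to the following line (illustrative; the gemspec itself is not part of this diff):

    # dspy.gemspec (sketch -- the actual gemspec is not shown in this diff)
    spec.add_dependency "informers", "~> 1.2"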
data/lib/dspy/events/subscriber_mixin.rb DELETED
@@ -1,79 +0,0 @@
-# frozen_string_literal: true
-
-require 'sorbet-runtime'
-
-module DSPy
-  module Events
-    # Mixin for adding class-level event subscriptions
-    # Provides a clean way to subscribe to events at the class level
-    # instead of requiring instance-based subscriptions
-    #
-    # Usage:
-    #   class MyTracker
-    #     include DSPy::Events::SubscriberMixin
-    #
-    #     add_subscription('llm.*') do |name, attrs|
-    #       # Handle LLM events globally for this class
-    #     end
-    #   end
-    module SubscriberMixin
-      extend T::Sig
-
-      def self.included(base)
-        base.extend(ClassMethods)
-        base.class_eval do
-          @event_subscriptions = []
-          @subscription_mutex = Mutex.new
-
-          # Initialize subscriptions when the class is first loaded
-          @subscriptions_initialized = false
-        end
-      end
-
-      module ClassMethods
-        extend T::Sig
-
-        # Add a class-level event subscription
-        sig { params(pattern: String, block: T.proc.params(arg0: String, arg1: T::Hash[T.any(String, Symbol), T.untyped]).void).returns(String) }
-        def add_subscription(pattern, &block)
-          subscription_mutex.synchronize do
-            subscription_id = DSPy.events.subscribe(pattern, &block)
-            event_subscriptions << subscription_id
-            subscription_id
-          end
-        end
-
-        # Remove all subscriptions for this class
-        sig { void }
-        def unsubscribe_all
-          subscription_mutex.synchronize do
-            event_subscriptions.each { |id| DSPy.events.unsubscribe(id) }
-            event_subscriptions.clear
-          end
-        end
-
-        # Get list of active subscription IDs
-        sig { returns(T::Array[String]) }
-        def subscriptions
-          subscription_mutex.synchronize do
-            event_subscriptions.dup
-          end
-        end
-
-        private
-
-        # Thread-safe access to subscriptions array
-        sig { returns(T::Array[String]) }
-        def event_subscriptions
-          @event_subscriptions ||= []
-        end
-
-        # Thread-safe access to mutex
-        sig { returns(Mutex) }
-        def subscription_mutex
-          @subscription_mutex ||= Mutex.new
-        end
-      end
-    end
-  end
-end
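Per the removed mixin's own doc comment, subscriptions were declared in the class body and took effect immediately. A minimal usage sketch (the TokenTracker name and handler body are illustrative, not from the gem; only add_subscription, subscriptions, and unsubscribe_all come from the removed API):

    class TokenTracker
      include DSPy::Events::SubscriberMixin

      # Class-level subscription: the block receives the event name and an
      # attributes hash, per the sig on add_subscription above.
      add_subscription('llm.*') do |name, attrs|
        puts "#{name}: #{attrs.inspect}"
      end
    end

    TokenTracker.subscriptions    # => array of subscription ID strings
    TokenTracker.unsubscribe_all  # remove every class-level subscription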
data/lib/dspy/events/subscribers.rb DELETED
@@ -1,43 +0,0 @@
-# frozen_string_literal: true
-
-module DSPy
-  module Events
-    # Base subscriber class for event-driven patterns
-    # This provides the foundation for creating custom event subscribers
-    #
-    # Example usage:
-    #   class MySubscriber < DSPy::Events::BaseSubscriber
-    #     def subscribe
-    #       add_subscription('llm.*') do |event_name, attributes|
-    #         # Handle LLM events
-    #       end
-    #     end
-    #   end
-    #
-    #   subscriber = MySubscriber.new
-    #   # subscriber will start receiving events
-    #   subscriber.unsubscribe # Clean up when done
-    class BaseSubscriber
-      def initialize
-        @subscriptions = []
-      end
-
-      def subscribe
-        raise NotImplementedError, "Subclasses must implement #subscribe"
-      end
-
-      def unsubscribe
-        @subscriptions.each { |id| DSPy.events.unsubscribe(id) }
-        @subscriptions.clear
-      end
-
-      protected
-
-      def add_subscription(pattern, &block)
-        subscription_id = DSPy.events.subscribe(pattern, &block)
-        @subscriptions << subscription_id
-        subscription_id
-      end
-    end
-  end
-end
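The removed base class is a template-method pattern: subclasses implement #subscribe and register handlers through the protected add_subscription helper; #unsubscribe tears everything down. A minimal sketch (LoggingSubscriber and its pattern string are illustrative):

    class LoggingSubscriber < DSPy::Events::BaseSubscriber
      def subscribe
        # Register a handler; the pattern syntax is whatever DSPy.events accepts
        add_subscription('module.*') do |event_name, attributes|
          puts "[#{event_name}] #{attributes.inspect}"
        end
      end
    end

    subscriber = LoggingSubscriber.new
    subscriber.subscribe    # registers the handler(s)
    subscriber.unsubscribe  # cleans up all registered subscriptions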
data/lib/dspy/memory/embedding_engine.rb DELETED
@@ -1,68 +0,0 @@
-# frozen_string_literal: true
-
-require 'sorbet-runtime'
-
-module DSPy
-  module Memory
-    # Abstract base class for embedding engines
-    class EmbeddingEngine
-      extend T::Sig
-      extend T::Helpers
-      abstract!
-
-      # Generate embeddings for a single text
-      sig { abstract.params(text: String).returns(T::Array[Float]) }
-      def embed(text); end
-
-      # Generate embeddings for multiple texts (batch processing)
-      sig { abstract.params(texts: T::Array[String]).returns(T::Array[T::Array[Float]]) }
-      def embed_batch(texts); end
-
-      # Get the dimension of embeddings produced by this engine
-      sig { abstract.returns(Integer) }
-      def embedding_dimension; end
-
-      # Get the model name/identifier
-      sig { abstract.returns(String) }
-      def model_name; end
-
-      # Check if the engine is ready to use
-      sig { returns(T::Boolean) }
-      def ready?
-        true
-      end
-
-      # Get engine statistics
-      sig { returns(T::Hash[Symbol, T.untyped]) }
-      def stats
-        {
-          model_name: model_name,
-          embedding_dimension: embedding_dimension,
-          ready: ready?
-        }
-      end
-
-      # Normalize a vector to unit length
-      sig { params(vector: T::Array[Float]).returns(T::Array[Float]) }
-      def normalize_vector(vector)
-        magnitude = Math.sqrt(vector.sum { |x| x * x })
-        return vector if magnitude == 0.0
-        vector.map { |x| x / magnitude }
-      end
-
-      # Calculate cosine similarity between two vectors
-      sig { params(a: T::Array[Float], b: T::Array[Float]).returns(Float) }
-      def cosine_similarity(a, b)
-        return 0.0 if a.empty? || b.empty? || a.size != b.size
-
-        dot_product = a.zip(b).sum { |x, y| x * y }
-        magnitude_a = Math.sqrt(a.sum { |x| x * x })
-        magnitude_b = Math.sqrt(b.sum { |x| x * x })
-
-        return 0.0 if magnitude_a == 0.0 || magnitude_b == 0.0
-
-        dot_product / (magnitude_a * magnitude_b)
-      end
-    end
-  end
-end
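EmbeddingEngine was a Sorbet abstract class: a concrete engine only had to supply the four abstract methods and inherited normalize_vector and cosine_similarity for free. A minimal sketch of a conforming subclass (ConstantEngine is illustrative, not part of the gem; its embeddings are deliberately degenerate):

    class ConstantEngine < DSPy::Memory::EmbeddingEngine
      extend T::Sig

      sig { override.params(text: String).returns(T::Array[Float]) }
      def embed(text)
        # Same unit vector for every input -- interface-conforming but trivial
        normalize_vector(Array.new(embedding_dimension, 1.0))
      end

      sig { override.params(texts: T::Array[String]).returns(T::Array[T::Array[Float]]) }
      def embed_batch(texts)
        texts.map { |t| embed(t) }
      end

      sig { override.returns(Integer) }
      def embedding_dimension
        8
      end

      sig { override.returns(String) }
      def model_name
        'constant'
      end
    end

Because every embedding is the same unit vector, the inherited cosine_similarity returns 1.0 for any pair, which makes such a stub handy only for exercising the interface.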
data/lib/dspy/memory/in_memory_store.rb DELETED
@@ -1,216 +0,0 @@
-# frozen_string_literal: true
-
-require 'sorbet-runtime'
-require_relative 'memory_store'
-
-module DSPy
-  module Memory
-    # In-memory implementation of MemoryStore for development and testing
-    class InMemoryStore < MemoryStore
-      extend T::Sig
-
-      sig { void }
-      def initialize
-        @memories = T.let({}, T::Hash[String, MemoryRecord])
-        @mutex = T.let(Mutex.new, Mutex)
-      end
-
-      sig { override.params(record: MemoryRecord).returns(T::Boolean) }
-      def store(record)
-        @mutex.synchronize do
-          @memories[record.id] = record
-          true
-        end
-      end
-
-      sig { override.params(id: String).returns(T.nilable(MemoryRecord)) }
-      def retrieve(id)
-        @mutex.synchronize do
-          record = @memories[id]
-          record&.record_access!
-          record
-        end
-      end
-
-      sig { override.params(record: MemoryRecord).returns(T::Boolean) }
-      def update(record)
-        @mutex.synchronize do
-          if @memories.key?(record.id)
-            @memories[record.id] = record
-            true
-          else
-            false
-          end
-        end
-      end
-
-      sig { override.params(id: String).returns(T::Boolean) }
-      def delete(id)
-        @mutex.synchronize do
-          @memories.delete(id) ? true : false
-        end
-      end
-
-      sig { override.params(user_id: T.nilable(String), limit: T.nilable(Integer), offset: T.nilable(Integer)).returns(T::Array[MemoryRecord]) }
-      def list(user_id: nil, limit: nil, offset: nil)
-        @mutex.synchronize do
-          records = @memories.values
-
-          # Filter by user_id if provided
-          records = records.select { |r| r.user_id == user_id } if user_id
-
-          # Sort by created_at (newest first)
-          records = records.sort_by(&:created_at).reverse
-
-          # Apply offset and limit
-          records = records.drop(offset) if offset
-          records = records.take(limit) if limit
-
-          records
-        end
-      end
-
-      sig { override.params(query: String, user_id: T.nilable(String), limit: T.nilable(Integer)).returns(T::Array[MemoryRecord]) }
-      def search(query, user_id: nil, limit: nil)
-        @mutex.synchronize do
-          regex = Regexp.new(Regexp.escape(query), Regexp::IGNORECASE)
-
-          records = @memories.values.select do |record|
-            # Filter by user_id if provided
-            next false if user_id && record.user_id != user_id
-
-            # Search in content and tags
-            record.content.match?(regex) || record.tags.any? { |tag| tag.match?(regex) }
-          end
-
-          # Sort by relevance (exact matches first, then by recency)
-          records = records.sort_by do |record|
-            exact_match = record.content.downcase.include?(query.downcase) ? 0 : 1
-            [exact_match, -record.created_at.to_f]
-          end
-
-          records = records.take(limit) if limit
-          records
-        end
-      end
-
-      sig { override.params(tags: T::Array[String], user_id: T.nilable(String), limit: T.nilable(Integer)).returns(T::Array[MemoryRecord]) }
-      def search_by_tags(tags, user_id: nil, limit: nil)
-        @mutex.synchronize do
-          records = @memories.values.select do |record|
-            # Filter by user_id if provided
-            next false if user_id && record.user_id != user_id
-
-            # Check if record has any of the specified tags
-            tags.any? { |tag| record.has_tag?(tag) }
-          end
-
-          # Sort by number of matching tags, then by recency
-          records = records.sort_by do |record|
-            matching_tags = tags.count { |tag| record.has_tag?(tag) }
-            [-matching_tags, -record.created_at.to_f]
-          end
-
-          records = records.take(limit) if limit
-          records
-        end
-      end
-
-      sig { override.params(embedding: T::Array[Float], user_id: T.nilable(String), limit: T.nilable(Integer), threshold: T.nilable(Float)).returns(T::Array[MemoryRecord]) }
-      def vector_search(embedding, user_id: nil, limit: nil, threshold: nil)
-        @mutex.synchronize do
-          records_with_similarity = []
-
-          @memories.values.each do |record|
-            # Filter by user_id if provided
-            next if user_id && record.user_id != user_id
-
-            # Skip records without embeddings
-            next unless record.embedding
-
-            # Calculate cosine similarity
-            similarity = cosine_similarity(embedding, record.embedding)
-
-            # Apply threshold if provided
-            next if threshold && similarity < threshold
-
-            records_with_similarity << [record, similarity]
-          end
-
-          # Sort by similarity (highest first)
-          records_with_similarity.sort_by! { |_, similarity| -similarity }
-
-          # Apply limit
-          records_with_similarity = records_with_similarity.take(limit) if limit
-
-          # Return just the records
-          records_with_similarity.map(&:first)
-        end
-      end
-
-      sig { override.params(user_id: T.nilable(String)).returns(Integer) }
-      def count(user_id: nil)
-        @mutex.synchronize do
-          if user_id
-            @memories.values.count { |record| record.user_id == user_id }
-          else
-            @memories.size
-          end
-        end
-      end
-
-      sig { override.params(user_id: T.nilable(String)).returns(Integer) }
-      def clear(user_id: nil)
-        @mutex.synchronize do
-          if user_id
-            count = @memories.values.count { |record| record.user_id == user_id }
-            @memories.reject! { |_, record| record.user_id == user_id }
-            count
-          else
-            count = @memories.size
-            @memories.clear
-            count
-          end
-        end
-      end
-
-      sig { override.returns(T::Boolean) }
-      def supports_vector_search?
-        true
-      end
-
-      sig { override.returns(T::Hash[Symbol, T.untyped]) }
-      def stats
-        @mutex.synchronize do
-          total = @memories.size
-          with_embeddings = @memories.values.count(&:embedding)
-          users = @memories.values.map(&:user_id).compact.uniq.size
-
-          {
-            total_memories: total,
-            memories_with_embeddings: with_embeddings,
-            unique_users: users,
-            supports_vector_search: supports_vector_search?,
-            avg_access_count: total > 0 ? @memories.values.sum(&:access_count) / total.to_f : 0
-          }
-        end
-      end
-
-      private
-
-      # Calculate cosine similarity between two vectors
-      sig { params(a: T::Array[Float], b: T::Array[Float]).returns(Float) }
-      def cosine_similarity(a, b)
-        return 0.0 if a.empty? || b.empty? || a.size != b.size
-
-        dot_product = a.zip(b).sum { |x, y| x * y }
-        magnitude_a = Math.sqrt(a.sum { |x| x * x })
-        magnitude_b = Math.sqrt(b.sum { |x| x * x })
-
-        return 0.0 if magnitude_a == 0.0 || magnitude_b == 0.0
-
-        dot_product / (magnitude_a * magnitude_b)
-      end
-    end
-  end
-end
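A usage sketch of the removed store follows. The MemoryRecord keyword arguments are assumed for illustration; memory_record.rb's actual constructor is not shown in this diff. The store methods and their options (user_id/limit/offset, threshold) come straight from the signatures above:

    store = DSPy::Memory::InMemoryStore.new

    # Hypothetical kwargs -- MemoryRecord's real constructor lives in
    # memory_record.rb, which this diff does not display.
    record = DSPy::Memory::MemoryRecord.new(
      content: 'Notes on Ruby pattern matching',
      tags: ['ruby', 'tutorial'],
      user_id: 'user-1'
    )

    store.store(record)                                   # => true
    store.search('pattern', user_id: 'user-1', limit: 5)  # case-insensitive regex over content and tags
    store.search_by_tags(['ruby'])                        # ranked by matching tag count, then recency
    store.vector_search(Array.new(128, 0.0), limit: 5, threshold: 0.8)  # cosine similarity over embedded records
    store.count(user_id: 'user-1')                        # => 1
    store.stats                                           # totals, embedding coverage, unique users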
data/lib/dspy/memory/local_embedding_engine.rb DELETED
@@ -1,244 +0,0 @@
-# frozen_string_literal: true
-
-require 'sorbet-runtime'
-
-require 'informers'
-
-require_relative 'embedding_engine'
-
-module DSPy
-  module Memory
-    # Local embedding engine using ankane/informers for privacy-preserving embeddings
-    class LocalEmbeddingEngine < EmbeddingEngine
-      extend T::Sig
-
-      # Default models supported by informers
-      DEFAULT_MODEL = 'Xenova/all-MiniLM-L6-v2'
-      SUPPORTED_MODELS = [
-        'Xenova/all-MiniLM-L6-v2',
-        'Xenova/all-MiniLM-L12-v2',
-        'Xenova/multi-qa-MiniLM-L6-cos-v1',
-        'Xenova/paraphrase-MiniLM-L6-v2'
-      ].freeze
-
-      sig { returns(String) }
-      attr_reader :model_name
-
-      sig { params(model_name: String).void }
-      def initialize(model_name = DEFAULT_MODEL)
-        @model_name = model_name
-        @model = T.let(nil, T.nilable(T.untyped))
-        @embedding_dim = T.let(nil, T.nilable(Integer))
-        @ready = T.let(false, T::Boolean)
-
-        load_model!
-      end
-
-      sig { override.params(text: String).returns(T::Array[Float]) }
-      def embed(text)
-        DSPy::Context.with_span(
-          operation: 'embedding.generate',
-          **DSPy::ObservationType::Embedding.langfuse_attributes,
-          'embedding.model' => @model_name,
-          'embedding.input' => text[0..200], # Truncate for logging
-          'embedding.input_length' => text.length
-        ) do |span|
-          ensure_ready!
-
-          # Preprocess text
-          cleaned_text = preprocess_text(text)
-
-          # Generate embedding
-          result = @model.call(cleaned_text)
-
-          # Extract embedding array and normalize
-          embedding = result.first.to_a
-          normalized = normalize_vector(embedding)
-
-          # Add embedding metadata to span
-          if span
-            span.set_attribute('embedding.dimension', normalized.length)
-            span.set_attribute('embedding.magnitude', Math.sqrt(normalized.sum { |x| x * x }))
-          end
-
-          normalized
-        end
-      end
-
-      sig { override.params(texts: T::Array[String]).returns(T::Array[T::Array[Float]]) }
-      def embed_batch(texts)
-        ensure_ready!
-
-        # Generate embeddings one by one (informers doesn't support true batch processing)
-        texts.map do |text|
-          embed(text)
-        end
-      end
-
-      sig { override.returns(Integer) }
-      def embedding_dimension
-        @embedding_dim || load_model_info!
-      end
-
-      sig { override.returns(String) }
-      def model_name
-        @model_name
-      end
-
-      sig { override.returns(T::Boolean) }
-      def ready?
-        @ready
-      end
-
-      sig { override.returns(T::Hash[Symbol, T.untyped]) }
-      def stats
-        {
-          model_name: @model_name,
-          embedding_dimension: embedding_dimension,
-          ready: ready?,
-          supported_models: SUPPORTED_MODELS,
-          backend: 'informers'
-        }
-      end
-
-      # Check if a model is supported
-      sig { params(model_name: String).returns(T::Boolean) }
-      def self.model_supported?(model_name)
-        SUPPORTED_MODELS.include?(model_name)
-      end
-
-      # List all supported models
-      sig { returns(T::Array[String]) }
-      def self.supported_models
-        SUPPORTED_MODELS
-      end
-
-      private
-
-      # Load the embedding model
-      sig { void }
-      def load_model!
-        begin
-          @model = Informers.pipeline('feature-extraction', @model_name)
-          @ready = true
-          load_model_info!
-        rescue => e
-          @ready = false
-          raise "Failed to load embedding model '#{@model_name}': #{e.message}"
-        end
-      end
-
-      # Load model information (dimension, etc.)
-      sig { returns(Integer) }
-      def load_model_info!
-        return @embedding_dim if @embedding_dim
-
-        # Test with a simple string to get dimension
-        test_result = @model.call("test")
-        @embedding_dim = test_result.first.size
-      end
-
-      # Ensure the model is ready
-      sig { void }
-      def ensure_ready!
-        unless @ready
-          raise "Embedding engine not ready. Model '#{@model_name}' failed to load."
-        end
-      end
-
-      # Preprocess text for better embeddings
-      sig { params(text: String).returns(String) }
-      def preprocess_text(text)
-        # Basic text preprocessing
-        cleaned = text.strip
-
-        # Remove excessive whitespace
-        cleaned = cleaned.gsub(/\s+/, ' ')
-
-        # Truncate if too long (most models have token limits)
-        if cleaned.length > 8192 # Conservative limit
-          cleaned = cleaned[0..8191]
-        end
-
-        cleaned
-      end
-    end
-
-    # Fallback embedding engine when informers is not available
-    class NoOpEmbeddingEngine < EmbeddingEngine
-      extend T::Sig
-
-      sig { override.params(text: String).returns(T::Array[Float]) }
-      def embed(text)
-        # Return a simple hash-based embedding for basic functionality
-        simple_hash_embedding(text)
-      end
-
-      sig { override.params(texts: T::Array[String]).returns(T::Array[T::Array[Float]]) }
-      def embed_batch(texts)
-        texts.map { |text| embed(text) }
-      end
-
-      sig { override.returns(Integer) }
-      def embedding_dimension
-        128 # Fixed dimension for hash-based embeddings
-      end
-
-      sig { override.returns(String) }
-      def model_name
-        'simple-hash'
-      end
-
-      sig { override.returns(T::Boolean) }
-      def ready?
-        true
-      end
-
-      private
-
-      # Generate a simple hash-based embedding that captures semantic similarity
-      sig { params(text: String).returns(T::Array[Float]) }
-      def simple_hash_embedding(text)
-        # Create a deterministic but semantically aware embedding
-        words = text.downcase.split(/\W+/).reject(&:empty?)
-
-        # Initialize embedding vector
-        embedding = Array.new(128, 0.0)
-
-        # Create base embedding from all words
-        words.each_with_index do |word, word_idx|
-          word_hash = word.sum(&:ord)
-
-          # Distribute word influence across dimensions
-          (0..7).each do |i|
-            dim = (word_hash + i * 13) % 128
-            weight = Math.sin(word_hash + i) * 0.2
-            embedding[dim] += weight / Math.sqrt(words.length + 1)
-          end
-        end
-
-        # Add semantic clusters for common words
-        semantic_clusters = {
-          ['programming', 'code', 'software', 'development'] => (0..15),
-          ['ruby', 'python', 'java', 'javascript'] => (16..31),
-          ['work', 'project', 'task', 'job'] => (32..47),
-          ['tutorial', 'guide', 'learning', 'education'] => (48..63),
-          ['memory', 'storage', 'data', 'information'] => (64..79),
-          ['personal', 'private', 'individual', 'own'] => (80..95),
-          ['important', 'critical', 'key', 'essential'] => (96..111),
-          ['test', 'testing', 'spec', 'example'] => (112..127)
-        }
-
-        semantic_clusters.each do |cluster_words, range|
-          cluster_weight = words.count { |word| cluster_words.include?(word) }
-          if cluster_weight > 0
-            range.each { |dim| embedding[dim] += cluster_weight * 0.3 }
-          end
-        end
-
-        # Normalize to unit vector
-        normalize_vector(embedding)
-      end
-    end
-  end
-end
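The removed local engine wrapped the informers gem's feature-extraction pipeline, with NoOpEmbeddingEngine as the dependency-free fallback; both are gone along with the informers dependency dropped from the metadata above. A minimal sketch of the old API (the output dimension is model-dependent; 384 for MiniLM-L6-class models is an assumption, not stated in this diff):

    # Model weights are loaded in #initialize (load_model!), so construction
    # raises if the informers model cannot be loaded.
    engine = DSPy::Memory::LocalEmbeddingEngine.new  # DEFAULT_MODEL = 'Xenova/all-MiniLM-L6-v2'
    vec = engine.embed('Hello, DSPy')                # unit-normalized Float array
    engine.embedding_dimension                       # probed once via @model.call("test"); e.g. 384

    DSPy::Memory::LocalEmbeddingEngine.model_supported?('Xenova/all-MiniLM-L12-v2')  # => true

    # Fallback: deterministic 128-dim hash-based embeddings, no model download
    fallback = DSPy::Memory::NoOpEmbeddingEngine.new
    fallback.embed_batch(['a', 'b']).length          # => 2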