chromadb-experimental 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97) hide show
  1. checksums.yaml +7 -0
  2. data/lib/chromadb/admin_client.rb +6 -0
  3. data/lib/chromadb/client.rb +317 -0
  4. data/lib/chromadb/collection.rb +573 -0
  5. data/lib/chromadb/embedding_functions/chroma_bm25.rb +459 -0
  6. data/lib/chromadb/embedding_functions/chroma_cloud_qwen.rb +139 -0
  7. data/lib/chromadb/embedding_functions/chroma_cloud_splade.rb +121 -0
  8. data/lib/chromadb/embedding_functions.rb +121 -0
  9. data/lib/chromadb/errors.rb +120 -0
  10. data/lib/chromadb/http_client.rb +142 -0
  11. data/lib/chromadb/openapi/lib/chromadb/api/default_api.rb +2349 -0
  12. data/lib/chromadb/openapi/lib/chromadb/api_client.rb +392 -0
  13. data/lib/chromadb/openapi/lib/chromadb/api_error.rb +58 -0
  14. data/lib/chromadb/openapi/lib/chromadb/configuration.rb +295 -0
  15. data/lib/chromadb/openapi/lib/chromadb/models/add_collection_records_payload.rb +260 -0
  16. data/lib/chromadb/openapi/lib/chromadb/models/attach_function_request.rb +250 -0
  17. data/lib/chromadb/openapi/lib/chromadb/models/attach_function_response.rb +235 -0
  18. data/lib/chromadb/openapi/lib/chromadb/models/attached_function_api_response.rb +361 -0
  19. data/lib/chromadb/openapi/lib/chromadb/models/attached_function_info.rb +240 -0
  20. data/lib/chromadb/openapi/lib/chromadb/models/bool_inverted_index_type.rb +229 -0
  21. data/lib/chromadb/openapi/lib/chromadb/models/bool_value_type.rb +221 -0
  22. data/lib/chromadb/openapi/lib/chromadb/models/checklist_response.rb +245 -0
  23. data/lib/chromadb/openapi/lib/chromadb/models/collection.rb +315 -0
  24. data/lib/chromadb/openapi/lib/chromadb/models/collection_configuration.rb +240 -0
  25. data/lib/chromadb/openapi/lib/chromadb/models/create_collection_payload.rb +260 -0
  26. data/lib/chromadb/openapi/lib/chromadb/models/create_database_payload.rb +220 -0
  27. data/lib/chromadb/openapi/lib/chromadb/models/create_tenant_payload.rb +220 -0
  28. data/lib/chromadb/openapi/lib/chromadb/models/database.rb +240 -0
  29. data/lib/chromadb/openapi/lib/chromadb/models/detach_function_request.rb +221 -0
  30. data/lib/chromadb/openapi/lib/chromadb/models/detach_function_response.rb +220 -0
  31. data/lib/chromadb/openapi/lib/chromadb/models/embedding_function_new_configuration.rb +230 -0
  32. data/lib/chromadb/openapi/lib/chromadb/models/error_response.rb +230 -0
  33. data/lib/chromadb/openapi/lib/chromadb/models/float_inverted_index_type.rb +229 -0
  34. data/lib/chromadb/openapi/lib/chromadb/models/float_list_value_type.rb +221 -0
  35. data/lib/chromadb/openapi/lib/chromadb/models/float_value_type.rb +221 -0
  36. data/lib/chromadb/openapi/lib/chromadb/models/fork_collection_payload.rb +220 -0
  37. data/lib/chromadb/openapi/lib/chromadb/models/fts_index_type.rb +229 -0
  38. data/lib/chromadb/openapi/lib/chromadb/models/get_attached_function_response.rb +224 -0
  39. data/lib/chromadb/openapi/lib/chromadb/models/get_response.rb +270 -0
  40. data/lib/chromadb/openapi/lib/chromadb/models/get_tenant_response.rb +230 -0
  41. data/lib/chromadb/openapi/lib/chromadb/models/get_user_identity_response.rb +246 -0
  42. data/lib/chromadb/openapi/lib/chromadb/models/heartbeat_response.rb +235 -0
  43. data/lib/chromadb/openapi/lib/chromadb/models/hnsw_configuration.rb +330 -0
  44. data/lib/chromadb/openapi/lib/chromadb/models/hnsw_index_config.rb +371 -0
  45. data/lib/chromadb/openapi/lib/chromadb/models/include.rb +210 -0
  46. data/lib/chromadb/openapi/lib/chromadb/models/int_inverted_index_type.rb +229 -0
  47. data/lib/chromadb/openapi/lib/chromadb/models/int_value_type.rb +221 -0
  48. data/lib/chromadb/openapi/lib/chromadb/models/query_response.rb +280 -0
  49. data/lib/chromadb/openapi/lib/chromadb/models/raw_where_fields.rb +230 -0
  50. data/lib/chromadb/openapi/lib/chromadb/models/schema.rb +258 -0
  51. data/lib/chromadb/openapi/lib/chromadb/models/search_payload.rb +256 -0
  52. data/lib/chromadb/openapi/lib/chromadb/models/search_payload_filter.rb +230 -0
  53. data/lib/chromadb/openapi/lib/chromadb/models/search_payload_group_by.rb +230 -0
  54. data/lib/chromadb/openapi/lib/chromadb/models/search_payload_limit.rb +230 -0
  55. data/lib/chromadb/openapi/lib/chromadb/models/search_payload_select.rb +220 -0
  56. data/lib/chromadb/openapi/lib/chromadb/models/search_request_payload.rb +220 -0
  57. data/lib/chromadb/openapi/lib/chromadb/models/search_response.rb +270 -0
  58. data/lib/chromadb/openapi/lib/chromadb/models/space.rb +210 -0
  59. data/lib/chromadb/openapi/lib/chromadb/models/spann_configuration.rb +420 -0
  60. data/lib/chromadb/openapi/lib/chromadb/models/spann_index_config.rb +536 -0
  61. data/lib/chromadb/openapi/lib/chromadb/models/sparse_vector.rb +244 -0
  62. data/lib/chromadb/openapi/lib/chromadb/models/sparse_vector_index_config.rb +242 -0
  63. data/lib/chromadb/openapi/lib/chromadb/models/sparse_vector_index_type.rb +234 -0
  64. data/lib/chromadb/openapi/lib/chromadb/models/sparse_vector_value_type.rb +221 -0
  65. data/lib/chromadb/openapi/lib/chromadb/models/string_inverted_index_type.rb +229 -0
  66. data/lib/chromadb/openapi/lib/chromadb/models/string_value_type.rb +231 -0
  67. data/lib/chromadb/openapi/lib/chromadb/models/update_collection_configuration.rb +240 -0
  68. data/lib/chromadb/openapi/lib/chromadb/models/update_collection_payload.rb +240 -0
  69. data/lib/chromadb/openapi/lib/chromadb/models/update_collection_records_payload.rb +260 -0
  70. data/lib/chromadb/openapi/lib/chromadb/models/update_hnsw_configuration.rb +345 -0
  71. data/lib/chromadb/openapi/lib/chromadb/models/update_spann_configuration.rb +260 -0
  72. data/lib/chromadb/openapi/lib/chromadb/models/update_tenant_payload.rb +220 -0
  73. data/lib/chromadb/openapi/lib/chromadb/models/upsert_collection_records_payload.rb +260 -0
  74. data/lib/chromadb/openapi/lib/chromadb/models/value_types.rb +271 -0
  75. data/lib/chromadb/openapi/lib/chromadb/models/vector_index_config.rb +261 -0
  76. data/lib/chromadb/openapi/lib/chromadb/models/vector_index_type.rb +234 -0
  77. data/lib/chromadb/openapi/lib/chromadb/version.rb +15 -0
  78. data/lib/chromadb/openapi/lib/chromadb.rb +102 -0
  79. data/lib/chromadb/openapi.rb +6 -0
  80. data/lib/chromadb/schema.rb +744 -0
  81. data/lib/chromadb/schemas/chroma-cloud-qwen.json +61 -0
  82. data/lib/chromadb/schemas/chroma-cloud-splade.json +31 -0
  83. data/lib/chromadb/schemas/chroma_bm25.json +37 -0
  84. data/lib/chromadb/search/key.rb +94 -0
  85. data/lib/chromadb/search/limit.rb +41 -0
  86. data/lib/chromadb/search/rank.rb +425 -0
  87. data/lib/chromadb/search/search.rb +73 -0
  88. data/lib/chromadb/search/select.rb +54 -0
  89. data/lib/chromadb/search/where.rb +157 -0
  90. data/lib/chromadb/search.rb +8 -0
  91. data/lib/chromadb/types/results.rb +96 -0
  92. data/lib/chromadb/types/sparse_vector.rb +86 -0
  93. data/lib/chromadb/types/validation.rb +519 -0
  94. data/lib/chromadb/types.rb +13 -0
  95. data/lib/chromadb/version.rb +5 -0
  96. data/lib/chromadb.rb +15 -0
  97. metadata +233 -0
@@ -0,0 +1,459 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "set"
4
+ module Chroma
5
+ module EmbeddingFunctions
6
# 32-bit MurmurHash3 hasher whose public result is the absolute value of the
# digest after it has been reinterpreted as a signed 32-bit integer.
class Murmur3AbsHasher
  MASK32 = 0xffffffff
  BLOCK_C1 = 0xcc9e2d51
  BLOCK_C2 = 0x1b873593
  private_constant :MASK32, :BLOCK_C1, :BLOCK_C2

  # @param seed [Integer] hash seed; only its low 32 bits are used
  def initialize(seed = 0)
    @seed = seed
  end

  # Hashes +token+ (converted with +to_s+) and returns a non-negative Integer.
  #
  # NOTE: this deliberately takes an argument, so it is not the zero-arity
  # Object#hash protocol method.
  #
  # @param token [#to_s]
  # @return [Integer] value in 0..0x80000000
  def hash(token)
    digest = murmur3(token)
    # Reinterpret the unsigned 32-bit digest as two's-complement signed.
    digest -= 0x1_0000_0000 if digest >= 0x80000000
    digest.abs
  end

  private

  # Computes the unsigned 32-bit digest over the binary bytes of +key+.
  def murmur3(key)
    data = key.to_s.b.bytes
    size = data.length
    tail_size = size & 3
    state = @seed & MASK32

    # Body: consume complete 4-byte little-endian words.
    data.first(size - tail_size).each_slice(4) do |b0, b1, b2, b3|
      word = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24)
      state ^= scramble(word)
      state = rotl32(state, 13)
      state = (imul(state, 5) + 0xe6546b64) & MASK32
    end

    # Tail: the remaining 1-3 bytes, packed little-endian into one word.
    unless tail_size.zero?
      leftover = 0
      data.last(tail_size).each_with_index do |byte, position|
        leftover |= byte << (8 * position)
      end
      state ^= scramble(leftover)
    end

    # Finalization mix.
    state ^= size
    state ^= state >> 16
    state = imul(state, 0x85ebca6b)
    state ^= state >> 13
    state = imul(state, 0xc2b2ae35)
    state ^= state >> 16
    state & MASK32
  end

  # Per-word mixing step shared by the body and the tail.
  def scramble(word)
    mixed = imul(word, BLOCK_C1)
    mixed = rotl32(mixed, 15)
    imul(mixed, BLOCK_C2)
  end

  # 32-bit wrapping multiplication.
  def imul(a, b)
    ((a & MASK32) * (b & MASK32)) & MASK32
  end

  # Rotates the 32-bit value +x+ left by +r+ bits.
  def rotl32(x, r)
    ((x << r) | (x >> (32 - r))) & MASK32
  end
end
86
+
87
# Splits text into lowercase, stopword-filtered, stemmed tokens.
class Bm25Tokenizer
  # @param stemmer [Object] used if it responds to +stem+ or +stem_word+
  # @param stopwords [Enumerable] words to drop; compared in lowercase form
  # @param token_max_length [Integer] tokens longer than this are discarded
  def initialize(stemmer, stopwords, token_max_length)
    @stemmer = stemmer
    @stopwords = Set.new(stopwords) { |word| word.to_s.downcase }
    @token_max_length = token_max_length
  end

  # @param text [#to_s]
  # @return [Array<String>] stemmed tokens in their original order
  def tokenize(text)
    simple_tokenize(remove_non_alphanumeric(text))
      .reject { |candidate| discard?(candidate) }
      .map { |candidate| stem(candidate).strip }
      .reject(&:empty?)
  end

  private

  # True for tokens that are dropped before stemming: empty strings,
  # stopwords, and tokens longer than the configured maximum.
  def discard?(candidate)
    candidate.empty? ||
      @stopwords.include?(candidate) ||
      candidate.length > @token_max_length
  end

  # Replaces each run of characters other than letters, digits, underscore
  # and whitespace with a single space.
  def remove_non_alphanumeric(text)
    text.to_s.gsub(/[^\p{L}\p{N}_\s]+/u, " ")
  end

  # Lowercases and splits on whitespace; a leading separator yields an empty
  # first element, which #tokenize filters out.
  def simple_tokenize(text)
    text.downcase.split(/\s+/)
  end

  # Stems via the injected stemmer when it offers a known method, then via
  # String#porter2_stem when that is available, otherwise returns the token.
  def stem(token)
    return @stemmer.stem(token) if @stemmer.respond_to?(:stem)
    return @stemmer.stem_word(token) if @stemmer.respond_to?(:stem_word)

    token.respond_to?(:porter2_stem) ? token.porter2_stem : token
  end
end
131
+
132
# Adapter exposing a +stem+ method on top of String#porter2_stem.
# String#porter2_stem is not defined in this file and must be loaded elsewhere.
class Porter2StemmerAdapter
  # @param token [#to_s]
  # @return the result of calling +porter2_stem+ on the token's string form
  def stem(token)
    word = token.to_s
    word.porter2_stem
  end
end
137
+
138
# Hash-key wrapper for a token: identity is defined solely by the integer
# hash, while the label is carried along and never affects equality.
class HashedToken
  # +hash+ serves both as the public accessor and as the value Ruby's Hash
  # uses for bucketing. It is defined once here; a second explicit
  # +def hash+ would only trigger a "method redefined" warning.
  attr_reader :hash, :label

  # @param hash [Integer] precomputed token hash
  # @param label [String, nil] optional human-readable token text
  def initialize(hash, label)
    @hash = hash
    @label = label
  end

  # Two tokens are equal when both are HashedToken instances with the same
  # hash value; labels are not compared.
  #
  # @param other [Object]
  # @return [Boolean]
  def eql?(other)
    other.is_a?(HashedToken) && other.hash == @hash
  end

  # Same semantics as #eql?, so == and Hash lookup agree.
  def ==(other)
    eql?(other)
  end
end
158
+
159
# Sparse embedding function that tokenizes each document, hashes the tokens
# and scores every distinct token hash with a BM25-style term-frequency
# formula (see #encode).
class ChromaBm25EmbeddingFunction
  NAME = "chroma_bm25"

  DEFAULT_K = 1.2
  DEFAULT_B = 0.75
  DEFAULT_AVG_DOC_LENGTH = 256.0
  DEFAULT_TOKEN_MAX_LENGTH = 40

  # Stopword list used when the caller does not supply one.
  DEFAULT_CHROMA_BM25_STOPWORDS = %w[
    a about above after again against ain all am an
    and any are aren aren't as at be because been
    before being below between both but by can couldn couldn't
    d did didn didn't do does doesn doesn't doing don
    don't down during each few for from further had hadn
    hadn't has hasn hasn't have haven haven't having he her
    here hers herself him himself his how i if in
    into is isn isn't it it's its itself just ll
    m ma me mightn mightn't more most mustn mustn't my
    myself needn needn't no nor not now o of off
    on once only or other our ours ourselves out over
    own re s same shan shan't she she's should should've
    shouldn shouldn't so some such t than that that'll the
    their theirs them themselves then there these they this those
    through to too under until up ve very was wasn
    wasn't we were weren weren't what when where which while
    who whom why will with won won't wouldn wouldn't y
    you you'd you'll you're you've your yours yourself yourselves
  ].each(&:freeze).freeze

  # Configuration keys that #validate_config_update accepts.
  MUTABLE_CONFIG_KEYS = %w[k b avg_doc_length token_max_length stopwords include_tokens].freeze

  attr_reader :k, :b, :avg_doc_length, :token_max_length, :stopwords, :include_tokens

  # @param k [Numeric] BM25 k parameter (coerced with +to_f+)
  # @param b [Numeric] BM25 b parameter (coerced with +to_f+)
  # @param avg_doc_length [Numeric] document length used for normalization
  # @param token_max_length [Integer] longer tokens are discarded
  # @param stopwords [Array, nil] custom stopwords; nil selects the defaults
  # @param include_tokens [Boolean] whether vectors carry token labels
  def initialize(k: DEFAULT_K, b: DEFAULT_B, avg_doc_length: DEFAULT_AVG_DOC_LENGTH, token_max_length: DEFAULT_TOKEN_MAX_LENGTH, stopwords: nil, include_tokens: false)
    @k = k.to_f
    @b = b.to_f
    @avg_doc_length = avg_doc_length.to_f
    @token_max_length = token_max_length.to_i
    @include_tokens = !!include_tokens

    # @stopwords stays nil when defaults are in use so that #get_config only
    # serializes an explicitly supplied list.
    if stopwords
      @stopwords = stopwords.map(&:to_s)
      @stopword_list = @stopwords
    else
      @stopwords = nil
      @stopword_list = DEFAULT_CHROMA_BM25_STOPWORDS
    end

    @hasher = Murmur3AbsHasher.new
    @stemmer = Porter2StemmerAdapter.new
    # The tokenizer's inputs never change after construction, so build it
    # (and its stopword set) once instead of once per encoded document.
    @tokenizer = Bm25Tokenizer.new(@stemmer, @stopword_list, @token_max_length)
  end

  # Embeds each document (converted with +to_s+) into a sparse vector.
  #
  # @param documents [Array, nil]
  # @return [Array] one Types::SparseVector per document; [] for nil/empty input
  def call(documents)
    return [] if documents.nil? || documents.empty?

    documents.map { |doc| encode(doc.to_s) }
  end

  # Queries are embedded exactly like documents.
  def embed_query(documents)
    call(documents)
  end

  def name
    NAME
  end

  # @return [Hash] string-keyed configuration; "stopwords" is present only
  #   when a custom list was supplied
  def get_config
    config = {
      "k" => @k,
      "b" => @b,
      "avg_doc_length" => @avg_doc_length,
      "token_max_length" => @token_max_length,
      "include_tokens" => @include_tokens
    }
    config["stopwords"] = @stopwords.dup if @stopwords
    config
  end

  def validate_config(config)
    EmbeddingFunctions.validate_config_schema(config, NAME)
  end

  # Rejects updates that name an unknown setting. Keys are compared by their
  # string form so symbol-keyed hashes are treated like string-keyed ones.
  #
  # @raise [ArgumentError] if +new_config+ contains an unsupported key
  def validate_config_update(old_config, new_config)
    new_config.each_key do |key|
      next if MUTABLE_CONFIG_KEYS.include?(key.to_s)

      raise ArgumentError, "Updating '#{key}' is not supported for #{NAME}"
    end
  end

  # Builds an instance from a configuration hash. String keys take
  # precedence; symbol keys are accepted as a fallback. Missing values fall
  # back to the class defaults.
  #
  # @param config [Hash]
  # @param client [Object, nil] accepted for interface parity; unused here
  def self.build_from_config(config, client: nil)
    lookup = ->(key) { config.key?(key) ? config[key] : config[key.to_sym] }

    new(
      k: lookup.call("k") || DEFAULT_K,
      b: lookup.call("b") || DEFAULT_B,
      avg_doc_length: lookup.call("avg_doc_length") || DEFAULT_AVG_DOC_LENGTH,
      token_max_length: lookup.call("token_max_length") || DEFAULT_TOKEN_MAX_LENGTH,
      stopwords: lookup.call("stopwords"),
      include_tokens: lookup.call("include_tokens") || false
    )
  end

  def self.validate_config(config)
    EmbeddingFunctions.validate_config_schema(config, NAME)
  end

  private

  # Encodes one document. For every distinct token hash with term frequency
  # tf in a document of doc_len tokens the value is
  #   tf * (k + 1) / (tf + k * (1 - b + b * doc_len / avg_doc_length))
  # Indices are emitted in ascending hash order.
  def encode(text)
    tokens = @tokenizer.tokenize(text)
    return Types::SparseVector.new(indices: [], values: [], labels: nil) if tokens.empty?

    doc_len = tokens.length.to_f
    counts = Hash.new(0)

    # HashedToken compares by hash only, so tokens whose hashes collide share
    # one entry and the first token seen supplies the label.
    tokens.each do |token|
      token_key = HashedToken.new(@hasher.hash(token), @include_tokens ? token : nil)
      counts[token_key] += 1
    end

    length_norm = @k * (1 - @b + (@b * doc_len) / @avg_doc_length)
    indices = []
    values = []
    labels = @include_tokens ? [] : nil

    counts.keys.sort_by(&:hash).each do |key|
      tf = counts[key].to_f
      indices << key.hash
      values << tf * (@k + 1) / (tf + length_norm)
      labels << key.label if labels && key.label
    end

    Types::SparseVector.new(indices: indices, values: values, labels: labels)
  end
end
455
+
456
+ register_sparse_embedding_function(ChromaBm25EmbeddingFunction::NAME, ChromaBm25EmbeddingFunction)
457
+ register_sparse_embedding_function("chroma-bm25", ChromaBm25EmbeddingFunction)
458
+ end
459
+ end
@@ -0,0 +1,139 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chroma
4
+ module EmbeddingFunctions
5
# Dense embedding function that posts texts to the Chroma Cloud embedding
# endpoint and returns the "embeddings" field of the JSON response.
class ChromaCloudQwenEmbeddingFunction
  NAME = "chroma-cloud-qwen"
  DEFAULT_MODEL = "Qwen/Qwen3-Embedding-0.6B"
  # Per-task instruction strings, keyed by task name and then by target
  # ("documents" or "query").
  DEFAULT_INSTRUCTIONS = {
    "nl_to_code" => {
      "documents" => "",
      "query" => "Given a question about coding, retrieval code or passage that can solve user's question"
    }
  }.freeze

  attr_reader :model, :task, :instructions, :api_key_env_var

  # @param model [String] sent in the x-chroma-embedding-model header
  # @param task [String, nil] key into +instructions+; nil sends empty instructions
  # @param instructions [Hash, nil] task => { target => instruction }
  # @param api_key_env_var [String] environment variable holding the API key
  # @raise [ArgumentError] if no API key is found in the environment variable
  #   or in Chroma::SharedState.cloud_api_key
  def initialize(model: DEFAULT_MODEL, task: nil, instructions: DEFAULT_INSTRUCTIONS, api_key_env_var: "CHROMA_API_KEY")
    @api_key_env_var = api_key_env_var
    @api_key = ENV[@api_key_env_var] || Chroma::SharedState.cloud_api_key
    raise ArgumentError, "API key not found in #{@api_key_env_var} or any existing clients" if @api_key.nil? || @api_key.empty?

    @model = model
    @task = task
    @instructions = instructions

    @connection = Faraday.new(url: "https://embed.trychroma.com") do |builder|
      builder.headers["x-chroma-token"] = @api_key
      builder.headers["x-chroma-embedding-model"] = @model
      builder.headers["Content-Type"] = "application/json"
    end
  end

  # Embeds documents using the "documents" instruction of the current task.
  #
  # @param texts [Array<String>, nil]
  # @return [Array] embeddings from the response; [] for nil/empty input
  # @raise [RuntimeError] on a non-success status, invalid JSON, or a
  #   response without "embeddings"
  def call(texts)
    embed(texts, "documents")
  end

  # Embeds queries using the "query" instruction of the current task.
  # Same parameters, return value and errors as #call.
  def embed_query(texts)
    embed(texts, "query")
  end

  def name
    NAME
  end

  def default_space
    "cosine"
  end

  def supported_spaces
    %w[cosine l2 ip]
  end

  # @return [Hash] string-keyed configuration (the API key itself is not included)
  def get_config
    {
      "api_key_env_var" => @api_key_env_var,
      "model" => @model,
      "task" => @task,
      "instructions" => @instructions
    }
  end

  def validate_config(config)
    EmbeddingFunctions.validate_config_schema(config, "chroma-cloud-qwen")
  end

  # Builds an instance from a string- or symbol-keyed configuration hash.
  #
  # @param client [Object, nil] accepted for interface parity; unused here
  # @raise [ArgumentError] if the config has no model
  def self.build_from_config(config, client: nil)
    model = config["model"] || config[:model]
    task = config["task"] || config[:task]
    instructions = config["instructions"] || config[:instructions]
    api_key_env_var = config["api_key_env_var"] || config[:api_key_env_var] || "CHROMA_API_KEY"

    raise ArgumentError, "Config is missing required field 'model'" if model.nil?

    ChromaCloudQwenEmbeddingFunction.new(
      model: model,
      task: task,
      instructions: instructions || DEFAULT_INSTRUCTIONS,
      api_key_env_var: api_key_env_var
    )
  end

  def self.validate_config(config)
    EmbeddingFunctions.validate_config_schema(config, "chroma-cloud-qwen")
  end

  # @raise [ArgumentError] if model, task or instructions differ between
  #   +old_config+ and +new_config+ (string keys only)
  def validate_config_update(old_config, new_config)
    %w[model task instructions].each do |key|
      next unless new_config.key?(key)
      raise ArgumentError, "The #{key} cannot be changed after initialization." if new_config[key] != old_config[key]
    end
  end

  private

  # Shared request path for #call and #embed_query: posts the texts together
  # with the instruction selected for +target+ and parses the response.
  def embed(texts, target)
    return [] if texts.nil? || texts.empty?

    payload = {
      "instructions" => instruction_for(target),
      "texts" => texts
    }

    response = @connection.post do |req|
      req.body = JSON.generate(payload)
    end

    parse_response(response)
  end

  # Looks up the instruction for the current task and +target+. Returns ""
  # when no task is set, when instructions are nil (for example an explicit
  # +instructions: nil+), or when the task/target has no entry.
  def instruction_for(target)
    return "" unless @task && @instructions

    task_instructions = @instructions[@task]
    return "" unless task_instructions

    task_instructions[target] || ""
  end

  # Extracts "embeddings" from the response body, converting HTTP failures,
  # malformed JSON and error payloads into RuntimeError.
  def parse_response(response)
    unless response.success?
      raise RuntimeError, "Failed to get embeddings from Chroma Cloud API: HTTP #{response.status} - #{response.body}"
    end
    data = JSON.parse(response.body)
    unless data["embeddings"]
      raise RuntimeError, data["error"] || "Unknown error"
    end
    data["embeddings"]
  rescue JSON::ParserError
    raise RuntimeError, "Invalid JSON response from Chroma Cloud API"
  end
end
136
+
137
+ register_embedding_function(ChromaCloudQwenEmbeddingFunction::NAME, ChromaCloudQwenEmbeddingFunction)
138
+ end
139
+ end
@@ -0,0 +1,121 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Chroma
4
+ module EmbeddingFunctions
5
# Sparse embedding function that posts texts to the Chroma Cloud
# "/embed_sparse" endpoint and normalizes each returned embedding through
# Types::Validation.normalize_sparse_vector.
class ChromaCloudSpladeEmbeddingFunction
  NAME = "chroma-cloud-splade"
  DEFAULT_MODEL = "prithivida/Splade_PP_en_v1"

  attr_reader :api_key_env_var, :model, :include_tokens

  # @param api_key_env_var [String] environment variable holding the API key
  # @param model [String] sent in the x-chroma-embedding-model header
  # @param include_tokens [Boolean] request and keep token labels
  # @raise [ArgumentError] if no API key is found in the environment variable
  #   or in Chroma::SharedState.cloud_api_key
  def initialize(api_key_env_var: "CHROMA_API_KEY", model: DEFAULT_MODEL, include_tokens: false)
    @api_key_env_var = api_key_env_var
    @api_key = ENV[@api_key_env_var] || Chroma::SharedState.cloud_api_key
    if @api_key.nil? || @api_key.empty?
      raise ArgumentError, "API key not found in #{@api_key_env_var} or any existing clients"
    end

    @model = model
    @include_tokens = include_tokens ? true : false

    @connection = Faraday.new(url: "https://embed.trychroma.com") do |builder|
      request_headers = builder.headers
      request_headers["x-chroma-token"] = @api_key
      request_headers["x-chroma-embedding-model"] = @model
      request_headers["Content-Type"] = "application/json"
    end
  end

  # @param texts [Array<String>, nil]
  # @return [Array] one normalized sparse vector per returned embedding;
  #   [] for nil/empty input
  # @raise [RuntimeError] on a non-success status or invalid JSON
  # @raise [ArgumentError] if an embedding has an unexpected shape
  def call(texts)
    return [] if texts.nil? || texts.empty?

    # fetch_tokens is sent as the string "true"/"false", not a JSON boolean.
    fetch_flag = @include_tokens ? "true" : "false"
    request_body = JSON.generate(
      "texts" => texts,
      "task" => "",
      "target" => "",
      "fetch_tokens" => fetch_flag
    )

    parse_response(@connection.post("/embed_sparse") { |req| req.body = request_body })
  end

  # Queries are embedded exactly like documents.
  def embed_query(texts)
    call(texts)
  end

  def name
    NAME
  end

  # @return [Hash] string-keyed configuration (the API key itself is not included)
  def get_config
    {
      "api_key_env_var" => @api_key_env_var,
      "model" => @model,
      "include_tokens" => @include_tokens
    }
  end

  def validate_config(config)
    EmbeddingFunctions.validate_config_schema(config, "chroma-cloud-splade")
  end

  # @raise [ArgumentError] if include_tokens or model differ between
  #   +old_config+ and +new_config+ (string keys only)
  def validate_config_update(old_config, new_config)
    %w[include_tokens model].each do |key|
      if new_config.key?(key) && new_config[key] != old_config[key]
        raise ArgumentError, "Updating '#{key}' is not supported for chroma-cloud-splade"
      end
    end
  end

  # Builds an instance from a string- or symbol-keyed configuration hash.
  #
  # @param client [Object, nil] accepted for interface parity; unused here
  # @raise [ArgumentError] if model or api_key_env_var is missing
  def self.build_from_config(config, client: nil)
    env_var_name = config["api_key_env_var"] || config[:api_key_env_var]
    model_name = config["model"] || config[:model]
    # A string-keyed entry wins even when its value is false or nil.
    token_flag = config.key?("include_tokens") ? config["include_tokens"] : config[:include_tokens]

    raise ArgumentError, "model must be provided in config" if model_name.nil?
    if env_var_name.nil? || env_var_name.to_s.empty?
      raise ArgumentError, "api_key_env_var must be provided in config"
    end

    ChromaCloudSpladeEmbeddingFunction.new(
      api_key_env_var: env_var_name,
      model: model_name,
      include_tokens: token_flag || false
    )
  end

  def self.validate_config(config)
    EmbeddingFunctions.validate_config_schema(config, "chroma-cloud-splade")
  end

  private

  # Converts the HTTP response into normalized sparse vectors. A body
  # without an "embeddings" key yields an empty array.
  def parse_response(response)
    unless response.success?
      raise RuntimeError, "Failed to get embeddings from Chroma Cloud API: HTTP #{response.status} - #{response.body}"
    end

    decoded = JSON.parse(response.body)
    (decoded["embeddings"] || []).map { |embedding| normalize_embedding(embedding) }
  rescue JSON::ParserError
    raise RuntimeError, "Invalid JSON response from Chroma Cloud API"
  end

  # Normalizes one embedding given either as a Hash (JSON object) or as a
  # Types::SparseVector; labels are dropped unless include_tokens is set.
  def normalize_embedding(embedding)
    case embedding
    when Hash
      token_labels = @include_tokens ? (embedding["labels"] || embedding["tokens"]) : nil
      Types::Validation.normalize_sparse_vector(
        indices: embedding["indices"] || [],
        values: embedding["values"] || [],
        labels: token_labels
      )
    when Types::SparseVector
      Types::Validation.normalize_sparse_vector(
        indices: embedding.indices,
        values: embedding.values,
        labels: @include_tokens ? embedding.labels : nil
      )
    else
      raise ArgumentError, "Unexpected sparse embedding format: #{embedding.inspect}"
    end
  end
end
118
+
119
+ register_sparse_embedding_function(ChromaCloudSpladeEmbeddingFunction::NAME, ChromaCloudSpladeEmbeddingFunction)
120
+ end
121
+ end