chromadb-experimental 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/chromadb/admin_client.rb +6 -0
- data/lib/chromadb/client.rb +317 -0
- data/lib/chromadb/collection.rb +573 -0
- data/lib/chromadb/embedding_functions/chroma_bm25.rb +459 -0
- data/lib/chromadb/embedding_functions/chroma_cloud_qwen.rb +139 -0
- data/lib/chromadb/embedding_functions/chroma_cloud_splade.rb +121 -0
- data/lib/chromadb/embedding_functions.rb +121 -0
- data/lib/chromadb/errors.rb +120 -0
- data/lib/chromadb/http_client.rb +142 -0
- data/lib/chromadb/openapi/lib/chromadb/api/default_api.rb +2349 -0
- data/lib/chromadb/openapi/lib/chromadb/api_client.rb +392 -0
- data/lib/chromadb/openapi/lib/chromadb/api_error.rb +58 -0
- data/lib/chromadb/openapi/lib/chromadb/configuration.rb +295 -0
- data/lib/chromadb/openapi/lib/chromadb/models/add_collection_records_payload.rb +260 -0
- data/lib/chromadb/openapi/lib/chromadb/models/attach_function_request.rb +250 -0
- data/lib/chromadb/openapi/lib/chromadb/models/attach_function_response.rb +235 -0
- data/lib/chromadb/openapi/lib/chromadb/models/attached_function_api_response.rb +361 -0
- data/lib/chromadb/openapi/lib/chromadb/models/attached_function_info.rb +240 -0
- data/lib/chromadb/openapi/lib/chromadb/models/bool_inverted_index_type.rb +229 -0
- data/lib/chromadb/openapi/lib/chromadb/models/bool_value_type.rb +221 -0
- data/lib/chromadb/openapi/lib/chromadb/models/checklist_response.rb +245 -0
- data/lib/chromadb/openapi/lib/chromadb/models/collection.rb +315 -0
- data/lib/chromadb/openapi/lib/chromadb/models/collection_configuration.rb +240 -0
- data/lib/chromadb/openapi/lib/chromadb/models/create_collection_payload.rb +260 -0
- data/lib/chromadb/openapi/lib/chromadb/models/create_database_payload.rb +220 -0
- data/lib/chromadb/openapi/lib/chromadb/models/create_tenant_payload.rb +220 -0
- data/lib/chromadb/openapi/lib/chromadb/models/database.rb +240 -0
- data/lib/chromadb/openapi/lib/chromadb/models/detach_function_request.rb +221 -0
- data/lib/chromadb/openapi/lib/chromadb/models/detach_function_response.rb +220 -0
- data/lib/chromadb/openapi/lib/chromadb/models/embedding_function_new_configuration.rb +230 -0
- data/lib/chromadb/openapi/lib/chromadb/models/error_response.rb +230 -0
- data/lib/chromadb/openapi/lib/chromadb/models/float_inverted_index_type.rb +229 -0
- data/lib/chromadb/openapi/lib/chromadb/models/float_list_value_type.rb +221 -0
- data/lib/chromadb/openapi/lib/chromadb/models/float_value_type.rb +221 -0
- data/lib/chromadb/openapi/lib/chromadb/models/fork_collection_payload.rb +220 -0
- data/lib/chromadb/openapi/lib/chromadb/models/fts_index_type.rb +229 -0
- data/lib/chromadb/openapi/lib/chromadb/models/get_attached_function_response.rb +224 -0
- data/lib/chromadb/openapi/lib/chromadb/models/get_response.rb +270 -0
- data/lib/chromadb/openapi/lib/chromadb/models/get_tenant_response.rb +230 -0
- data/lib/chromadb/openapi/lib/chromadb/models/get_user_identity_response.rb +246 -0
- data/lib/chromadb/openapi/lib/chromadb/models/heartbeat_response.rb +235 -0
- data/lib/chromadb/openapi/lib/chromadb/models/hnsw_configuration.rb +330 -0
- data/lib/chromadb/openapi/lib/chromadb/models/hnsw_index_config.rb +371 -0
- data/lib/chromadb/openapi/lib/chromadb/models/include.rb +210 -0
- data/lib/chromadb/openapi/lib/chromadb/models/int_inverted_index_type.rb +229 -0
- data/lib/chromadb/openapi/lib/chromadb/models/int_value_type.rb +221 -0
- data/lib/chromadb/openapi/lib/chromadb/models/query_response.rb +280 -0
- data/lib/chromadb/openapi/lib/chromadb/models/raw_where_fields.rb +230 -0
- data/lib/chromadb/openapi/lib/chromadb/models/schema.rb +258 -0
- data/lib/chromadb/openapi/lib/chromadb/models/search_payload.rb +256 -0
- data/lib/chromadb/openapi/lib/chromadb/models/search_payload_filter.rb +230 -0
- data/lib/chromadb/openapi/lib/chromadb/models/search_payload_group_by.rb +230 -0
- data/lib/chromadb/openapi/lib/chromadb/models/search_payload_limit.rb +230 -0
- data/lib/chromadb/openapi/lib/chromadb/models/search_payload_select.rb +220 -0
- data/lib/chromadb/openapi/lib/chromadb/models/search_request_payload.rb +220 -0
- data/lib/chromadb/openapi/lib/chromadb/models/search_response.rb +270 -0
- data/lib/chromadb/openapi/lib/chromadb/models/space.rb +210 -0
- data/lib/chromadb/openapi/lib/chromadb/models/spann_configuration.rb +420 -0
- data/lib/chromadb/openapi/lib/chromadb/models/spann_index_config.rb +536 -0
- data/lib/chromadb/openapi/lib/chromadb/models/sparse_vector.rb +244 -0
- data/lib/chromadb/openapi/lib/chromadb/models/sparse_vector_index_config.rb +242 -0
- data/lib/chromadb/openapi/lib/chromadb/models/sparse_vector_index_type.rb +234 -0
- data/lib/chromadb/openapi/lib/chromadb/models/sparse_vector_value_type.rb +221 -0
- data/lib/chromadb/openapi/lib/chromadb/models/string_inverted_index_type.rb +229 -0
- data/lib/chromadb/openapi/lib/chromadb/models/string_value_type.rb +231 -0
- data/lib/chromadb/openapi/lib/chromadb/models/update_collection_configuration.rb +240 -0
- data/lib/chromadb/openapi/lib/chromadb/models/update_collection_payload.rb +240 -0
- data/lib/chromadb/openapi/lib/chromadb/models/update_collection_records_payload.rb +260 -0
- data/lib/chromadb/openapi/lib/chromadb/models/update_hnsw_configuration.rb +345 -0
- data/lib/chromadb/openapi/lib/chromadb/models/update_spann_configuration.rb +260 -0
- data/lib/chromadb/openapi/lib/chromadb/models/update_tenant_payload.rb +220 -0
- data/lib/chromadb/openapi/lib/chromadb/models/upsert_collection_records_payload.rb +260 -0
- data/lib/chromadb/openapi/lib/chromadb/models/value_types.rb +271 -0
- data/lib/chromadb/openapi/lib/chromadb/models/vector_index_config.rb +261 -0
- data/lib/chromadb/openapi/lib/chromadb/models/vector_index_type.rb +234 -0
- data/lib/chromadb/openapi/lib/chromadb/version.rb +15 -0
- data/lib/chromadb/openapi/lib/chromadb.rb +102 -0
- data/lib/chromadb/openapi.rb +6 -0
- data/lib/chromadb/schema.rb +744 -0
- data/lib/chromadb/schemas/chroma-cloud-qwen.json +61 -0
- data/lib/chromadb/schemas/chroma-cloud-splade.json +31 -0
- data/lib/chromadb/schemas/chroma_bm25.json +37 -0
- data/lib/chromadb/search/key.rb +94 -0
- data/lib/chromadb/search/limit.rb +41 -0
- data/lib/chromadb/search/rank.rb +425 -0
- data/lib/chromadb/search/search.rb +73 -0
- data/lib/chromadb/search/select.rb +54 -0
- data/lib/chromadb/search/where.rb +157 -0
- data/lib/chromadb/search.rb +8 -0
- data/lib/chromadb/types/results.rb +96 -0
- data/lib/chromadb/types/sparse_vector.rb +86 -0
- data/lib/chromadb/types/validation.rb +519 -0
- data/lib/chromadb/types.rb +13 -0
- data/lib/chromadb/version.rb +5 -0
- data/lib/chromadb.rb +15 -0
- metadata +233 -0
|
@@ -0,0 +1,459 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "set"
|
|
4
|
+
module Chroma
|
|
5
|
+
module EmbeddingFunctions
|
|
6
|
+
# 32-bit MurmurHash3 over the bytes of a token, reported as a non-negative
# integer: the unsigned digest is reinterpreted as a signed 32-bit value and
# its absolute value is returned.
class Murmur3AbsHasher
  # @param seed [Integer] hash seed; only its low 32 bits are used
  def initialize(seed = 0)
    @seed = seed
  end

  # Hashes +token+ (converted with +to_s+).
  #
  # @param token [#to_s] value to hash
  # @return [Integer] absolute value of the signed 32-bit digest
  def hash(token)
    digest = murmur3(token)
    # Values with the top bit set represent negative signed 32-bit numbers.
    digest -= 0x1_0000_0000 if digest >= 0x80000000
    digest.abs
  end

  private

  # Computes the unsigned 32-bit MurmurHash3 digest of +key+'s raw bytes.
  def murmur3(key)
    data = key.to_s.b.bytes
    size = data.length
    tail_length = size & 3
    body_length = size - tail_length
    state = @seed & 0xffffffff

    # Body: consume the input four bytes at a time, little-endian.
    (0...body_length).step(4) do |offset|
      word = data[offset] |
             (data[offset + 1] << 8) |
             (data[offset + 2] << 16) |
             (data[offset + 3] << 24)

      state ^= scramble(word)
      state = rotl32(state, 13)
      state = (imul(state, 5) + 0xe6546b64) & 0xffffffff
    end

    # Tail: the 1-3 leftover bytes are packed little-endian and mixed once.
    leftover = data[body_length, tail_length]
    unless leftover.empty?
      word = leftover.each_with_index.sum { |byte, position| byte << (8 * position) }
      state ^= scramble(word)
    end

    # Finalisation (avalanche) steps.
    state ^= size
    state ^= state >> 16
    state = imul(state, 0x85ebca6b)
    state ^= state >> 13
    state = imul(state, 0xc2b2ae35)
    state ^= state >> 16
    state & 0xffffffff
  end

  # Applies the per-word mixing: multiply, rotate left by 15, multiply.
  def scramble(word)
    imul(rotl32(imul(word, 0xcc9e2d51), 15), 0x1b873593)
  end

  # Multiplies two integers, keeping only the low 32 bits of the product.
  def imul(a, b)
    ((a & 0xffffffff) * (b & 0xffffffff)) & 0xffffffff
  end

  # Rotates the 32-bit value +x+ left by +r+ bits.
  def rotl32(x, r)
    ((x << r) | (x >> (32 - r))) & 0xffffffff
  end
end
|
|
86
|
+
|
|
87
|
+
# Splits text into lower-cased, stopword-filtered, stemmed tokens.
class Bm25Tokenizer
  # @param stemmer [Object] object used for stemming; see #stem for the
  #   methods that are probed on it
  # @param stopwords [Enumerable] words to drop (compared lower-cased)
  # @param token_max_length [Integer] tokens longer than this are dropped
  def initialize(stemmer, stopwords, token_max_length)
    @stemmer = stemmer
    @stopwords = Set.new(stopwords) { |word| word.to_s.downcase }
    @token_max_length = token_max_length
  end

  # @param text [#to_s] input text
  # @return [Array<String>] stemmed tokens in their original order
  def tokenize(text)
    candidates = simple_tokenize(remove_non_alphanumeric(text))

    candidates.each_with_object([]) do |candidate, kept|
      next if skip?(candidate)

      stemmed = stem(candidate).strip
      kept << stemmed unless stemmed.empty?
    end
  end

  private

  # True for tokens that are empty, stopwords, or longer than the limit.
  def skip?(candidate)
    candidate.empty? ||
      @stopwords.include?(candidate) ||
      candidate.length > @token_max_length
  end

  # Replaces every run of characters that are not letters, numbers,
  # underscores or whitespace with a single space.
  def remove_non_alphanumeric(text)
    text.to_s.gsub(/[^\p{L}\p{N}_\s]+/u, " ")
  end

  # Lower-cases the text and splits it on whitespace runs.
  def simple_tokenize(text)
    text.downcase.split(/\s+/)
  end

  # Stems via the first available hook: stemmer#stem, stemmer#stem_word,
  # then token#porter2_stem; otherwise returns the token unchanged.
  def stem(token)
    return @stemmer.stem(token) if @stemmer.respond_to?(:stem)
    return @stemmer.stem_word(token) if @stemmer.respond_to?(:stem_word)
    return token.porter2_stem if token.respond_to?(:porter2_stem)

    token
  end
end
|
|
131
|
+
|
|
132
|
+
# Adapter that exposes a +stem+ method on top of String#porter2_stem.
# NOTE(review): porter2_stem is not defined in this file; presumably it is
# supplied by a stemming library loaded elsewhere -- confirm.
class Porter2StemmerAdapter
  # @param token [#to_s] word to stem
  # @return the result of calling +porter2_stem+ on the stringified token
  def stem(token)
    word = token.to_s
    word.porter2_stem
  end
end
|
|
137
|
+
|
|
138
|
+
# Hash-key wrapper for a token: identity is determined solely by the numeric
# token hash, while the label is carried along as payload. Two instances with
# the same hash and different labels therefore compare equal.
class HashedToken
  # +hash+ is the stored numeric token hash and, because it overrides
  # Object#hash, also the value Ruby's Hash uses for bucketing. The reader
  # is generated here only; no separate `def hash` is needed.
  attr_reader :hash, :label

  # @param hash [Integer] numeric hash of the token
  # @param label [String, nil] optional human-readable token text
  def initialize(hash, label)
    @hash = hash
    @label = label
  end

  # @return [Boolean] true when +other+ is a HashedToken with the same hash
  def eql?(other)
    other.is_a?(HashedToken) && other.hash == @hash
  end

  # Same comparison as #eql?.
  def ==(other)
    eql?(other)
  end
end
|
|
158
|
+
|
|
159
|
+
# Sparse embedding function that turns each document into a vector of
# per-term scores. Terms are tokenized with Bm25Tokenizer, hashed with
# Murmur3AbsHasher to obtain the vector indices, and scored with the
# term-frequency saturation part of BM25 (parameters +k+ and +b+, with
# document length normalised against +avg_doc_length+).
class ChromaBm25EmbeddingFunction
  NAME = "chroma_bm25"

  DEFAULT_K = 1.2
  DEFAULT_B = 0.75
  DEFAULT_AVG_DOC_LENGTH = 256.0
  DEFAULT_TOKEN_MAX_LENGTH = 40

  # Stopwords used when the caller does not supply a list.
  DEFAULT_CHROMA_BM25_STOPWORDS = [
    "a", "about", "above", "after", "again", "against", "ain", "all", "am", "an",
    "and", "any", "are", "aren", "aren't", "as", "at", "be", "because", "been",
    "before", "being", "below", "between", "both", "but", "by", "can", "couldn", "couldn't",
    "d", "did", "didn", "didn't", "do", "does", "doesn", "doesn't", "doing", "don",
    "don't", "down", "during", "each", "few", "for", "from", "further", "had", "hadn",
    "hadn't", "has", "hasn", "hasn't", "have", "haven", "haven't", "having", "he", "her",
    "here", "hers", "herself", "him", "himself", "his", "how", "i", "if", "in",
    "into", "is", "isn", "isn't", "it", "it's", "its", "itself", "just", "ll",
    "m", "ma", "me", "mightn", "mightn't", "more", "most", "mustn", "mustn't", "my",
    "myself", "needn", "needn't", "no", "nor", "not", "now", "o", "of", "off",
    "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over",
    "own", "re", "s", "same", "shan", "shan't", "she", "she's", "should", "should've",
    "shouldn", "shouldn't", "so", "some", "such", "t", "than", "that", "that'll", "the",
    "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this", "those",
    "through", "to", "too", "under", "until", "up", "ve", "very", "was", "wasn",
    "wasn't", "we", "were", "weren", "weren't", "what", "when", "where", "which", "while",
    "who", "whom", "why", "will", "with", "won", "won't", "wouldn", "wouldn't", "y",
    "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"
  ].freeze

  attr_reader :k, :b, :avg_doc_length, :token_max_length, :stopwords, :include_tokens

  # @param k [Numeric] BM25 term-frequency saturation parameter
  # @param b [Numeric] BM25 length-normalisation parameter
  # @param avg_doc_length [Numeric] document length used for normalisation
  # @param token_max_length [Integer] tokens longer than this are discarded
  # @param stopwords [Array, nil] custom stopwords; nil selects
  #   DEFAULT_CHROMA_BM25_STOPWORDS (and #stopwords then returns nil)
  # @param include_tokens [Boolean] whether vectors carry token labels
  def initialize(k: DEFAULT_K, b: DEFAULT_B, avg_doc_length: DEFAULT_AVG_DOC_LENGTH, token_max_length: DEFAULT_TOKEN_MAX_LENGTH, stopwords: nil, include_tokens: false)
    @k = k.to_f
    @b = b.to_f
    @avg_doc_length = avg_doc_length.to_f
    @token_max_length = token_max_length.to_i
    @include_tokens = !!include_tokens

    @stopwords = stopwords ? stopwords.map(&:to_s) : nil
    @stopword_list = @stopwords || DEFAULT_CHROMA_BM25_STOPWORDS

    @hasher = Murmur3AbsHasher.new
    @stemmer = Porter2StemmerAdapter.new
    # Built once: the tokenizer converts the stopword list into a Set, which
    # would otherwise be repeated for every encoded document. The stopword
    # list is therefore snapshotted at construction time.
    @tokenizer = Bm25Tokenizer.new(@stemmer, @stopword_list, @token_max_length)
  end

  # Embeds each document (converted with +to_s+).
  #
  # @param documents [Array, nil]
  # @return [Array] one sparse vector per document; [] for nil/empty input
  def call(documents)
    return [] if documents.nil? || documents.empty?

    documents.map { |doc| encode(doc.to_s) }
  end

  # Queries are embedded exactly like documents.
  def embed_query(documents)
    call(documents)
  end

  # @return [String] the registry name of this embedding function
  def name
    NAME
  end

  # @return [Hash] string-keyed configuration; "stopwords" is present only
  #   when a custom list was supplied
  def get_config
    config = {
      "k" => @k,
      "b" => @b,
      "avg_doc_length" => @avg_doc_length,
      "token_max_length" => @token_max_length,
      "include_tokens" => @include_tokens
    }
    config["stopwords"] = @stopwords.dup if @stopwords
    config
  end

  # Delegates to EmbeddingFunctions.validate_config_schema (defined outside
  # this class).
  def validate_config(config)
    EmbeddingFunctions.validate_config_schema(config, NAME)
  end

  # Raises unless every key of +new_config+ is one of the updatable keys.
  # Keys may be strings or symbols.
  #
  # @raise [ArgumentError] when an unsupported key is present
  def validate_config_update(old_config, new_config)
    mutable_keys = %w[k b avg_doc_length token_max_length stopwords include_tokens]
    new_config.each_key do |key|
      next if mutable_keys.include?(key.to_s)

      raise ArgumentError, "Updating '#{key}' is not supported for #{NAME}"
    end
  end

  # Builds an instance from a configuration hash. String keys take
  # precedence; symbol keys are accepted as a fallback, matching the other
  # embedding functions' builders. Missing values fall back to the defaults.
  #
  # @param config [Hash]
  # @param client [Object, nil] accepted for interface compatibility; unused
  def self.build_from_config(config, client: nil)
    new(
      k: config["k"] || config[:k] || DEFAULT_K,
      b: config["b"] || config[:b] || DEFAULT_B,
      avg_doc_length: config["avg_doc_length"] || config[:avg_doc_length] || DEFAULT_AVG_DOC_LENGTH,
      token_max_length: config["token_max_length"] || config[:token_max_length] || DEFAULT_TOKEN_MAX_LENGTH,
      stopwords: config["stopwords"] || config[:stopwords],
      include_tokens: config.fetch("include_tokens") { config.fetch(:include_tokens, false) },
    )
  end

  # Class-level counterpart of #validate_config.
  def self.validate_config(config)
    EmbeddingFunctions.validate_config_schema(config, NAME)
  end

  private

  # Encodes one document as a sparse vector whose indices are the token
  # hashes in ascending order.
  def encode(text)
    tokens = @tokenizer.tokenize(text)
    return Types::SparseVector.new(indices: [], values: [], labels: nil) if tokens.empty?

    doc_len = tokens.length.to_f
    counts = Hash.new(0)

    # Tokens whose hashes collide share one entry; the first label seen wins
    # because HashedToken equality ignores the label.
    tokens.each do |token|
      token_key = HashedToken.new(@hasher.hash(token), @include_tokens ? token : nil)
      counts[token_key] += 1
    end

    # The length-normalisation term is the same for every term of a document.
    length_norm = @k * (1 - @b + (@b * doc_len) / @avg_doc_length)

    indices = []
    values = []
    labels = @include_tokens ? [] : nil

    counts.keys.sort_by(&:hash).each do |key|
      tf = counts[key].to_f
      indices << key.hash
      values << tf * (@k + 1) / (tf + length_norm)
      labels << key.label if labels && key.label
    end

    Types::SparseVector.new(indices: indices, values: values, labels: labels)
  end
end
|
|
455
|
+
|
|
456
|
+
# Register the BM25 class under its NAME constant ("chroma_bm25").
# register_sparse_embedding_function is defined outside this file.
register_sparse_embedding_function(ChromaBm25EmbeddingFunction::NAME, ChromaBm25EmbeddingFunction)
|
|
457
|
+
# Register the same class under the hyphenated spelling "chroma-bm25" as well.
register_sparse_embedding_function("chroma-bm25", ChromaBm25EmbeddingFunction)
|
|
458
|
+
end
|
|
459
|
+
end
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Chroma
|
|
4
|
+
module EmbeddingFunctions
|
|
5
|
+
# Embedding function that posts texts to https://embed.trychroma.com and
# returns the "embeddings" array from the JSON response. An optional task
# selects per-target ("documents" / "query") instruction strings.
class ChromaCloudQwenEmbeddingFunction
  NAME = "chroma-cloud-qwen"
  DEFAULT_MODEL = "Qwen/Qwen3-Embedding-0.6B"
  DEFAULT_INSTRUCTIONS = {
    "nl_to_code" => {
      "documents" => "",
      "query" => "Given a question about coding, retrieval code or passage that can solve user's question"
    }
  }.freeze

  attr_reader :model, :task, :instructions, :api_key_env_var

  # @param model [String] value sent in the x-chroma-embedding-model header
  # @param task [String, nil] key into +instructions+; nil sends an empty
  #   instruction string
  # @param instructions [Hash] task => { "documents" => ..., "query" => ... }
  # @param api_key_env_var [String] environment variable holding the API key
  # @raise [ArgumentError] when no API key can be found
  def initialize(model: DEFAULT_MODEL, task: nil, instructions: DEFAULT_INSTRUCTIONS, api_key_env_var: "CHROMA_API_KEY")
    @api_key_env_var = api_key_env_var
    @api_key = resolve_api_key

    @model = model
    @task = task
    @instructions = instructions

    @connection = Faraday.new(url: "https://embed.trychroma.com") do |builder|
      builder.headers["x-chroma-token"] = @api_key
      builder.headers["x-chroma-embedding-model"] = @model
      builder.headers["Content-Type"] = "application/json"
    end
  end

  # Embeds +texts+ using the "documents" instruction of the current task.
  #
  # @return [Array] the response's embeddings; [] for nil/empty input
  def call(texts)
    embed(texts, "documents")
  end

  # Embeds +texts+ using the "query" instruction of the current task.
  #
  # @return [Array] the response's embeddings; [] for nil/empty input
  def embed_query(texts)
    embed(texts, "query")
  end

  # @return [String] the registry name of this embedding function
  def name
    NAME
  end

  def default_space
    "cosine"
  end

  def supported_spaces
    %w[cosine l2 ip]
  end

  # @return [Hash] string-keyed configuration (the API key itself is not
  #   included, only the name of its environment variable)
  def get_config
    {
      "api_key_env_var" => @api_key_env_var,
      "model" => @model,
      "task" => @task,
      "instructions" => @instructions
    }
  end

  # Delegates to EmbeddingFunctions.validate_config_schema (defined outside
  # this class).
  def validate_config(config)
    EmbeddingFunctions.validate_config_schema(config, "chroma-cloud-qwen")
  end

  # Builds an instance from a string- or symbol-keyed configuration hash.
  #
  # @param client [Object, nil] accepted for interface compatibility; unused
  # @raise [ArgumentError] when the config has no model
  def self.build_from_config(config, client: nil)
    model = config["model"] || config[:model]
    task = config["task"] || config[:task]
    instructions = config["instructions"] || config[:instructions]
    api_key_env_var = config["api_key_env_var"] || config[:api_key_env_var] || "CHROMA_API_KEY"

    raise ArgumentError, "Config is missing required field 'model'" if model.nil?

    ChromaCloudQwenEmbeddingFunction.new(
      model: model,
      task: task,
      instructions: instructions || DEFAULT_INSTRUCTIONS,
      api_key_env_var: api_key_env_var,
    )
  end

  # Class-level counterpart of #validate_config.
  def self.validate_config(config)
    EmbeddingFunctions.validate_config_schema(config, "chroma-cloud-qwen")
  end

  # Rejects updates that change model, task or instructions.
  #
  # @raise [ArgumentError] when one of those keys differs between configs
  def validate_config_update(old_config, new_config)
    %w[model task instructions].each do |key|
      next unless new_config.key?(key)
      raise ArgumentError, "The #{key} cannot be changed after initialization." if new_config[key] != old_config[key]
    end
  end

  private

  # Reads the API key from the configured environment variable, falling back
  # to Chroma::SharedState.cloud_api_key when the variable is unset OR set
  # to an empty string (an empty value must not mask the shared key).
  def resolve_api_key
    from_env = ENV[@api_key_env_var]
    key = from_env.nil? || from_env.empty? ? Chroma::SharedState.cloud_api_key : from_env
    raise ArgumentError, "API key not found in #{@api_key_env_var} or any existing clients" if key.nil? || key.empty?

    key
  end

  # Shared request path for #call and #embed_query.
  def embed(texts, target)
    return [] if texts.nil? || texts.empty?

    payload = {
      "instructions" => instruction_for(target),
      "texts" => texts
    }

    response = @connection.post do |req|
      req.body = JSON.generate(payload)
    end

    parse_response(response)
  end

  # Looks up the instruction string for the current task and +target+;
  # returns "" when there is no task, no instruction table, or no entry.
  def instruction_for(target)
    per_task = @task && @instructions && @instructions[@task]
    return "" unless per_task

    per_task[target] || ""
  end

  # @raise [RuntimeError] on a non-success status, a body without
  #   "embeddings", or a body that is not valid JSON
  def parse_response(response)
    unless response.success?
      raise RuntimeError, "Failed to get embeddings from Chroma Cloud API: HTTP #{response.status} - #{response.body}"
    end

    data = JSON.parse(response.body)
    raise RuntimeError, data["error"] || "Unknown error" unless data["embeddings"]

    data["embeddings"]
  rescue JSON::ParserError
    raise RuntimeError, "Invalid JSON response from Chroma Cloud API"
  end
end
|
|
136
|
+
|
|
137
|
+
# Register the Qwen class under its NAME constant ("chroma-cloud-qwen").
# register_embedding_function is defined outside this file.
register_embedding_function(ChromaCloudQwenEmbeddingFunction::NAME, ChromaCloudQwenEmbeddingFunction)
|
|
138
|
+
end
|
|
139
|
+
end
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Chroma
|
|
4
|
+
module EmbeddingFunctions
|
|
5
|
+
# Sparse embedding function that posts texts to the /embed_sparse endpoint
# of https://embed.trychroma.com and normalises each returned embedding via
# Types::Validation.normalize_sparse_vector.
class ChromaCloudSpladeEmbeddingFunction
  NAME = "chroma-cloud-splade"
  DEFAULT_MODEL = "prithivida/Splade_PP_en_v1"

  attr_reader :api_key_env_var, :model, :include_tokens

  # @param api_key_env_var [String] environment variable holding the API key
  # @param model [String] value sent in the x-chroma-embedding-model header
  # @param include_tokens [Boolean] whether to request and keep token labels
  # @raise [ArgumentError] when no API key can be found
  def initialize(api_key_env_var: "CHROMA_API_KEY", model: DEFAULT_MODEL, include_tokens: false)
    @api_key_env_var = api_key_env_var
    @api_key = resolve_api_key

    @model = model
    @include_tokens = !!include_tokens

    @connection = Faraday.new(url: "https://embed.trychroma.com") do |builder|
      builder.headers["x-chroma-token"] = @api_key
      builder.headers["x-chroma-embedding-model"] = @model
      builder.headers["Content-Type"] = "application/json"
    end
  end

  # Embeds +texts+.
  #
  # @return [Array] one normalised sparse vector per returned embedding;
  #   [] for nil/empty input
  def call(texts)
    return [] if texts.nil? || texts.empty?

    payload = {
      "texts" => texts,
      "task" => "",
      "target" => "",
      # Sent as the strings "true"/"false", not JSON booleans.
      "fetch_tokens" => @include_tokens ? "true" : "false"
    }

    response = @connection.post("/embed_sparse") do |req|
      req.body = JSON.generate(payload)
    end

    parse_response(response)
  end

  # Queries are embedded exactly like documents.
  def embed_query(texts)
    call(texts)
  end

  # @return [String] the registry name of this embedding function
  def name
    NAME
  end

  # @return [Hash] string-keyed configuration (the API key itself is not
  #   included, only the name of its environment variable)
  def get_config
    {
      "api_key_env_var" => @api_key_env_var,
      "model" => @model,
      "include_tokens" => @include_tokens
    }
  end

  # Delegates to EmbeddingFunctions.validate_config_schema (defined outside
  # this class).
  def validate_config(config)
    EmbeddingFunctions.validate_config_schema(config, "chroma-cloud-splade")
  end

  # Rejects updates that change include_tokens or model.
  #
  # @raise [ArgumentError] when one of those keys differs between configs
  def validate_config_update(old_config, new_config)
    %w[include_tokens model].each do |key|
      next unless new_config.key?(key)
      raise ArgumentError, "Updating '#{key}' is not supported for chroma-cloud-splade" if new_config[key] != old_config[key]
    end
  end

  # Builds an instance from a string- or symbol-keyed configuration hash.
  #
  # @param client [Object, nil] accepted for interface compatibility; unused
  # @raise [ArgumentError] when model or api_key_env_var is missing
  def self.build_from_config(config, client: nil)
    api_key_env_var = config["api_key_env_var"] || config[:api_key_env_var]
    model = config["model"] || config[:model]
    include_tokens = config.key?("include_tokens") ? config["include_tokens"] : config[:include_tokens]

    raise ArgumentError, "model must be provided in config" if model.nil?
    raise ArgumentError, "api_key_env_var must be provided in config" if api_key_env_var.nil? || api_key_env_var.to_s.empty?

    ChromaCloudSpladeEmbeddingFunction.new(
      api_key_env_var: api_key_env_var,
      model: model,
      include_tokens: include_tokens || false,
    )
  end

  # Class-level counterpart of #validate_config.
  def self.validate_config(config)
    EmbeddingFunctions.validate_config_schema(config, "chroma-cloud-splade")
  end

  private

  # Reads the API key from the configured environment variable, falling back
  # to Chroma::SharedState.cloud_api_key when the variable is unset OR set
  # to an empty string (an empty value must not mask the shared key).
  def resolve_api_key
    from_env = ENV[@api_key_env_var]
    key = from_env.nil? || from_env.empty? ? Chroma::SharedState.cloud_api_key : from_env
    raise ArgumentError, "API key not found in #{@api_key_env_var} or any existing clients" if key.nil? || key.empty?

    key
  end

  # Converts the response into normalised sparse vectors. A body without an
  # "embeddings" entry yields []. Labels are taken from "labels" (or
  # "tokens") only when include_tokens is set.
  #
  # @raise [RuntimeError] on a non-success status or invalid JSON
  # @raise [ArgumentError] when an embedding has an unexpected shape
  def parse_response(response)
    unless response.success?
      raise RuntimeError, "Failed to get embeddings from Chroma Cloud API: HTTP #{response.status} - #{response.body}"
    end

    data = JSON.parse(response.body)
    raw_embeddings = data["embeddings"] || []

    raw_embeddings.map do |embedding|
      if embedding.is_a?(Hash)
        indices = embedding["indices"] || []
        values = embedding["values"] || []
        labels = @include_tokens ? (embedding["labels"] || embedding["tokens"]) : nil
        Types::Validation.normalize_sparse_vector(indices: indices, values: values, labels: labels)
      elsif embedding.is_a?(Types::SparseVector)
        Types::Validation.normalize_sparse_vector(
          indices: embedding.indices,
          values: embedding.values,
          labels: @include_tokens ? embedding.labels : nil,
        )
      else
        raise ArgumentError, "Unexpected sparse embedding format: #{embedding.inspect}"
      end
    end
  rescue JSON::ParserError
    raise RuntimeError, "Invalid JSON response from Chroma Cloud API"
  end
end
|
|
118
|
+
|
|
119
|
+
# Register the SPLADE class under its NAME constant ("chroma-cloud-splade").
# register_sparse_embedding_function is defined outside this file.
register_sparse_embedding_function(ChromaCloudSpladeEmbeddingFunction::NAME, ChromaCloudSpladeEmbeddingFunction)
|
|
120
|
+
end
|
|
121
|
+
end
|