chub-dev 0.1.0 → 0.1.2-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. package/README.md +55 -0
  2. package/bin/chub-mcp +2 -0
  3. package/dist/airtable/docs/database/javascript/DOC.md +1437 -0
  4. package/dist/airtable/docs/database/python/DOC.md +1735 -0
  5. package/dist/amplitude/docs/analytics/javascript/DOC.md +1282 -0
  6. package/dist/amplitude/docs/analytics/python/DOC.md +1199 -0
  7. package/dist/anthropic/docs/claude-api/javascript/DOC.md +503 -0
  8. package/dist/anthropic/docs/claude-api/python/DOC.md +389 -0
  9. package/dist/asana/docs/tasks/DOC.md +1396 -0
  10. package/dist/assemblyai/docs/transcription/DOC.md +1043 -0
  11. package/dist/atlassian/docs/confluence/javascript/DOC.md +1347 -0
  12. package/dist/atlassian/docs/confluence/python/DOC.md +1604 -0
  13. package/dist/auth0/docs/identity/javascript/DOC.md +968 -0
  14. package/dist/auth0/docs/identity/python/DOC.md +1199 -0
  15. package/dist/aws/docs/s3/javascript/DOC.md +1773 -0
  16. package/dist/aws/docs/s3/python/DOC.md +1807 -0
  17. package/dist/binance/docs/trading/javascript/DOC.md +1315 -0
  18. package/dist/binance/docs/trading/python/DOC.md +1454 -0
  19. package/dist/braintree/docs/gateway/javascript/DOC.md +1278 -0
  20. package/dist/braintree/docs/gateway/python/DOC.md +1179 -0
  21. package/dist/chromadb/docs/embeddings-db/javascript/DOC.md +1263 -0
  22. package/dist/chromadb/docs/embeddings-db/python/DOC.md +1707 -0
  23. package/dist/clerk/docs/auth/javascript/DOC.md +1220 -0
  24. package/dist/clerk/docs/auth/python/DOC.md +274 -0
  25. package/dist/cloudflare/docs/workers/javascript/DOC.md +918 -0
  26. package/dist/cloudflare/docs/workers/python/DOC.md +994 -0
  27. package/dist/cockroachdb/docs/distributed-db/DOC.md +1500 -0
  28. package/dist/cohere/docs/llm/DOC.md +1335 -0
  29. package/dist/datadog/docs/monitoring/javascript/DOC.md +1740 -0
  30. package/dist/datadog/docs/monitoring/python/DOC.md +1815 -0
  31. package/dist/deepgram/docs/speech/javascript/DOC.md +885 -0
  32. package/dist/deepgram/docs/speech/python/DOC.md +685 -0
  33. package/dist/deepl/docs/translation/javascript/DOC.md +887 -0
  34. package/dist/deepl/docs/translation/python/DOC.md +944 -0
  35. package/dist/deepseek/docs/llm/DOC.md +1220 -0
  36. package/dist/directus/docs/headless-cms/javascript/DOC.md +1128 -0
  37. package/dist/directus/docs/headless-cms/python/DOC.md +1276 -0
  38. package/dist/discord/docs/bot/javascript/DOC.md +1090 -0
  39. package/dist/discord/docs/bot/python/DOC.md +1130 -0
  40. package/dist/elasticsearch/docs/search/DOC.md +1634 -0
  41. package/dist/elevenlabs/docs/text-to-speech/javascript/DOC.md +336 -0
  42. package/dist/elevenlabs/docs/text-to-speech/python/DOC.md +552 -0
  43. package/dist/firebase/docs/auth/DOC.md +1015 -0
  44. package/dist/gemini/docs/genai/javascript/DOC.md +691 -0
  45. package/dist/gemini/docs/genai/python/DOC.md +555 -0
  46. package/dist/github/docs/octokit/DOC.md +1560 -0
  47. package/dist/google/docs/bigquery/javascript/DOC.md +1688 -0
  48. package/dist/google/docs/bigquery/python/DOC.md +1503 -0
  49. package/dist/hubspot/docs/crm/javascript/DOC.md +1805 -0
  50. package/dist/hubspot/docs/crm/python/DOC.md +2033 -0
  51. package/dist/huggingface/docs/transformers/DOC.md +948 -0
  52. package/dist/intercom/docs/messaging/javascript/DOC.md +1844 -0
  53. package/dist/intercom/docs/messaging/python/DOC.md +1797 -0
  54. package/dist/jira/docs/issues/javascript/DOC.md +1420 -0
  55. package/dist/jira/docs/issues/python/DOC.md +1492 -0
  56. package/dist/kafka/docs/streaming/javascript/DOC.md +1671 -0
  57. package/dist/kafka/docs/streaming/python/DOC.md +1464 -0
  58. package/dist/landingai-ade/docs/api/DOC.md +620 -0
  59. package/dist/landingai-ade/docs/sdk/python/DOC.md +489 -0
  60. package/dist/landingai-ade/docs/sdk/typescript/DOC.md +542 -0
  61. package/dist/landingai-ade/skills/SKILL.md +489 -0
  62. package/dist/launchdarkly/docs/feature-flags/javascript/DOC.md +1191 -0
  63. package/dist/launchdarkly/docs/feature-flags/python/DOC.md +1671 -0
  64. package/dist/linear/docs/tracker/DOC.md +1554 -0
  65. package/dist/livekit/docs/realtime/javascript/DOC.md +303 -0
  66. package/dist/livekit/docs/realtime/python/DOC.md +163 -0
  67. package/dist/mailchimp/docs/marketing/DOC.md +1420 -0
  68. package/dist/meilisearch/docs/search/DOC.md +1241 -0
  69. package/dist/microsoft/docs/onedrive/javascript/DOC.md +1421 -0
  70. package/dist/microsoft/docs/onedrive/python/DOC.md +1549 -0
  71. package/dist/mongodb/docs/atlas/DOC.md +2041 -0
  72. package/dist/notion/docs/workspace-api/javascript/DOC.md +1435 -0
  73. package/dist/notion/docs/workspace-api/python/DOC.md +1400 -0
  74. package/dist/okta/docs/identity/javascript/DOC.md +1171 -0
  75. package/dist/okta/docs/identity/python/DOC.md +1401 -0
  76. package/dist/openai/docs/chat/javascript/DOC.md +407 -0
  77. package/dist/openai/docs/chat/python/DOC.md +568 -0
  78. package/dist/paypal/docs/checkout/DOC.md +278 -0
  79. package/dist/pinecone/docs/sdk/javascript/DOC.md +984 -0
  80. package/dist/pinecone/docs/sdk/python/DOC.md +1395 -0
  81. package/dist/plaid/docs/banking/javascript/DOC.md +1163 -0
  82. package/dist/plaid/docs/banking/python/DOC.md +1203 -0
  83. package/dist/playwright-community/skills/login-flows/SKILL.md +108 -0
  84. package/dist/postmark/docs/transactional-email/DOC.md +1168 -0
  85. package/dist/prisma/docs/orm/javascript/DOC.md +1419 -0
  86. package/dist/prisma/docs/orm/python/DOC.md +1317 -0
  87. package/dist/qdrant/docs/vector-search/javascript/DOC.md +1221 -0
  88. package/dist/qdrant/docs/vector-search/python/DOC.md +1653 -0
  89. package/dist/rabbitmq/docs/message-queue/javascript/DOC.md +1193 -0
  90. package/dist/rabbitmq/docs/message-queue/python/DOC.md +1243 -0
  91. package/dist/razorpay/docs/payments/javascript/DOC.md +1219 -0
  92. package/dist/razorpay/docs/payments/python/DOC.md +1330 -0
  93. package/dist/redis/docs/key-value/javascript/DOC.md +1851 -0
  94. package/dist/redis/docs/key-value/python/DOC.md +2054 -0
  95. package/dist/registry.json +2817 -0
  96. package/dist/replicate/docs/model-hosting/DOC.md +1318 -0
  97. package/dist/resend/docs/email/DOC.md +1271 -0
  98. package/dist/salesforce/docs/crm/javascript/DOC.md +1241 -0
  99. package/dist/salesforce/docs/crm/python/DOC.md +1183 -0
  100. package/dist/search-index.json +1 -0
  101. package/dist/sendgrid/docs/email-api/javascript/DOC.md +371 -0
  102. package/dist/sendgrid/docs/email-api/python/DOC.md +656 -0
  103. package/dist/sentry/docs/error-tracking/javascript/DOC.md +1073 -0
  104. package/dist/sentry/docs/error-tracking/python/DOC.md +1309 -0
  105. package/dist/shopify/docs/storefront/DOC.md +457 -0
  106. package/dist/slack/docs/workspace/javascript/DOC.md +933 -0
  107. package/dist/slack/docs/workspace/python/DOC.md +271 -0
  108. package/dist/square/docs/payments/javascript/DOC.md +1855 -0
  109. package/dist/square/docs/payments/python/DOC.md +1728 -0
  110. package/dist/stripe/docs/api/DOC.md +1727 -0
  111. package/dist/stripe/docs/payments/DOC.md +1726 -0
  112. package/dist/stytch/docs/auth/javascript/DOC.md +1813 -0
  113. package/dist/stytch/docs/auth/python/DOC.md +1962 -0
  114. package/dist/supabase/docs/client/DOC.md +1606 -0
  115. package/dist/twilio/docs/messaging/python/DOC.md +469 -0
  116. package/dist/twilio/docs/messaging/typescript/DOC.md +946 -0
  117. package/dist/vercel/docs/platform/DOC.md +1940 -0
  118. package/dist/weaviate/docs/vector-db/javascript/DOC.md +1268 -0
  119. package/dist/weaviate/docs/vector-db/python/DOC.md +1388 -0
  120. package/dist/zendesk/docs/support/javascript/DOC.md +2150 -0
  121. package/dist/zendesk/docs/support/python/DOC.md +2297 -0
  122. package/package.json +22 -6
  123. package/skills/get-api-docs/SKILL.md +84 -0
  124. package/src/commands/annotate.js +83 -0
  125. package/src/commands/build.js +12 -1
  126. package/src/commands/feedback.js +150 -0
  127. package/src/commands/get.js +83 -42
  128. package/src/commands/search.js +7 -0
  129. package/src/index.js +43 -17
  130. package/src/lib/analytics.js +90 -0
  131. package/src/lib/annotations.js +57 -0
  132. package/src/lib/bm25.js +170 -0
  133. package/src/lib/cache.js +69 -6
  134. package/src/lib/config.js +8 -3
  135. package/src/lib/identity.js +99 -0
  136. package/src/lib/registry.js +103 -20
  137. package/src/lib/telemetry.js +86 -0
  138. package/src/mcp/server.js +177 -0
  139. package/src/mcp/tools.js +251 -0
@@ -0,0 +1,1707 @@
1
+ ---
2
+ name: embeddings-db
3
+ description: "ChromaDB Python SDK for vector embeddings and AI-powered search"
4
+ metadata:
5
+ languages: "python"
6
+ versions: "1.2.1"
7
+ updated-on: "2026-03-02"
8
+ source: maintainer
9
+ tags: "chromadb,embeddings,vector-db,ai,search"
10
+ ---
11
+
12
+ # ChromaDB Python SDK - v1.2.1
13
+
14
+ ## Golden Rule
15
+
16
+ **ALWAYS use the official `chromadb` package (v1.2.1 or later) for Python projects.**
17
+
18
+ ```bash
19
+ pip install chromadb
20
+ ```
21
+
22
+ **DO NOT use:**
23
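+ - The thin `chromadb-client` package (HTTP-only; it has no local `EphemeralClient`/`PersistentClient` support) unless you only ever connect to a remote server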
24
+ - Old versions below 1.0
25
+ - Community wrappers that may be outdated
26
+
27
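+ ChromaDB (Chroma) is an open-source, AI-native vector database. It stores embeddings, builds the index, and performs vector similarity search; when you add raw documents it also computes their embeddings automatically using the collection's embedding function.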
28
+
29
+ **Requires Python >= 3.9**
30
+
31
+ ---
32
+
33
+ ## Installation
34
+
35
+ ### Using pip
36
+
37
+ ```bash
38
+ pip install chromadb
39
+ ```
40
+
41
+ ### Using Poetry
42
+
43
+ ```bash
44
+ poetry add chromadb
45
+ ```
46
+
47
+ ### Using uv
48
+
49
+ ```bash
50
+ uv pip install chromadb
51
+ ```
52
+
53
+ ### Install with Specific Version
54
+
55
+ ```bash
56
+ pip install chromadb==1.2.1
57
+ ```
58
+
59
+ ---
60
+
61
+ ## Initialization
62
+
63
+ ### Ephemeral Client (In-Memory)
64
+
65
+ ```python
66
+ import chromadb
67
+
68
+ client = chromadb.EphemeralClient()
69
+ ```
70
+
71
+ Use for experimentation, testing, and prototyping. Data is lost when the process ends.
72
+
73
+ ### Persistent Client (Local Storage)
74
+
75
+ ```python
76
+ import chromadb
77
+
78
+ client = chromadb.PersistentClient(path="./chroma_data")
79
+ ```
80
+
81
+ Stores data locally at the specified path. Creates the directory if it doesn't exist.
82
+
83
+ ### Persistent Client with Default Path
84
+
85
+ ```python
86
+ import chromadb
87
+
88
+ client = chromadb.PersistentClient()
89
+ ```
90
+
91
+ Defaults to `./chroma` in the current working directory.
92
+
93
+ ### HTTP Client (Remote Server)
94
+
95
+ ```python
96
+ import chromadb
97
+
98
+ client = chromadb.HttpClient(host="localhost", port=8000)
99
+ ```
100
+
101
+ Connects to a remote ChromaDB server.
102
+
103
+ ### HTTP Client with Custom Configuration
104
+
105
+ ```python
106
+ import chromadb
107
+ from chromadb.config import Settings
108
+
109
+ client = chromadb.HttpClient(
110
+ host="localhost",
111
+ port=8000,
112
+ ssl=False,
113
+ headers={"Authorization": "Bearer token"},
114
+ settings=Settings(),
115
+ tenant="default_tenant",
116
+ database="default_database"
117
+ )
118
+ ```
119
+
120
+ ---
121
+
122
+ ## Running ChromaDB Server
123
+
124
+ ### Local Server
125
+
126
+ ```bash
127
+ chroma run --path ./chroma_data
128
+ ```
129
+
130
+ Starts the server at `http://localhost:8000`.
131
+
132
+ ### Docker
133
+
134
+ ```bash
135
+ docker pull chromadb/chroma
136
+ docker run -p 8000:8000 chromadb/chroma
137
+ ```
138
+
139
+ ---
140
+
141
+ ## Collections
142
+
143
+ ### Create a Collection
144
+
145
+ ```python
146
+ collection = client.create_collection(name="my_collection")
147
+ ```
148
+
149
+ ### Create Collection with Distance Metric
150
+
151
+ ```python
152
+ collection = client.create_collection(
153
+ name="my_collection",
154
+ metadata={"hnsw:space": "cosine"}
155
+ )
156
+ ```
157
+
158
+ **Distance Metrics:**
159
+ - `cosine`: Cosine similarity (best for text, normalized vectors)
160
+ - `l2`: Squared Euclidean (L2) distance (default, sensitive to magnitude)
161
+ - `ip`: Inner product (for recommendation systems)
162
+
163
+ ### Get an Existing Collection
164
+
165
+ ```python
166
+ collection = client.get_collection(name="my_collection")
167
+ ```
168
+
169
+ ### Get or Create Collection
170
+
171
+ ```python
172
+ collection = client.get_or_create_collection(name="my_collection")
173
+ ```
174
+
175
+ ### List All Collections
176
+
177
+ ```python
178
+ collections = client.list_collections()
179
+ for col in collections:
180
+ print(col.name)
181
+ ```
182
+
183
+ ### Delete a Collection
184
+
185
+ ```python
186
+ client.delete_collection(name="my_collection")
187
+ ```
188
+
189
+ ---
190
+
191
+ ## Adding Data
192
+
193
+ ### Add Documents (Auto-Embedding)
194
+
195
+ ```python
196
+ collection.add(
197
+ ids=["id1", "id2", "id3"],
198
+ documents=[
199
+ "This is a document about pineapples",
200
+ "This is a document about oranges",
201
+ "This is a document about apples"
202
+ ]
203
+ )
204
+ ```
205
+
206
+ ChromaDB automatically embeds the documents using the default embedding function.
207
+
208
+ ### Add with Metadata
209
+
210
+ ```python
211
+ collection.add(
212
+ ids=["id1", "id2", "id3"],
213
+ documents=[
214
+ "This is a document about pineapples",
215
+ "This is a document about oranges",
216
+ "This is a document about apples"
217
+ ],
218
+ metadatas=[
219
+ {"category": "tropical", "color": "yellow"},
220
+ {"category": "citrus", "color": "orange"},
221
+ {"category": "temperate", "color": "red"}
222
+ ]
223
+ )
224
+ ```
225
+
226
+ ### Add with Custom Embeddings
227
+
228
+ ```python
229
+ collection.add(
230
+ ids=["id1", "id2"],
231
+ embeddings=[
232
+ [1.5, 2.9, 3.4, 1.2, 0.8],
233
+ [9.8, 2.3, 2.9, 4.1, 3.3]
234
+ ],
235
+ documents=["Document one", "Document two"],
236
+ metadatas=[
237
+ {"source": "manual"},
238
+ {"source": "manual"}
239
+ ]
240
+ )
241
+ ```
242
+
243
+ ### Batch Adding (Large Datasets)
244
+
245
+ ```python
246
+ batch_size = 5000
247
+ for i in range(0, len(documents), batch_size):
248
+ batch_docs = documents[i:i + batch_size]
249
+ batch_ids = [f"id{j}" for j in range(i, i + len(batch_docs))]
250
+
251
+ collection.add(
252
+ ids=batch_ids,
253
+ documents=batch_docs
254
+ )
255
+ ```
256
+
257
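+ ChromaDB enforces a maximum number of records per `add` call (query it with `client.get_max_batch_size()`), so large datasets (100k+ documents) must be split into batches as shown above.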
258
+
259
+ ---
260
+
261
+ ## Querying Data
262
+
263
+ ### Query with Text (Auto-Embedding)
264
+
265
+ ```python
266
+ results = collection.query(
267
+ query_texts=["What fruits are tropical?"],
268
+ n_results=2
269
+ )
270
+
271
+ print(results)
272
+ ```
273
+
274
+ **Response Structure:**
275
+
276
+ ```python
277
+ {
278
+ 'ids': [['id1', 'id2']],
279
+ 'distances': [[0.1234, 0.5678]],
280
+ 'documents': [['This is a document about pineapples', 'This is...']],
281
+ 'metadatas': [[{'category': 'tropical', 'color': 'yellow'}, {...}]],
282
+ 'embeddings': None # Not included by default
283
+ }
284
+ ```
285
+
286
+ ### Query with Multiple Texts
287
+
288
+ ```python
289
+ results = collection.query(
290
+ query_texts=[
291
+ "What fruits are tropical?",
292
+ "What fruits are citrus?"
293
+ ],
294
+ n_results=2
295
+ )
296
+ ```
297
+
298
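+ Returns up to `n_results` matches for each query text. Each field in the response is a list of lists, with one inner list per query, in the same order as `query_texts`.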
299
+
300
+ ### Query with Custom Embeddings
301
+
302
+ ```python
303
+ results = collection.query(
304
+ query_embeddings=[[1.5, 2.9, 3.4, 1.2, 0.8]],
305
+ n_results=3
306
+ )
307
+ ```
308
+
309
+ ### Query with Metadata Filters
310
+
311
+ ```python
312
+ results = collection.query(
313
+ query_texts=["What fruits are available?"],
314
+ n_results=5,
315
+ where={"category": "tropical"}
316
+ )
317
+ ```
318
+
319
+ ### Complex Metadata Filtering
320
+
321
+ ```python
322
+ # Using $or operator
323
+ results = collection.query(
324
+ query_texts=["Find fruits"],
325
+ n_results=5,
326
+ where={
327
+ "$or": [
328
+ {"category": "tropical"},
329
+ {"category": "citrus"}
330
+ ]
331
+ }
332
+ )
333
+
334
+ # Using $and operator
335
+ results = collection.query(
336
+ query_texts=["Find fruits"],
337
+ n_results=5,
338
+ where={
339
+ "$and": [
340
+ {"category": "tropical"},
341
+ {"color": "yellow"}
342
+ ]
343
+ }
344
+ )
345
+
346
+ # Using comparison operators
347
+ results = collection.query(
348
+ query_texts=["Find items"],
349
+ n_results=5,
350
+ where={
351
+ "price": {"$gt": 10} # $gt, $gte, $lt, $lte, $ne, $eq
352
+ }
353
+ )
354
+ ```
355
+
356
+ ### Query with Document Content Filters
357
+
358
+ ```python
359
+ results = collection.query(
360
+ query_texts=["Find documents"],
361
+ n_results=5,
362
+ where_document={"$contains": "pineapple"}
363
+ )
364
+
365
+ # Using $not_contains
366
+ results = collection.query(
367
+ query_texts=["Find documents"],
368
+ n_results=5,
369
+ where_document={"$not_contains": "apple"}
370
+ )
371
+ ```
372
+
373
+ ### Query with Include Options
374
+
375
+ ```python
376
+ results = collection.query(
377
+ query_texts=["What fruits are tropical?"],
378
+ n_results=2,
379
+ include=["documents", "metadatas", "distances", "embeddings"]
380
+ )
381
+ ```
382
+
383
+ **Include Options:**
384
+ - `documents`: The document text (included by default)
385
+ - `metadatas`: Metadata for each document (included by default)
386
+ - `distances`: Distance scores between the query and each result; lower means more similar (included by default)
387
+ - `embeddings`: Vector embeddings (not included by default for performance)
388
+
389
+ ---
390
+
391
+ ## Getting Data
392
+
393
+ ### Get Documents by IDs
394
+
395
+ ```python
396
+ results = collection.get(
397
+ ids=["id1", "id2"]
398
+ )
399
+
400
+ print(results)
401
+ ```
402
+
403
+ ### Get All Documents
404
+
405
+ ```python
406
+ results = collection.get()
407
+ ```
408
+
409
+ Returns all documents in the collection.
410
+
411
+ ### Get with Metadata Filter
412
+
413
+ ```python
414
+ results = collection.get(
415
+ where={"category": "tropical"}
416
+ )
417
+ ```
418
+
419
+ ### Get with Document Filter
420
+
421
+ ```python
422
+ results = collection.get(
423
+ where_document={"$contains": "pineapple"}
424
+ )
425
+ ```
426
+
427
+ ### Get with Limit and Offset
428
+
429
+ ```python
430
+ results = collection.get(
431
+ limit=10,
432
+ offset=20
433
+ )
434
+ ```
435
+
436
+ ### Get with Include Options
437
+
438
+ ```python
439
+ results = collection.get(
440
+ ids=["id1", "id2"],
441
+ include=["documents", "metadatas", "embeddings"]
442
+ )
443
+ ```
444
+
445
+ ---
446
+
447
+ ## Updating Data
448
+
449
+ ### Update Documents
450
+
451
+ ```python
452
+ collection.update(
453
+ ids=["id1", "id2"],
454
+ documents=[
455
+ "Updated document about pineapples",
456
+ "Updated document about oranges"
457
+ ],
458
+ metadatas=[
459
+ {"category": "tropical", "color": "yellow", "updated": True},
460
+ {"category": "citrus", "color": "orange", "updated": True}
461
+ ]
462
+ )
463
+ ```
464
+
465
+ ### Update with Custom Embeddings
466
+
467
+ ```python
468
+ collection.update(
469
+ ids=["id1"],
470
+ embeddings=[[1.1, 2.2, 3.3, 4.4, 5.5]],
471
+ documents=["Updated document"],
472
+ metadatas=[{"source": "updated"}]
473
+ )
474
+ ```
475
+
476
+ ---
477
+
478
+ ## Upsert (Add or Update)
479
+
480
+ ### Upsert Documents
481
+
482
+ ```python
483
+ collection.upsert(
484
+ ids=["id1", "id2", "id3"],
485
+ documents=[
486
+ "Document one - may be new or updated",
487
+ "Document two - may be new or updated",
488
+ "Document three - may be new or updated"
489
+ ],
490
+ metadatas=[
491
+ {"version": 2},
492
+ {"version": 2},
493
+ {"version": 1}
494
+ ]
495
+ )
496
+ ```
497
+
498
+ If the ID exists, it updates the document. If not, it adds it as new.
499
+
500
+ ---
501
+
502
+ ## Deleting Data
503
+
504
+ ### Delete by IDs
505
+
506
+ ```python
507
+ collection.delete(
508
+ ids=["id1", "id2"]
509
+ )
510
+ ```
511
+
512
+ ### Delete with Metadata Filter
513
+
514
+ ```python
515
+ collection.delete(
516
+ where={"category": "tropical"}
517
+ )
518
+ ```
519
+
520
+ ### Delete with Document Filter
521
+
522
+ ```python
523
+ collection.delete(
524
+ where_document={"$contains": "deprecated"}
525
+ )
526
+ ```
527
+
528
+ ### Delete All Documents (Keep Collection)
529
+
530
+ ```python
531
+ collection.delete()
532
+ ```
533
+
534
+ ---
535
+
536
+ ## Collection Utilities
537
+
538
+ ### Count Documents
539
+
540
+ ```python
541
+ count = collection.count()
542
+ print(f"Total documents: {count}")
543
+ ```
544
+
545
+ ### Peek at First Documents
546
+
547
+ ```python
548
+ first_docs = collection.peek(limit=5)
549
+ print(first_docs)
550
+ ```
551
+
552
+ Returns the first 5 documents in the collection.
553
+
554
+ ### Modify Collection Metadata
555
+
556
+ ```python
557
+ collection.modify(
558
+ metadata={
559
+ "description": "Collection of fruit documents",
560
+ "version": "1.0"
561
+ }
562
+ )
563
+ ```
564
+
565
+ ---
566
+
567
+ ## Embedding Functions
568
+
569
+ ### Using Default Embedding Function
570
+
571
+ ```python
572
+ import chromadb
573
+
574
+ client = chromadb.PersistentClient()
575
+ collection = client.create_collection(name="my_collection")
576
+ ```
577
+
578
+ By default, ChromaDB uses the Sentence Transformers `all-MiniLM-L6-v2` model.
579
+
580
+ ### Using OpenAI Embeddings
581
+
582
+ ```python
583
+ import chromadb
584
+ from chromadb.utils import embedding_functions
585
+
586
+ openai_ef = embedding_functions.OpenAIEmbeddingFunction(
587
+ api_key="your-openai-api-key",
588
+ model_name="text-embedding-3-small"
589
+ )
590
+
591
+ collection = client.create_collection(
592
+ name="openai_collection",
593
+ embedding_function=openai_ef
594
+ )
595
+ ```
596
+
597
+ **Available OpenAI Models:**
598
+ - `text-embedding-3-small`
599
+ - `text-embedding-3-large`
600
+ - `text-embedding-ada-002`
601
+
602
+ ### Using Cohere Embeddings
603
+
604
+ ```python
605
+ import chromadb
606
+ from chromadb.utils import embedding_functions
607
+
608
+ cohere_ef = embedding_functions.CohereEmbeddingFunction(
609
+ api_key="your-cohere-api-key",
610
+ model_name="embed-english-v3.0"
611
+ )
612
+
613
+ collection = client.create_collection(
614
+ name="cohere_collection",
615
+ embedding_function=cohere_ef
616
+ )
617
+ ```
618
+
619
+ **Available Cohere Models:**
620
+ - `embed-english-v3.0`
621
+ - `embed-multilingual-v3.0`
622
+ - `embed-english-light-v3.0`
623
+
624
+ ### Using Hugging Face Embeddings
625
+
626
+ ```python
627
+ import chromadb
628
+ from chromadb.utils import embedding_functions
629
+
630
+ huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
631
+ api_key="your-hf-api-key",
632
+ model_name="sentence-transformers/all-MiniLM-L6-v2"
633
+ )
634
+
635
+ collection = client.create_collection(
636
+ name="hf_collection",
637
+ embedding_function=huggingface_ef
638
+ )
639
+ ```
640
+
641
+ ### Using Sentence Transformers (Local)
642
+
643
+ ```python
644
+ import chromadb
645
+ from chromadb.utils import embedding_functions
646
+
647
+ sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
648
+ model_name="all-MiniLM-L6-v2"
649
+ )
650
+
651
+ collection = client.create_collection(
652
+ name="local_collection",
653
+ embedding_function=sentence_transformer_ef
654
+ )
655
+ ```
656
+
657
+ ### Using Ollama Embeddings (Local)
658
+
659
+ ```python
660
+ import chromadb
661
+ from chromadb.utils import embedding_functions
662
+
663
+ ollama_ef = embedding_functions.OllamaEmbeddingFunction(
664
+ url="http://localhost:11434/api/embeddings",
665
+ model_name="llama2"
666
+ )
667
+
668
+ collection = client.create_collection(
669
+ name="ollama_collection",
670
+ embedding_function=ollama_ef
671
+ )
672
+ ```
673
+
674
+ ### Custom Embedding Function
675
+
676
+ ```python
677
+ import chromadb
678
+ from chromadb import Documents, EmbeddingFunction, Embeddings
679
+
680
+ class CustomEmbeddingFunction(EmbeddingFunction):
681
+ def __call__(self, input: Documents) -> Embeddings:
682
+ # Your custom embedding logic here
683
+ embeddings = []
684
+ for doc in input:
685
+ # Example: simple character code embedding (replace with real model)
686
+ embedding = [ord(c) / 255.0 for c in doc[:384]]
687
+ # Pad to fixed length
688
+ embedding.extend([0.0] * (384 - len(embedding)))
689
+ embeddings.append(embedding)
690
+ return embeddings
691
+
692
+ custom_ef = CustomEmbeddingFunction()
693
+ collection = client.create_collection(
694
+ name="custom_collection",
695
+ embedding_function=custom_ef
696
+ )
697
+ ```
698
+
699
+ ---
700
+
701
+ ## Advanced Client Configuration
702
+
703
+ ### PersistentClient with Full Options
704
+
705
+ ```python
706
+ import chromadb
707
+ from chromadb.config import Settings
708
+
709
+ client = chromadb.PersistentClient(
710
+ path="./my_chroma_data",
711
+ settings=Settings(
712
+ anonymized_telemetry=False,
713
+ allow_reset=True
714
+ ),
715
+ tenant="default_tenant",
716
+ database="default_database"
717
+ )
718
+ ```
719
+
720
+ ### HttpClient with Authentication
721
+
722
+ ```python
723
+ import chromadb
724
+ from chromadb.config import Settings
725
+
726
+ client = chromadb.HttpClient(
727
+ host="my-chroma-server.com",
728
+ port=8000,
729
+ ssl=True,
730
+ headers={"Authorization": "Bearer my-token"},
731
+ settings=Settings(
732
+ chroma_client_auth_provider="chromadb.auth.token_authn.TokenAuthClientProvider",
733
+ chroma_client_auth_credentials="my-token"
734
+ )
735
+ )
736
+ ```
737
+
738
+ ### Multi-Tenancy Setup
739
+
740
+ ```python
741
+ import chromadb
742
+
743
+ client = chromadb.HttpClient(host="localhost", port=8000)
744
+
745
+ # Create a new tenant and database
746
+ client.create_tenant(name="acme_corp")
747
+ client.create_database(name="production", tenant="acme_corp")
748
+
749
+ # Connect to specific tenant/database
750
+ tenant_client = chromadb.HttpClient(
751
+ host="localhost",
752
+ port=8000,
753
+ tenant="acme_corp",
754
+ database="production"
755
+ )
756
+ ```
757
+
758
+ ---
759
+
760
+ ## Complete Example: Document Search System
761
+
762
+ ```python
763
+ import chromadb
764
+
765
+ def main():
766
+ # Initialize client
767
+ client = chromadb.PersistentClient(path="./search_db")
768
+
769
+ # Create or get collection
770
+ collection = client.get_or_create_collection(
771
+ name="knowledge_base",
772
+ metadata={"hnsw:space": "cosine"}
773
+ )
774
+
775
+ # Add documents
776
+ collection.add(
777
+ ids=["doc1", "doc2", "doc3", "doc4"],
778
+ documents=[
779
+ "The quick brown fox jumps over the lazy dog",
780
+ "Machine learning is a subset of artificial intelligence",
781
+ "Python is a popular programming language",
782
+ "ChromaDB is a vector database for AI applications"
783
+ ],
784
+ metadatas=[
785
+ {"category": "phrases", "language": "english"},
786
+ {"category": "ai", "language": "english"},
787
+ {"category": "programming", "language": "english"},
788
+ {"category": "database", "language": "english"}
789
+ ]
790
+ )
791
+
792
+ # Query the collection
793
+ results = collection.query(
794
+ query_texts=["What is AI?"],
795
+ n_results=2,
796
+ where={"category": "ai"}
797
+ )
798
+
799
+ print("Search Results:")
800
+ print(results["documents"][0])
801
+ print(results["metadatas"][0])
802
+ print(results["distances"][0])
803
+
804
+ # Get document count
805
+ count = collection.count()
806
+ print(f"Total documents: {count}")
807
+
808
+ # Update a document
809
+ collection.update(
810
+ ids=["doc2"],
811
+ documents=["Machine learning is a powerful subset of artificial intelligence"],
812
+ metadatas=[{"category": "ai", "language": "english", "updated": True}]
813
+ )
814
+
815
+ # Delete documents
816
+ collection.delete(ids=["doc4"])
817
+
818
+ if __name__ == "__main__":
819
+ main()
820
+ ```
821
+
822
+ ---
823
+
824
+ ## Complete Example: Semantic Search with OpenAI
825
+
826
+ ```python
827
+ import chromadb
828
+ from chromadb.utils import embedding_functions
829
+ import os
830
+ from dotenv import load_dotenv
831
+
832
+ load_dotenv()
833
+
834
+ def semantic_search():
835
+ client = chromadb.PersistentClient(path="./semantic_db")
836
+
837
+ openai_ef = embedding_functions.OpenAIEmbeddingFunction(
838
+ api_key=os.getenv("OPENAI_API_KEY"),
839
+ model_name="text-embedding-3-small"
840
+ )
841
+
842
+ collection = client.get_or_create_collection(
843
+ name="articles",
844
+ embedding_function=openai_ef,
845
+ metadata={"hnsw:space": "cosine"}
846
+ )
847
+
848
+ # Add articles
849
+ collection.add(
850
+ ids=["art1", "art2", "art3"],
851
+ documents=[
852
+ "Climate change is affecting global weather patterns",
853
+ "New breakthrough in quantum computing announced",
854
+ "The future of renewable energy looks promising"
855
+ ],
856
+ metadatas=[
857
+ {"topic": "environment", "date": "2024-01-15"},
858
+ {"topic": "technology", "date": "2024-01-16"},
859
+ {"topic": "energy", "date": "2024-01-17"}
860
+ ]
861
+ )
862
+
863
+ # Search for relevant articles
864
+ results = collection.query(
865
+ query_texts=["Tell me about environmental issues"],
866
+ n_results=3
867
+ )
868
+
869
+ for idx, doc in enumerate(results["documents"][0]):
870
+ print(f"Result {idx + 1}:")
871
+ print(f"Document: {doc}")
872
+ print(f"Metadata: {results['metadatas'][0][idx]}")
873
+ print(f"Distance: {results['distances'][0][idx]}")
874
+ print("---")
875
+
876
+ if __name__ == "__main__":
877
+ semantic_search()
878
+ ```
879
+
880
+ ---
881
+
882
+ ## Complete Example: RAG (Retrieval-Augmented Generation)
883
+
884
+ ```python
885
+ import chromadb
886
+ from openai import OpenAI
887
+ import os
888
+ from dotenv import load_dotenv
889
+
890
+ load_dotenv()
891
+
892
+ def rag_example():
893
+ # Initialize ChromaDB
894
+ chroma_client = chromadb.PersistentClient(path="./rag_db")
895
+ collection = chroma_client.get_or_create_collection(
896
+ name="company_docs",
897
+ metadata={"hnsw:space": "cosine"}
898
+ )
899
+
900
+ # Add company knowledge base
901
+ collection.add(
902
+ ids=["policy1", "policy2", "policy3"],
903
+ documents=[
904
+ "Our company offers 20 days of paid vacation per year",
905
+ "Remote work is available 3 days per week",
906
+ "Health insurance includes dental and vision coverage"
907
+ ],
908
+ metadatas=[
909
+ {"type": "policy", "category": "time-off"},
910
+ {"type": "policy", "category": "work-arrangement"},
911
+ {"type": "policy", "category": "benefits"}
912
+ ]
913
+ )
914
+
915
+ # User question
916
+ question = "How many vacation days do I get?"
917
+
918
+ # Retrieve relevant context
919
+ search_results = collection.query(
920
+ query_texts=[question],
921
+ n_results=2
922
+ )
923
+
924
+ context = "\n".join(search_results["documents"][0])
925
+
926
+ # Generate answer with OpenAI
927
+ openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
928
+
929
+ completion = openai_client.chat.completions.create(
930
+ model="gpt-4",
931
+ messages=[
932
+ {
933
+ "role": "system",
934
+ "content": "Answer the question based on the context provided."
935
+ },
936
+ {
937
+ "role": "user",
938
+ "content": f"Context:\n{context}\n\nQuestion: {question}"
939
+ }
940
+ ]
941
+ )
942
+
943
+ print("Answer:", completion.choices[0].message.content)
944
+
945
+ if __name__ == "__main__":
946
+ rag_example()
947
+ ```
948
+
949
+ ---
950
+
951
+ ## Complete Example: Streaming Large Dataset
952
+
953
+ ```python
954
+ import chromadb
955
+ from typing import List, Generator
956
+
957
+ def document_generator(file_path: str) -> Generator[str, None, None]:
958
+ """Generator to stream documents from a file."""
959
+ with open(file_path, 'r') as f:
960
+ for line in f:
961
+ yield line.strip()
962
+
963
+ def stream_add_documents(collection, doc_gen: Generator, batch_size: int = 1000):
964
+ """Add documents in batches from a generator."""
965
+ batch_ids = []
966
+ batch_docs = []
967
+ counter = 0
968
+
969
+ for doc in doc_gen:
970
+ batch_ids.append(f"doc{counter}")
971
+ batch_docs.append(doc)
972
+ counter += 1
973
+
974
+ if len(batch_docs) >= batch_size:
975
+ collection.add(
976
+ ids=batch_ids,
977
+ documents=batch_docs
978
+ )
979
+ print(f"Added {counter} documents...")
980
+ batch_ids = []
981
+ batch_docs = []
982
+
983
+ # Add remaining documents
984
+ if batch_docs:
985
+ collection.add(
986
+ ids=batch_ids,
987
+ documents=batch_docs
988
+ )
989
+ print(f"Added final {len(batch_docs)} documents. Total: {counter}")
990
+
991
+ def main():
992
+ client = chromadb.PersistentClient(path="./large_db")
993
+ collection = client.get_or_create_collection(name="large_collection")
994
+
995
+ # Stream and add documents
996
+ doc_gen = document_generator("large_dataset.txt")
997
+ stream_add_documents(collection, doc_gen, batch_size=5000)
998
+
999
+ print(f"Total documents in collection: {collection.count()}")
1000
+
1001
+ if __name__ == "__main__":
1002
+ main()
1003
+ ```
1004
+
1005
+ ---
1006
+
1007
+ ## Environment Variables
1008
+
1009
+ ### .env File Setup
1010
+
1011
+ ```bash
1012
+ # ChromaDB Server
1013
+ CHROMA_HOST=localhost
1014
+ CHROMA_PORT=8000
1015
+
1016
+ # Authentication (if required)
1017
+ CHROMA_AUTH_TOKEN=your-auth-token
1018
+
1019
+ # Embedding API Keys
1020
+ OPENAI_API_KEY=sk-...
1021
+ COHERE_API_KEY=...
1022
+ HF_API_KEY=...
1023
+ ```
1024
+
1025
+ ### Using Environment Variables
1026
+
1027
+ ```python
1028
+ import chromadb
1029
+ import os
1030
+ from dotenv import load_dotenv
1031
+
1032
+ load_dotenv()
1033
+
1034
+ client = chromadb.HttpClient(
1035
+ host=os.getenv("CHROMA_HOST", "localhost"),
1036
+ port=int(os.getenv("CHROMA_PORT", "8000")),
1037
+ headers={"Authorization": f"Bearer {os.getenv('CHROMA_AUTH_TOKEN')}"}
1038
+ if os.getenv("CHROMA_AUTH_TOKEN") else None
1039
+ )
1040
+ ```
1041
+
1042
+ ---
1043
+
1044
+ ## Type Hints
1045
+
1046
+ ### Query Results Type
1047
+
1048
+ ```python
1049
+ from typing import TypedDict, List, Optional
1050
+
1051
+ class QueryResult(TypedDict):
1052
+ ids: List[List[str]]
1053
+ embeddings: Optional[List[List[List[float]]]]
1054
+ documents: List[List[Optional[str]]]
1055
+ metadatas: List[List[Optional[dict]]]
1056
+ distances: Optional[List[List[float]]]
1057
+ ```
1058
+
1059
+ ### Get Results Type
1060
+
1061
+ ```python
1062
+ from typing import TypedDict, List, Optional
1063
+
1064
+ class GetResult(TypedDict):
1065
+ ids: List[str]
1066
+ embeddings: Optional[List[List[float]]]
1067
+ documents: List[Optional[str]]
1068
+ metadatas: List[Optional[dict]]
1069
+ ```
1070
+
1071
+ ### Type-Safe Collection Operations
1072
+
1073
+ ```python
1074
+ from typing import Any, Dict, List, Optional
1075
+
1076
+ def add_typed_documents(
1077
+ collection,
1078
+ ids: List[str],
1079
+ documents: List[str],
1080
+ metadatas: Optional[List[Dict[str, Any]]] = None
1081
+ ) -> None:
1082
+ collection.add(
1083
+ ids=ids,
1084
+ documents=documents,
1085
+ metadatas=metadatas
1086
+ )
1087
+ ```
1088
+
1089
+ ---
1090
+
1091
+ ## Error Handling
1092
+
1093
+ ### Handle Collection Not Found
1094
+
1095
+ ```python
1096
+ import chromadb
1097
+
1098
+ client = chromadb.PersistentClient()
1099
+
1100
+ try:
1101
+ collection = client.get_collection(name="nonexistent_collection")
1102
+ except ValueError as e:
1103
+ if "does not exist" in str(e):
1104
+ print("Collection not found, creating...")
1105
+ collection = client.create_collection(name="nonexistent_collection")
1106
+ else:
1107
+ raise
1108
+ ```
1109
+
1110
+ ### Handle Duplicate IDs
1111
+
1112
+ ```python
1113
+ try:
1114
+ collection.add(
1115
+ ids=["id1"],
1116
+ documents=["Document"]
1117
+ )
1118
+
1119
+ # add() silently ignores an ID that already exists: no exception, no overwrite
1120
+ collection.add(
1121
+ ids=["id1"],
1122
+ documents=["Another document"]
1123
+ )
1124
+ except Exception as e:
1125
+ print(f"Add failed. Use update() or upsert() to change existing records. Error: {e}")
1126
+
1127
+ # Use upsert to add or update
1128
+ collection.upsert(
1129
+ ids=["id1"],
1130
+ documents=["Another document"]
1131
+ )
1132
+ ```
1133
+
1134
+ ### Handle Connection Errors
1135
+
1136
+ ```python
1137
+ import chromadb
1138
+ from requests.exceptions import ConnectionError
1139
+
1140
+ try:
1141
+ client = chromadb.HttpClient(host="localhost", port=8000)
1142
+ collections = client.list_collections()
1143
+ except ConnectionError:
1144
+ print("Cannot connect to ChromaDB server. Make sure it's running.")
1145
+ print("Start server with: chroma run --path ./data")
1146
+ except Exception as e:
1147
+ print(f"Error: {e}")
1148
+ ```
1149
+
1150
+ ### Handle Empty Results
1151
+
1152
+ ```python
1153
+ results = collection.query(
1154
+ query_texts=["Very specific query"],
1155
+ n_results=5
1156
+ )
1157
+
1158
+ if not results["ids"][0]:
1159
+ print("No results found. Try a broader query.")
1160
+ else:
1161
+ print(f"Found {len(results['ids'][0])} results")
1162
+ ```
1163
+
1164
+ ---
1165
+
1166
+ ## Performance Optimization
1167
+
1168
+ ### Batch Operations
1169
+
1170
+ ```python
1171
+ # Batch add for large datasets
1172
+ chunk_size = 5000
1173
+ for i in range(0, len(documents), chunk_size):
1174
+ collection.add(
1175
+ ids=ids[i:i + chunk_size],
1176
+ documents=documents[i:i + chunk_size],
1177
+ metadatas=metadatas[i:i + chunk_size]
1178
+ )
1179
+ ```
1180
+
1181
+ ### Parallel Queries
1182
+
1183
+ ```python
1184
+ import concurrent.futures
1185
+
1186
+ queries = [
1187
+ "Query about topic A",
1188
+ "Query about topic B",
1189
+ "Query about topic C"
1190
+ ]
1191
+
1192
+ def run_query(query):
1193
+ return collection.query(
1194
+ query_texts=[query],
1195
+ n_results=5
1196
+ )
1197
+
1198
+ with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
1199
+ results = list(executor.map(run_query, queries))
1200
+ ```
1201
+
1202
+ ### Limit Included Fields
1203
+
1204
+ ```python
1205
+ # Exclude embeddings for better performance
1206
+ results = collection.query(
1207
+ query_texts=["Search query"],
1208
+ n_results=10,
1209
+ include=["documents", "metadatas", "distances"]
1210
+ # Don't include embeddings unless needed
1211
+ )
1212
+ ```
1213
+
1214
+ ### Reuse Embedding Function
1215
+
1216
+ ```python
1217
+ from chromadb.utils import embedding_functions
1218
+
1219
+ # Create once, reuse for multiple collections
1220
+ openai_ef = embedding_functions.OpenAIEmbeddingFunction(
1221
+ api_key="your-api-key",
1222
+ model_name="text-embedding-3-small"
1223
+ )
1224
+
1225
+ collection1 = client.create_collection(
1226
+ name="collection1",
1227
+ embedding_function=openai_ef
1228
+ )
1229
+
1230
+ collection2 = client.create_collection(
1231
+ name="collection2",
1232
+ embedding_function=openai_ef
1233
+ )
1234
+ ```
1235
+
1236
+ ---
1237
+
1238
+ ## Common Patterns
1239
+
1240
+ ### Incremental Updates
1241
+
1242
+ ```python
1243
+ def add_daily_documents(collection, new_docs: List[str]):
1244
+ """Add new documents daily."""
1245
+ count = collection.count()
1246
+ new_ids = [f"doc{count + i}" for i in range(len(new_docs))]
1247
+
1248
+ from datetime import datetime
1249
+ collection.add(
1250
+ ids=new_ids,
1251
+ documents=new_docs,
1252
+ metadatas=[{"added_date": datetime.now().isoformat()} for _ in new_docs]
1253
+ )
1254
+ ```
1255
+
1256
+ ### Search with Fallback
1257
+
1258
+ ```python
1259
+ def search_with_fallback(collection, query: str):
1260
+ """Try specific category first, fallback to all documents."""
1261
+ # Try specific category first
1262
+ results = collection.query(
1263
+ query_texts=[query],
1264
+ n_results=5,
1265
+ where={"category": "premium"}
1266
+ )
1267
+
1268
+ # If no results, search all documents
1269
+ if not results["ids"][0]:
1270
+ results = collection.query(
1271
+ query_texts=[query],
1272
+ n_results=5
1273
+ )
1274
+
1275
+ return results
1276
+ ```
1277
+
1278
+ ### Deduplicate Documents
1279
+
1280
+ ```python
1281
+ def add_unique(collection, doc_id: str, document: str, metadata: dict):
1282
+ """Add document only if ID doesn't exist, otherwise update."""
1283
+ try:
1284
+ existing = collection.get(ids=[doc_id])
1285
+ if existing["ids"]:
1286
+ print("Document already exists, updating...")
1287
+ collection.update(
1288
+ ids=[doc_id],
1289
+ documents=[document],
1290
+ metadatas=[metadata]
1291
+ )
1292
+ else:
1293
+ collection.add(
1294
+ ids=[doc_id],
1295
+ documents=[document],
1296
+ metadatas=[metadata]
1297
+ )
1298
+ except Exception:
1299
+ collection.add(
1300
+ ids=[doc_id],
1301
+ documents=[document],
1302
+ metadatas=[metadata]
1303
+ )
1304
+ ```
1305
+
1306
+ ### Pagination
1307
+
1308
+ ```python
1309
+ def paginate_collection(collection, page_size: int = 100):
1310
+ """Iterate through all documents in pages."""
1311
+ total = collection.count()
1312
+
1313
+ for offset in range(0, total, page_size):
1314
+ results = collection.get(
1315
+ limit=page_size,
1316
+ offset=offset
1317
+ )
1318
+ yield results
1319
+
1320
+ # Usage
1321
+ for page in paginate_collection(collection, page_size=100):
1322
+ print(f"Processing {len(page['ids'])} documents...")
1323
+ # Process documents
1324
+ ```
1325
+
1326
+ ---
1327
+
1328
+ ## Metadata Filter Operators
1329
+
1330
+ ### Comparison Operators
1331
+
1332
+ ```python
1333
+ # Equal
1334
+ where = {"category": "tech"}
1335
+ where = {"category": {"$eq": "tech"}}
1336
+
1337
+ # Not equal
1338
+ where = {"category": {"$ne": "tech"}}
1339
+
1340
+ # Greater than
1341
+ where = {"price": {"$gt": 100}}
1342
+
1343
+ # Greater than or equal
1344
+ where = {"price": {"$gte": 100}}
1345
+
1346
+ # Less than
1347
+ where = {"price": {"$lt": 100}}
1348
+
1349
+ # Less than or equal
1350
+ where = {"price": {"$lte": 100}}
1351
+ ```
1352
+
1353
+ ### Logical Operators
1354
+
1355
+ ```python
1356
+ # AND
1357
+ where = {
1358
+ "$and": [
1359
+ {"category": "tech"},
1360
+ {"price": {"$lt": 1000}}
1361
+ ]
1362
+ }
1363
+
1364
+ # OR
1365
+ where = {
1366
+ "$or": [
1367
+ {"category": "tech"},
1368
+ {"category": "science"}
1369
+ ]
1370
+ }
1371
+
1372
+ # NOT (there is no "$not" operator; negate a field with "$ne" or "$nin")
1373
+ where = {
1374
+ "category": {"$ne": "archived"}
1375
+ }
1376
+ ```
1377
+
1378
+ ### Set Operators
1379
+
1380
+ ```python
1381
+ # In array
1382
+ where = {
1383
+ "category": {"$in": ["tech", "science", "health"]}
1384
+ }
1385
+
1386
+ # Not in array
1387
+ where = {
1388
+ "category": {"$nin": ["archived", "deleted"]}
1389
+ }
1390
+ ```
1391
+
1392
+ ---
1393
+
1394
+ ## Document Filter Operators
1395
+
1396
+ ### Contains
1397
+
1398
+ ```python
1399
+ where_document = {"$contains": "machine learning"}
1400
+ ```
1401
+
1402
+ ### Not Contains
1403
+
1404
+ ```python
1405
+ where_document = {"$not_contains": "deprecated"}
1406
+ ```
1407
+
1408
+ ### Combined with Metadata
1409
+
1410
+ ```python
1411
+ results = collection.query(
1412
+ query_texts=["AI research"],
1413
+ where={"category": "research"},
1414
+ where_document={"$contains": "neural network"},
1415
+ n_results=10
1416
+ )
1417
+ ```
1418
+
1419
+ ---
1420
+
1421
+ ## Working with Different Data Types
1422
+
1423
+ ### Numeric Metadata
1424
+
1425
+ ```python
1426
+ collection.add(
1427
+ ids=["p1", "p2", "p3"],
1428
+ documents=["Product A", "Product B", "Product C"],
1429
+ metadatas=[
1430
+ {"price": 29.99, "stock": 100},
1431
+ {"price": 49.99, "stock": 50},
1432
+ {"price": 19.99, "stock": 200}
1433
+ ]
1434
+ )
1435
+
1436
+ # Query by price range
1437
+ results = collection.query(
1438
+ query_texts=["affordable products"],
1439
+ where={
1440
+ "$and": [
1441
+ {"price": {"$lt": 50}},
1442
+ {"stock": {"$gt": 75}}
1443
+ ]
1444
+ },
1445
+ n_results=5
1446
+ )
1447
+ ```
1448
+
1449
+ ### Boolean Metadata
1450
+
1451
+ ```python
1452
+ collection.add(
1453
+ ids=["u1", "u2", "u3"],
1454
+ documents=["User profile A", "User profile B", "User profile C"],
1455
+ metadatas=[
1456
+ {"active": True, "premium": False},
1457
+ {"active": True, "premium": True},
1458
+ {"active": False, "premium": False}
1459
+ ]
1460
+ )
1461
+
1462
+ # Query active premium users
1463
+ results = collection.query(
1464
+ query_texts=["find users"],
1465
+ where={
1466
+ "$and": [
1467
+ {"active": True},
1468
+ {"premium": True}
1469
+ ]
1470
+ },
1471
+ n_results=10
1472
+ )
1473
+ ```
1474
+
1475
+ ### List Metadata
1476
+
1477
+ ```python
1478
+ collection.add(
1479
+ ids=["a1", "a2"],
1480
+ documents=["Article about AI and ML", "Article about databases"],
1481
+ metadatas=[
1482
+ {"tags": "ai,machine-learning,neural-networks"},
1483
+ {"tags": "database,vector-search"}
1484
+ ]
1485
+ )
1486
+
1487
+ # Note: ChromaDB metadata values should be strings, numbers, or booleans,
1488
+ # so list-like metadata is stored as a delimited or JSON string (or split into multiple fields)
1489
+ ```
1490
+
1491
+ ---
1492
+
1493
+ ## Context Manager Pattern
1494
+
1495
+ ```python
1496
+ import chromadb
1497
+ from contextlib import contextmanager
1498
+
1499
+ @contextmanager
1500
+ def get_collection(collection_name: str, path: str = "./chroma_db"):
1501
+ """Context manager for ChromaDB collection."""
1502
+ client = chromadb.PersistentClient(path=path)
1503
+ collection = client.get_or_create_collection(name=collection_name)
1504
+ try:
1505
+ yield collection
1506
+ finally:
1507
+ # Cleanup if needed
1508
+ pass
1509
+
1510
+ # Usage
1511
+ with get_collection("my_collection") as collection:
1512
+ collection.add(
1513
+ ids=["id1"],
1514
+ documents=["Document"]
1515
+ )
1516
+ results = collection.query(
1517
+ query_texts=["Query"],
1518
+ n_results=5
1519
+ )
1520
+ print(results)
1521
+ ```
1522
+
1523
+ ---
1524
+
1525
+ ## Testing with ChromaDB
1526
+
1527
+ ### Using EphemeralClient for Tests
1528
+
1529
+ ```python
1530
+ import unittest
1531
+ import chromadb
1532
+
1533
+ class TestDocumentSearch(unittest.TestCase):
1534
+ def setUp(self):
1535
+ """Create ephemeral client for each test."""
1536
+ self.client = chromadb.EphemeralClient()
1537
+ self.collection = self.client.create_collection(name="test_collection")
1538
+
1539
+ def test_add_and_query(self):
1540
+ """Test adding documents and querying."""
1541
+ self.collection.add(
1542
+ ids=["test1", "test2"],
1543
+ documents=["Document about cats", "Document about dogs"]
1544
+ )
1545
+
1546
+ results = self.collection.query(
1547
+ query_texts=["pets"],
1548
+ n_results=2
1549
+ )
1550
+
1551
+ self.assertEqual(len(results["ids"][0]), 2)
1552
+
1553
+ def test_metadata_filtering(self):
1554
+ """Test metadata filtering."""
1555
+ self.collection.add(
1556
+ ids=["test1", "test2"],
1557
+ documents=["Doc 1", "Doc 2"],
1558
+ metadatas=[{"category": "A"}, {"category": "B"}]
1559
+ )
1560
+
1561
+ results = self.collection.query(
1562
+ query_texts=["search"],
1563
+ where={"category": "A"},
1564
+ n_results=5
1565
+ )
1566
+
1567
+ self.assertEqual(len(results["ids"][0]), 1)
1568
+
1569
+ if __name__ == "__main__":
1570
+ unittest.main()
1571
+ ```
1572
+
1573
+ ---
1574
+
1575
+ ## Monitoring and Debugging
1576
+
1577
+ ### Enable Logging
1578
+
1579
+ ```python
1580
+ import logging
1581
+ import chromadb
1582
+
1583
+ # Enable ChromaDB logging
1584
+ logging.basicConfig(level=logging.DEBUG)
1585
+ logger = logging.getLogger("chromadb")
1586
+ logger.setLevel(logging.DEBUG)
1587
+
1588
+ client = chromadb.PersistentClient(path="./debug_db")
1589
+ ```
1590
+
1591
+ ### Inspect Collection Details
1592
+
1593
+ ```python
1594
+ collection = client.get_collection(name="my_collection")
1595
+
1596
+ print(f"Collection name: {collection.name}")
1597
+ print(f"Collection metadata: {collection.metadata}")
1598
+ print(f"Document count: {collection.count()}")
1599
+
1600
+ # Peek at first documents
1601
+ first_docs = collection.peek(limit=3)
1602
+ print(f"First documents: {first_docs}")
1603
+ ```
1604
+
1605
+ ### Performance Profiling
1606
+
1607
+ ```python
1608
+ import time
1609
+
1610
+ def profile_query(collection, query_text: str, n_results: int = 10):
1611
+ """Profile query performance."""
1612
+ start = time.time()
1613
+ results = collection.query(
1614
+ query_texts=[query_text],
1615
+ n_results=n_results
1616
+ )
1617
+ elapsed = time.time() - start
1618
+
1619
+ print(f"Query: {query_text}")
1620
+ print(f"Time: {elapsed:.3f}s")
1621
+ print(f"Results: {len(results['ids'][0])}")
1622
+ return results
1623
+
1624
+ # Usage
1625
+ profile_query(collection, "machine learning", n_results=100)
1626
+ ```
1627
+
1628
+ ---
1629
+
1630
+ ## Migration and Backup
1631
+
1632
+ ### Export Collection to JSON
1633
+
1634
+ ```python
1635
+ import json
1636
+
1637
+ def export_collection(collection, output_file: str):
1638
+ """Export entire collection to JSON."""
1639
+ all_data = collection.get()
1640
+
1641
+ export_data = {
1642
+ "ids": all_data["ids"],
1643
+ "documents": all_data["documents"],
1644
+ "metadatas": all_data["metadatas"],
1645
+ "embeddings": all_data.get("embeddings")
1646
+ }
1647
+
1648
+ with open(output_file, 'w') as f:
1649
+ json.dump(export_data, f)
1650
+
1651
+ print(f"Exported {len(all_data['ids'])} documents to {output_file}")
1652
+
1653
+ # Usage
1654
+ export_collection(collection, "backup.json")
1655
+ ```
1656
+
1657
+ ### Import Collection from JSON
1658
+
1659
+ ```python
1660
+ import json
1661
+
1662
+ def import_collection(collection, input_file: str):
1663
+ """Import collection from JSON."""
1664
+ with open(input_file, 'r') as f:
1665
+ import_data = json.load(f)
1666
+
1667
+ collection.add(
1668
+ ids=import_data["ids"],
1669
+ documents=import_data["documents"],
1670
+ metadatas=import_data["metadatas"],
1671
+ embeddings=import_data.get("embeddings")
1672
+ )
1673
+
1674
+ print(f"Imported {len(import_data['ids'])} documents from {input_file}")
1675
+
1676
+ # Usage
1677
+ import_collection(collection, "backup.json")
1678
+ ```
1679
+
1680
+ ### Copy Collection
1681
+
1682
+ ```python
1683
+ def copy_collection(source_collection, dest_collection, batch_size: int = 1000):
1684
+ """Copy all data from source to destination collection."""
1685
+ total = source_collection.count()
1686
+
1687
+ for offset in range(0, total, batch_size):
1688
+ data = source_collection.get(
1689
+ limit=batch_size,
1690
+ offset=offset,
1691
+ include=["documents", "metadatas", "embeddings"]
1692
+ )
1693
+
1694
+ dest_collection.add(
1695
+ ids=data["ids"],
1696
+ documents=data["documents"],
1697
+ metadatas=data["metadatas"],
1698
+ embeddings=data.get("embeddings")
1699
+ )
1700
+
1701
+ print(f"Copied {offset + len(data['ids'])}/{total} documents")
1702
+
1703
+ # Usage
1704
+ source = client.get_collection(name="original")
1705
+ dest = client.create_collection(name="backup")
1706
+ copy_collection(source, dest)
1707
+ ```