chub-dev 0.1.0 → 0.1.2-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +55 -0
- package/bin/chub-mcp +2 -0
- package/dist/airtable/docs/database/javascript/DOC.md +1437 -0
- package/dist/airtable/docs/database/python/DOC.md +1735 -0
- package/dist/amplitude/docs/analytics/javascript/DOC.md +1282 -0
- package/dist/amplitude/docs/analytics/python/DOC.md +1199 -0
- package/dist/anthropic/docs/claude-api/javascript/DOC.md +503 -0
- package/dist/anthropic/docs/claude-api/python/DOC.md +389 -0
- package/dist/asana/docs/tasks/DOC.md +1396 -0
- package/dist/assemblyai/docs/transcription/DOC.md +1043 -0
- package/dist/atlassian/docs/confluence/javascript/DOC.md +1347 -0
- package/dist/atlassian/docs/confluence/python/DOC.md +1604 -0
- package/dist/auth0/docs/identity/javascript/DOC.md +968 -0
- package/dist/auth0/docs/identity/python/DOC.md +1199 -0
- package/dist/aws/docs/s3/javascript/DOC.md +1773 -0
- package/dist/aws/docs/s3/python/DOC.md +1807 -0
- package/dist/binance/docs/trading/javascript/DOC.md +1315 -0
- package/dist/binance/docs/trading/python/DOC.md +1454 -0
- package/dist/braintree/docs/gateway/javascript/DOC.md +1278 -0
- package/dist/braintree/docs/gateway/python/DOC.md +1179 -0
- package/dist/chromadb/docs/embeddings-db/javascript/DOC.md +1263 -0
- package/dist/chromadb/docs/embeddings-db/python/DOC.md +1707 -0
- package/dist/clerk/docs/auth/javascript/DOC.md +1220 -0
- package/dist/clerk/docs/auth/python/DOC.md +274 -0
- package/dist/cloudflare/docs/workers/javascript/DOC.md +918 -0
- package/dist/cloudflare/docs/workers/python/DOC.md +994 -0
- package/dist/cockroachdb/docs/distributed-db/DOC.md +1500 -0
- package/dist/cohere/docs/llm/DOC.md +1335 -0
- package/dist/datadog/docs/monitoring/javascript/DOC.md +1740 -0
- package/dist/datadog/docs/monitoring/python/DOC.md +1815 -0
- package/dist/deepgram/docs/speech/javascript/DOC.md +885 -0
- package/dist/deepgram/docs/speech/python/DOC.md +685 -0
- package/dist/deepl/docs/translation/javascript/DOC.md +887 -0
- package/dist/deepl/docs/translation/python/DOC.md +944 -0
- package/dist/deepseek/docs/llm/DOC.md +1220 -0
- package/dist/directus/docs/headless-cms/javascript/DOC.md +1128 -0
- package/dist/directus/docs/headless-cms/python/DOC.md +1276 -0
- package/dist/discord/docs/bot/javascript/DOC.md +1090 -0
- package/dist/discord/docs/bot/python/DOC.md +1130 -0
- package/dist/elasticsearch/docs/search/DOC.md +1634 -0
- package/dist/elevenlabs/docs/text-to-speech/javascript/DOC.md +336 -0
- package/dist/elevenlabs/docs/text-to-speech/python/DOC.md +552 -0
- package/dist/firebase/docs/auth/DOC.md +1015 -0
- package/dist/gemini/docs/genai/javascript/DOC.md +691 -0
- package/dist/gemini/docs/genai/python/DOC.md +555 -0
- package/dist/github/docs/octokit/DOC.md +1560 -0
- package/dist/google/docs/bigquery/javascript/DOC.md +1688 -0
- package/dist/google/docs/bigquery/python/DOC.md +1503 -0
- package/dist/hubspot/docs/crm/javascript/DOC.md +1805 -0
- package/dist/hubspot/docs/crm/python/DOC.md +2033 -0
- package/dist/huggingface/docs/transformers/DOC.md +948 -0
- package/dist/intercom/docs/messaging/javascript/DOC.md +1844 -0
- package/dist/intercom/docs/messaging/python/DOC.md +1797 -0
- package/dist/jira/docs/issues/javascript/DOC.md +1420 -0
- package/dist/jira/docs/issues/python/DOC.md +1492 -0
- package/dist/kafka/docs/streaming/javascript/DOC.md +1671 -0
- package/dist/kafka/docs/streaming/python/DOC.md +1464 -0
- package/dist/landingai-ade/docs/api/DOC.md +620 -0
- package/dist/landingai-ade/docs/sdk/python/DOC.md +489 -0
- package/dist/landingai-ade/docs/sdk/typescript/DOC.md +542 -0
- package/dist/landingai-ade/skills/SKILL.md +489 -0
- package/dist/launchdarkly/docs/feature-flags/javascript/DOC.md +1191 -0
- package/dist/launchdarkly/docs/feature-flags/python/DOC.md +1671 -0
- package/dist/linear/docs/tracker/DOC.md +1554 -0
- package/dist/livekit/docs/realtime/javascript/DOC.md +303 -0
- package/dist/livekit/docs/realtime/python/DOC.md +163 -0
- package/dist/mailchimp/docs/marketing/DOC.md +1420 -0
- package/dist/meilisearch/docs/search/DOC.md +1241 -0
- package/dist/microsoft/docs/onedrive/javascript/DOC.md +1421 -0
- package/dist/microsoft/docs/onedrive/python/DOC.md +1549 -0
- package/dist/mongodb/docs/atlas/DOC.md +2041 -0
- package/dist/notion/docs/workspace-api/javascript/DOC.md +1435 -0
- package/dist/notion/docs/workspace-api/python/DOC.md +1400 -0
- package/dist/okta/docs/identity/javascript/DOC.md +1171 -0
- package/dist/okta/docs/identity/python/DOC.md +1401 -0
- package/dist/openai/docs/chat/javascript/DOC.md +407 -0
- package/dist/openai/docs/chat/python/DOC.md +568 -0
- package/dist/paypal/docs/checkout/DOC.md +278 -0
- package/dist/pinecone/docs/sdk/javascript/DOC.md +984 -0
- package/dist/pinecone/docs/sdk/python/DOC.md +1395 -0
- package/dist/plaid/docs/banking/javascript/DOC.md +1163 -0
- package/dist/plaid/docs/banking/python/DOC.md +1203 -0
- package/dist/playwright-community/skills/login-flows/SKILL.md +108 -0
- package/dist/postmark/docs/transactional-email/DOC.md +1168 -0
- package/dist/prisma/docs/orm/javascript/DOC.md +1419 -0
- package/dist/prisma/docs/orm/python/DOC.md +1317 -0
- package/dist/qdrant/docs/vector-search/javascript/DOC.md +1221 -0
- package/dist/qdrant/docs/vector-search/python/DOC.md +1653 -0
- package/dist/rabbitmq/docs/message-queue/javascript/DOC.md +1193 -0
- package/dist/rabbitmq/docs/message-queue/python/DOC.md +1243 -0
- package/dist/razorpay/docs/payments/javascript/DOC.md +1219 -0
- package/dist/razorpay/docs/payments/python/DOC.md +1330 -0
- package/dist/redis/docs/key-value/javascript/DOC.md +1851 -0
- package/dist/redis/docs/key-value/python/DOC.md +2054 -0
- package/dist/registry.json +2817 -0
- package/dist/replicate/docs/model-hosting/DOC.md +1318 -0
- package/dist/resend/docs/email/DOC.md +1271 -0
- package/dist/salesforce/docs/crm/javascript/DOC.md +1241 -0
- package/dist/salesforce/docs/crm/python/DOC.md +1183 -0
- package/dist/search-index.json +1 -0
- package/dist/sendgrid/docs/email-api/javascript/DOC.md +371 -0
- package/dist/sendgrid/docs/email-api/python/DOC.md +656 -0
- package/dist/sentry/docs/error-tracking/javascript/DOC.md +1073 -0
- package/dist/sentry/docs/error-tracking/python/DOC.md +1309 -0
- package/dist/shopify/docs/storefront/DOC.md +457 -0
- package/dist/slack/docs/workspace/javascript/DOC.md +933 -0
- package/dist/slack/docs/workspace/python/DOC.md +271 -0
- package/dist/square/docs/payments/javascript/DOC.md +1855 -0
- package/dist/square/docs/payments/python/DOC.md +1728 -0
- package/dist/stripe/docs/api/DOC.md +1727 -0
- package/dist/stripe/docs/payments/DOC.md +1726 -0
- package/dist/stytch/docs/auth/javascript/DOC.md +1813 -0
- package/dist/stytch/docs/auth/python/DOC.md +1962 -0
- package/dist/supabase/docs/client/DOC.md +1606 -0
- package/dist/twilio/docs/messaging/python/DOC.md +469 -0
- package/dist/twilio/docs/messaging/typescript/DOC.md +946 -0
- package/dist/vercel/docs/platform/DOC.md +1940 -0
- package/dist/weaviate/docs/vector-db/javascript/DOC.md +1268 -0
- package/dist/weaviate/docs/vector-db/python/DOC.md +1388 -0
- package/dist/zendesk/docs/support/javascript/DOC.md +2150 -0
- package/dist/zendesk/docs/support/python/DOC.md +2297 -0
- package/package.json +22 -6
- package/skills/get-api-docs/SKILL.md +84 -0
- package/src/commands/annotate.js +83 -0
- package/src/commands/build.js +12 -1
- package/src/commands/feedback.js +150 -0
- package/src/commands/get.js +83 -42
- package/src/commands/search.js +7 -0
- package/src/index.js +43 -17
- package/src/lib/analytics.js +90 -0
- package/src/lib/annotations.js +57 -0
- package/src/lib/bm25.js +170 -0
- package/src/lib/cache.js +69 -6
- package/src/lib/config.js +8 -3
- package/src/lib/identity.js +99 -0
- package/src/lib/registry.js +103 -20
- package/src/lib/telemetry.js +86 -0
- package/src/mcp/server.js +177 -0
- package/src/mcp/tools.js +251 -0
|
@@ -0,0 +1,1707 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: embeddings-db
|
|
3
|
+
description: "ChromaDB Python SDK for vector embeddings and AI-powered search"
|
|
4
|
+
metadata:
|
|
5
|
+
languages: "python"
|
|
6
|
+
versions: "1.2.1"
|
|
7
|
+
updated-on: "2026-03-02"
|
|
8
|
+
source: maintainer
|
|
9
|
+
tags: "chromadb,embeddings,vector-db,ai,search"
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
# ChromaDB Python SDK - v1.2.1
|
|
13
|
+
|
|
14
|
+
## Golden Rule
|
|
15
|
+
|
|
16
|
+
**ALWAYS use the official `chromadb` package (v1.2.1 or later) for Python projects.**
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install chromadb
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
**DO NOT use:**
|
|
23
|
+
- Deprecated packages like `chromadb-client`
|
|
24
|
+
- Old versions below 1.0
|
|
25
|
+
- Community wrappers that may be outdated
|
|
26
|
+
|
|
27
|
+
ChromaDB is the official AI-native open-source vector database. It handles embeddings, indexing, and vector similarity search automatically.
|
|
28
|
+
|
|
29
|
+
**Requires Python >= 3.9**
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
|
|
35
|
+
### Using pip
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install chromadb
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
### Using Poetry
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
poetry add chromadb
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### Using uv
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
uv pip install chromadb
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### Install with Specific Version
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
pip install chromadb==1.2.1
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## Initialization
|
|
62
|
+
|
|
63
|
+
### Ephemeral Client (In-Memory)
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
import chromadb
|
|
67
|
+
|
|
68
|
+
client = chromadb.EphemeralClient()
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Use for experimentation, testing, and prototyping. Data is lost when the process ends.
|
|
72
|
+
|
|
73
|
+
### Persistent Client (Local Storage)
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
import chromadb
|
|
77
|
+
|
|
78
|
+
client = chromadb.PersistentClient(path="./chroma_data")
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Stores data locally at the specified path. Creates the directory if it doesn't exist.
|
|
82
|
+
|
|
83
|
+
### Persistent Client with Default Path
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
import chromadb
|
|
87
|
+
|
|
88
|
+
client = chromadb.PersistentClient()
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
Defaults to `./chroma` in the current working directory.
|
|
92
|
+
|
|
93
|
+
### HTTP Client (Remote Server)
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
import chromadb
|
|
97
|
+
|
|
98
|
+
client = chromadb.HttpClient(host="localhost", port=8000)
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
Connects to a remote ChromaDB server.
|
|
102
|
+
|
|
103
|
+
### HTTP Client with Custom Configuration
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
import chromadb
|
|
107
|
+
from chromadb.config import Settings
|
|
108
|
+
|
|
109
|
+
client = chromadb.HttpClient(
|
|
110
|
+
host="localhost",
|
|
111
|
+
port=8000,
|
|
112
|
+
ssl=False,
|
|
113
|
+
headers={"Authorization": "Bearer token"},
|
|
114
|
+
settings=Settings(),
|
|
115
|
+
tenant="default_tenant",
|
|
116
|
+
database="default_database"
|
|
117
|
+
)
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
---
|
|
121
|
+
|
|
122
|
+
## Running ChromaDB Server
|
|
123
|
+
|
|
124
|
+
### Local Server
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
chroma run --path ./chroma_data
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
Starts server on `http://localhost:8000`
|
|
131
|
+
|
|
132
|
+
### Docker
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
docker pull chromadb/chroma
|
|
136
|
+
docker run -p 8000:8000 chromadb/chroma
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
---
|
|
140
|
+
|
|
141
|
+
## Collections
|
|
142
|
+
|
|
143
|
+
### Create a Collection
|
|
144
|
+
|
|
145
|
+
```python
|
|
146
|
+
collection = client.create_collection(name="my_collection")
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
### Create Collection with Distance Metric
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
collection = client.create_collection(
|
|
153
|
+
name="my_collection",
|
|
154
|
+
metadata={"hnsw:space": "cosine"}
|
|
155
|
+
)
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
**Distance Metrics:**
|
|
159
|
+
- `cosine`: Cosine similarity (best for text, normalized vectors)
|
|
160
|
+
- `l2`: Squared L2 (Euclidean) distance (default, sensitive to magnitude)
|
|
161
|
+
- `ip`: Inner product (for recommendation systems)
|
|
162
|
+
|
|
163
|
+
### Get an Existing Collection
|
|
164
|
+
|
|
165
|
+
```python
|
|
166
|
+
collection = client.get_collection(name="my_collection")
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
### Get or Create Collection
|
|
170
|
+
|
|
171
|
+
```python
|
|
172
|
+
collection = client.get_or_create_collection(name="my_collection")
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
### List All Collections
|
|
176
|
+
|
|
177
|
+
```python
|
|
178
|
+
collections = client.list_collections()
|
|
179
|
+
for col in collections:
|
|
180
|
+
print(col.name)
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
### Delete a Collection
|
|
184
|
+
|
|
185
|
+
```python
|
|
186
|
+
client.delete_collection(name="my_collection")
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
---
|
|
190
|
+
|
|
191
|
+
## Adding Data
|
|
192
|
+
|
|
193
|
+
### Add Documents (Auto-Embedding)
|
|
194
|
+
|
|
195
|
+
```python
|
|
196
|
+
collection.add(
|
|
197
|
+
ids=["id1", "id2", "id3"],
|
|
198
|
+
documents=[
|
|
199
|
+
"This is a document about pineapples",
|
|
200
|
+
"This is a document about oranges",
|
|
201
|
+
"This is a document about apples"
|
|
202
|
+
]
|
|
203
|
+
)
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
ChromaDB automatically embeds the documents using the default embedding function.
|
|
207
|
+
|
|
208
|
+
### Add with Metadata
|
|
209
|
+
|
|
210
|
+
```python
|
|
211
|
+
collection.add(
|
|
212
|
+
ids=["id1", "id2", "id3"],
|
|
213
|
+
documents=[
|
|
214
|
+
"This is a document about pineapples",
|
|
215
|
+
"This is a document about oranges",
|
|
216
|
+
"This is a document about apples"
|
|
217
|
+
],
|
|
218
|
+
metadatas=[
|
|
219
|
+
{"category": "tropical", "color": "yellow"},
|
|
220
|
+
{"category": "citrus", "color": "orange"},
|
|
221
|
+
{"category": "temperate", "color": "red"}
|
|
222
|
+
]
|
|
223
|
+
)
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
### Add with Custom Embeddings
|
|
227
|
+
|
|
228
|
+
```python
|
|
229
|
+
collection.add(
|
|
230
|
+
ids=["id1", "id2"],
|
|
231
|
+
embeddings=[
|
|
232
|
+
[1.5, 2.9, 3.4, 1.2, 0.8],
|
|
233
|
+
[9.8, 2.3, 2.9, 4.1, 3.3]
|
|
234
|
+
],
|
|
235
|
+
documents=["Document one", "Document two"],
|
|
236
|
+
metadatas=[
|
|
237
|
+
{"source": "manual"},
|
|
238
|
+
{"source": "manual"}
|
|
239
|
+
]
|
|
240
|
+
)
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
### Batch Adding (Large Datasets)
|
|
244
|
+
|
|
245
|
+
```python
|
|
246
|
+
batch_size = 5000
|
|
247
|
+
for i in range(0, len(documents), batch_size):
|
|
248
|
+
batch_docs = documents[i:i + batch_size]
|
|
249
|
+
batch_ids = [f"id{j}" for j in range(i, i + len(batch_docs))]
|
|
250
|
+
|
|
251
|
+
collection.add(
|
|
252
|
+
ids=batch_ids,
|
|
253
|
+
documents=batch_docs
|
|
254
|
+
)
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
ChromaDB enforces a maximum batch size per `add` call (query it with `client.get_max_batch_size()`); adding more documents than the limit in a single call raises an error, which is why large datasets should be inserted in batches as shown above.
|
|
258
|
+
|
|
259
|
+
---
|
|
260
|
+
|
|
261
|
+
## Querying Data
|
|
262
|
+
|
|
263
|
+
### Query with Text (Auto-Embedding)
|
|
264
|
+
|
|
265
|
+
```python
|
|
266
|
+
results = collection.query(
|
|
267
|
+
query_texts=["What fruits are tropical?"],
|
|
268
|
+
n_results=2
|
|
269
|
+
)
|
|
270
|
+
|
|
271
|
+
print(results)
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
**Response Structure:**
|
|
275
|
+
|
|
276
|
+
```python
|
|
277
|
+
{
|
|
278
|
+
'ids': [['id1', 'id2']],
|
|
279
|
+
'distances': [[0.1234, 0.5678]],
|
|
280
|
+
'documents': [['This is a document about pineapples', 'This is...']],
|
|
281
|
+
'metadatas': [[{'category': 'tropical', 'color': 'yellow'}, {...}]],
|
|
282
|
+
'embeddings': None # Not included by default
|
|
283
|
+
}
|
|
284
|
+
```
|
|
285
|
+
|
|
286
|
+
### Query with Multiple Texts
|
|
287
|
+
|
|
288
|
+
```python
|
|
289
|
+
results = collection.query(
|
|
290
|
+
query_texts=[
|
|
291
|
+
"What fruits are tropical?",
|
|
292
|
+
"What fruits are citrus?"
|
|
293
|
+
],
|
|
294
|
+
n_results=2
|
|
295
|
+
)
|
|
296
|
+
```
|
|
297
|
+
|
|
298
|
+
Returns `n_results` for each query text.
|
|
299
|
+
|
|
300
|
+
### Query with Custom Embeddings
|
|
301
|
+
|
|
302
|
+
```python
|
|
303
|
+
results = collection.query(
|
|
304
|
+
query_embeddings=[[1.5, 2.9, 3.4, 1.2, 0.8]],
|
|
305
|
+
n_results=3
|
|
306
|
+
)
|
|
307
|
+
```
|
|
308
|
+
|
|
309
|
+
### Query with Metadata Filters
|
|
310
|
+
|
|
311
|
+
```python
|
|
312
|
+
results = collection.query(
|
|
313
|
+
query_texts=["What fruits are available?"],
|
|
314
|
+
n_results=5,
|
|
315
|
+
where={"category": "tropical"}
|
|
316
|
+
)
|
|
317
|
+
```
|
|
318
|
+
|
|
319
|
+
### Complex Metadata Filtering
|
|
320
|
+
|
|
321
|
+
```python
|
|
322
|
+
# Using $or operator
|
|
323
|
+
results = collection.query(
|
|
324
|
+
query_texts=["Find fruits"],
|
|
325
|
+
n_results=5,
|
|
326
|
+
where={
|
|
327
|
+
"$or": [
|
|
328
|
+
{"category": "tropical"},
|
|
329
|
+
{"category": "citrus"}
|
|
330
|
+
]
|
|
331
|
+
}
|
|
332
|
+
)
|
|
333
|
+
|
|
334
|
+
# Using $and operator
|
|
335
|
+
results = collection.query(
|
|
336
|
+
query_texts=["Find fruits"],
|
|
337
|
+
n_results=5,
|
|
338
|
+
where={
|
|
339
|
+
"$and": [
|
|
340
|
+
{"category": "tropical"},
|
|
341
|
+
{"color": "yellow"}
|
|
342
|
+
]
|
|
343
|
+
}
|
|
344
|
+
)
|
|
345
|
+
|
|
346
|
+
# Using comparison operators
|
|
347
|
+
results = collection.query(
|
|
348
|
+
query_texts=["Find items"],
|
|
349
|
+
n_results=5,
|
|
350
|
+
where={
|
|
351
|
+
"price": {"$gt": 10} # $gt, $gte, $lt, $lte, $ne, $eq
|
|
352
|
+
}
|
|
353
|
+
)
|
|
354
|
+
```
|
|
355
|
+
|
|
356
|
+
### Query with Document Content Filters
|
|
357
|
+
|
|
358
|
+
```python
|
|
359
|
+
results = collection.query(
|
|
360
|
+
query_texts=["Find documents"],
|
|
361
|
+
n_results=5,
|
|
362
|
+
where_document={"$contains": "pineapple"}
|
|
363
|
+
)
|
|
364
|
+
|
|
365
|
+
# Using $not_contains
|
|
366
|
+
results = collection.query(
|
|
367
|
+
query_texts=["Find documents"],
|
|
368
|
+
n_results=5,
|
|
369
|
+
where_document={"$not_contains": "apple"}
|
|
370
|
+
)
|
|
371
|
+
```
|
|
372
|
+
|
|
373
|
+
### Query with Include Options
|
|
374
|
+
|
|
375
|
+
```python
|
|
376
|
+
results = collection.query(
|
|
377
|
+
query_texts=["What fruits are tropical?"],
|
|
378
|
+
n_results=2,
|
|
379
|
+
include=["documents", "metadatas", "distances", "embeddings"]
|
|
380
|
+
)
|
|
381
|
+
```
|
|
382
|
+
|
|
383
|
+
**Include Options:**
|
|
384
|
+
- `documents`: The document text (included by default)
|
|
385
|
+
- `metadatas`: Metadata for each document (included by default)
|
|
386
|
+
- `distances`: Distance/similarity scores (included by default)
|
|
387
|
+
- `embeddings`: Vector embeddings (not included by default for performance)
|
|
388
|
+
|
|
389
|
+
---
|
|
390
|
+
|
|
391
|
+
## Getting Data
|
|
392
|
+
|
|
393
|
+
### Get Documents by IDs
|
|
394
|
+
|
|
395
|
+
```python
|
|
396
|
+
results = collection.get(
|
|
397
|
+
ids=["id1", "id2"]
|
|
398
|
+
)
|
|
399
|
+
|
|
400
|
+
print(results)
|
|
401
|
+
```
|
|
402
|
+
|
|
403
|
+
### Get All Documents
|
|
404
|
+
|
|
405
|
+
```python
|
|
406
|
+
results = collection.get()
|
|
407
|
+
```
|
|
408
|
+
|
|
409
|
+
Returns all documents in the collection.
|
|
410
|
+
|
|
411
|
+
### Get with Metadata Filter
|
|
412
|
+
|
|
413
|
+
```python
|
|
414
|
+
results = collection.get(
|
|
415
|
+
where={"category": "tropical"}
|
|
416
|
+
)
|
|
417
|
+
```
|
|
418
|
+
|
|
419
|
+
### Get with Document Filter
|
|
420
|
+
|
|
421
|
+
```python
|
|
422
|
+
results = collection.get(
|
|
423
|
+
where_document={"$contains": "pineapple"}
|
|
424
|
+
)
|
|
425
|
+
```
|
|
426
|
+
|
|
427
|
+
### Get with Limit and Offset
|
|
428
|
+
|
|
429
|
+
```python
|
|
430
|
+
results = collection.get(
|
|
431
|
+
limit=10,
|
|
432
|
+
offset=20
|
|
433
|
+
)
|
|
434
|
+
```
|
|
435
|
+
|
|
436
|
+
### Get with Include Options
|
|
437
|
+
|
|
438
|
+
```python
|
|
439
|
+
results = collection.get(
|
|
440
|
+
ids=["id1", "id2"],
|
|
441
|
+
include=["documents", "metadatas", "embeddings"]
|
|
442
|
+
)
|
|
443
|
+
```
|
|
444
|
+
|
|
445
|
+
---
|
|
446
|
+
|
|
447
|
+
## Updating Data
|
|
448
|
+
|
|
449
|
+
### Update Documents
|
|
450
|
+
|
|
451
|
+
```python
|
|
452
|
+
collection.update(
|
|
453
|
+
ids=["id1", "id2"],
|
|
454
|
+
documents=[
|
|
455
|
+
"Updated document about pineapples",
|
|
456
|
+
"Updated document about oranges"
|
|
457
|
+
],
|
|
458
|
+
metadatas=[
|
|
459
|
+
{"category": "tropical", "color": "yellow", "updated": True},
|
|
460
|
+
{"category": "citrus", "color": "orange", "updated": True}
|
|
461
|
+
]
|
|
462
|
+
)
|
|
463
|
+
```
|
|
464
|
+
|
|
465
|
+
### Update with Custom Embeddings
|
|
466
|
+
|
|
467
|
+
```python
|
|
468
|
+
collection.update(
|
|
469
|
+
ids=["id1"],
|
|
470
|
+
embeddings=[[1.1, 2.2, 3.3, 4.4, 5.5]],
|
|
471
|
+
documents=["Updated document"],
|
|
472
|
+
metadatas=[{"source": "updated"}]
|
|
473
|
+
)
|
|
474
|
+
```
|
|
475
|
+
|
|
476
|
+
---
|
|
477
|
+
|
|
478
|
+
## Upsert (Add or Update)
|
|
479
|
+
|
|
480
|
+
### Upsert Documents
|
|
481
|
+
|
|
482
|
+
```python
|
|
483
|
+
collection.upsert(
|
|
484
|
+
ids=["id1", "id2", "id3"],
|
|
485
|
+
documents=[
|
|
486
|
+
"Document one - may be new or updated",
|
|
487
|
+
"Document two - may be new or updated",
|
|
488
|
+
"Document three - may be new or updated"
|
|
489
|
+
],
|
|
490
|
+
metadatas=[
|
|
491
|
+
{"version": 2},
|
|
492
|
+
{"version": 2},
|
|
493
|
+
{"version": 1}
|
|
494
|
+
]
|
|
495
|
+
)
|
|
496
|
+
```
|
|
497
|
+
|
|
498
|
+
If the ID exists, it updates the document. If not, it adds it as new.
|
|
499
|
+
|
|
500
|
+
---
|
|
501
|
+
|
|
502
|
+
## Deleting Data
|
|
503
|
+
|
|
504
|
+
### Delete by IDs
|
|
505
|
+
|
|
506
|
+
```python
|
|
507
|
+
collection.delete(
|
|
508
|
+
ids=["id1", "id2"]
|
|
509
|
+
)
|
|
510
|
+
```
|
|
511
|
+
|
|
512
|
+
### Delete with Metadata Filter
|
|
513
|
+
|
|
514
|
+
```python
|
|
515
|
+
collection.delete(
|
|
516
|
+
where={"category": "tropical"}
|
|
517
|
+
)
|
|
518
|
+
```
|
|
519
|
+
|
|
520
|
+
### Delete with Document Filter
|
|
521
|
+
|
|
522
|
+
```python
|
|
523
|
+
collection.delete(
|
|
524
|
+
where_document={"$contains": "deprecated"}
|
|
525
|
+
)
|
|
526
|
+
```
|
|
527
|
+
|
|
528
|
+
### Delete All Documents (Keep Collection)
|
|
529
|
+
|
|
530
|
+
```python
|
|
531
|
+
collection.delete(ids=collection.get()["ids"])
|
|
532
|
+
```
|
|
533
|
+
|
|
534
|
+
---
|
|
535
|
+
|
|
536
|
+
## Collection Utilities
|
|
537
|
+
|
|
538
|
+
### Count Documents
|
|
539
|
+
|
|
540
|
+
```python
|
|
541
|
+
count = collection.count()
|
|
542
|
+
print(f"Total documents: {count}")
|
|
543
|
+
```
|
|
544
|
+
|
|
545
|
+
### Peek at First Documents
|
|
546
|
+
|
|
547
|
+
```python
|
|
548
|
+
first_docs = collection.peek(limit=5)
|
|
549
|
+
print(first_docs)
|
|
550
|
+
```
|
|
551
|
+
|
|
552
|
+
Returns the first 5 documents in the collection. Note that calling `delete()` with no filters is rejected by ChromaDB — to clear a collection, pass the full list of IDs (as shown above) or delete and recreate the collection.
|
|
553
|
+
|
|
554
|
+
### Modify Collection Metadata
|
|
555
|
+
|
|
556
|
+
```python
|
|
557
|
+
collection.modify(
|
|
558
|
+
metadata={
|
|
559
|
+
"description": "Collection of fruit documents",
|
|
560
|
+
"version": "1.0"
|
|
561
|
+
}
|
|
562
|
+
)
|
|
563
|
+
```
|
|
564
|
+
|
|
565
|
+
---
|
|
566
|
+
|
|
567
|
+
## Embedding Functions
|
|
568
|
+
|
|
569
|
+
### Using Default Embedding Function
|
|
570
|
+
|
|
571
|
+
```python
|
|
572
|
+
import chromadb
|
|
573
|
+
|
|
574
|
+
client = chromadb.PersistentClient()
|
|
575
|
+
collection = client.create_collection(name="my_collection")
|
|
576
|
+
```
|
|
577
|
+
|
|
578
|
+
By default, ChromaDB uses the Sentence Transformers `all-MiniLM-L6-v2` model.
|
|
579
|
+
|
|
580
|
+
### Using OpenAI Embeddings
|
|
581
|
+
|
|
582
|
+
```python
|
|
583
|
+
import chromadb
|
|
584
|
+
from chromadb.utils import embedding_functions
|
|
585
|
+
|
|
586
|
+
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
|
|
587
|
+
api_key="your-openai-api-key",
|
|
588
|
+
model_name="text-embedding-3-small"
|
|
589
|
+
)
|
|
590
|
+
|
|
591
|
+
collection = client.create_collection(
|
|
592
|
+
name="openai_collection",
|
|
593
|
+
embedding_function=openai_ef
|
|
594
|
+
)
|
|
595
|
+
```
|
|
596
|
+
|
|
597
|
+
**Available OpenAI Models:**
|
|
598
|
+
- `text-embedding-3-small`
|
|
599
|
+
- `text-embedding-3-large`
|
|
600
|
+
- `text-embedding-ada-002`
|
|
601
|
+
|
|
602
|
+
### Using Cohere Embeddings
|
|
603
|
+
|
|
604
|
+
```python
|
|
605
|
+
import chromadb
|
|
606
|
+
from chromadb.utils import embedding_functions
|
|
607
|
+
|
|
608
|
+
cohere_ef = embedding_functions.CohereEmbeddingFunction(
|
|
609
|
+
api_key="your-cohere-api-key",
|
|
610
|
+
model_name="embed-english-v3.0"
|
|
611
|
+
)
|
|
612
|
+
|
|
613
|
+
collection = client.create_collection(
|
|
614
|
+
name="cohere_collection",
|
|
615
|
+
embedding_function=cohere_ef
|
|
616
|
+
)
|
|
617
|
+
```
|
|
618
|
+
|
|
619
|
+
**Available Cohere Models:**
|
|
620
|
+
- `embed-english-v3.0`
|
|
621
|
+
- `embed-multilingual-v3.0`
|
|
622
|
+
- `embed-english-light-v3.0`
|
|
623
|
+
|
|
624
|
+
### Using Hugging Face Embeddings
|
|
625
|
+
|
|
626
|
+
```python
|
|
627
|
+
import chromadb
|
|
628
|
+
from chromadb.utils import embedding_functions
|
|
629
|
+
|
|
630
|
+
huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
|
|
631
|
+
api_key="your-hf-api-key",
|
|
632
|
+
model_name="sentence-transformers/all-MiniLM-L6-v2"
|
|
633
|
+
)
|
|
634
|
+
|
|
635
|
+
collection = client.create_collection(
|
|
636
|
+
name="hf_collection",
|
|
637
|
+
embedding_function=huggingface_ef
|
|
638
|
+
)
|
|
639
|
+
```
|
|
640
|
+
|
|
641
|
+
### Using Sentence Transformers (Local)
|
|
642
|
+
|
|
643
|
+
```python
|
|
644
|
+
import chromadb
|
|
645
|
+
from chromadb.utils import embedding_functions
|
|
646
|
+
|
|
647
|
+
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
|
|
648
|
+
model_name="all-MiniLM-L6-v2"
|
|
649
|
+
)
|
|
650
|
+
|
|
651
|
+
collection = client.create_collection(
|
|
652
|
+
name="local_collection",
|
|
653
|
+
embedding_function=sentence_transformer_ef
|
|
654
|
+
)
|
|
655
|
+
```
|
|
656
|
+
|
|
657
|
+
### Using Ollama Embeddings (Local)
|
|
658
|
+
|
|
659
|
+
```python
|
|
660
|
+
import chromadb
|
|
661
|
+
from chromadb.utils import embedding_functions
|
|
662
|
+
|
|
663
|
+
ollama_ef = embedding_functions.OllamaEmbeddingFunction(
|
|
664
|
+
url="http://localhost:11434/api/embeddings",
|
|
665
|
+
model_name="llama2"
|
|
666
|
+
)
|
|
667
|
+
|
|
668
|
+
collection = client.create_collection(
|
|
669
|
+
name="ollama_collection",
|
|
670
|
+
embedding_function=ollama_ef
|
|
671
|
+
)
|
|
672
|
+
```
|
|
673
|
+
|
|
674
|
+
### Custom Embedding Function
|
|
675
|
+
|
|
676
|
+
```python
|
|
677
|
+
import chromadb
|
|
678
|
+
from chromadb import Documents, EmbeddingFunction, Embeddings
|
|
679
|
+
|
|
680
|
+
class CustomEmbeddingFunction(EmbeddingFunction):
|
|
681
|
+
def __call__(self, input: Documents) -> Embeddings:
|
|
682
|
+
# Your custom embedding logic here
|
|
683
|
+
embeddings = []
|
|
684
|
+
for doc in input:
|
|
685
|
+
# Example: simple character code embedding (replace with real model)
|
|
686
|
+
embedding = [ord(c) / 255.0 for c in doc[:384]]
|
|
687
|
+
# Pad to fixed length
|
|
688
|
+
embedding.extend([0.0] * (384 - len(embedding)))
|
|
689
|
+
embeddings.append(embedding)
|
|
690
|
+
return embeddings
|
|
691
|
+
|
|
692
|
+
custom_ef = CustomEmbeddingFunction()
|
|
693
|
+
collection = client.create_collection(
|
|
694
|
+
name="custom_collection",
|
|
695
|
+
embedding_function=custom_ef
|
|
696
|
+
)
|
|
697
|
+
```
|
|
698
|
+
|
|
699
|
+
---
|
|
700
|
+
|
|
701
|
+
## Advanced Client Configuration
|
|
702
|
+
|
|
703
|
+
### PersistentClient with Full Options
|
|
704
|
+
|
|
705
|
+
```python
|
|
706
|
+
import chromadb
|
|
707
|
+
from chromadb.config import Settings
|
|
708
|
+
|
|
709
|
+
client = chromadb.PersistentClient(
|
|
710
|
+
path="./my_chroma_data",
|
|
711
|
+
settings=Settings(
|
|
712
|
+
anonymized_telemetry=False,
|
|
713
|
+
allow_reset=True
|
|
714
|
+
),
|
|
715
|
+
tenant="default_tenant",
|
|
716
|
+
database="default_database"
|
|
717
|
+
)
|
|
718
|
+
```
|
|
719
|
+
|
|
720
|
+
### HttpClient with Authentication
|
|
721
|
+
|
|
722
|
+
```python
|
|
723
|
+
import chromadb
|
|
724
|
+
from chromadb.config import Settings
|
|
725
|
+
|
|
726
|
+
client = chromadb.HttpClient(
|
|
727
|
+
host="my-chroma-server.com",
|
|
728
|
+
port=8000,
|
|
729
|
+
ssl=True,
|
|
730
|
+
headers={"Authorization": "Bearer my-token"},
|
|
731
|
+
settings=Settings(
|
|
732
|
+
chroma_client_auth_provider="chromadb.auth.token_authn.TokenAuthClientProvider",
|
|
733
|
+
chroma_client_auth_credentials="my-token"
|
|
734
|
+
)
|
|
735
|
+
)
|
|
736
|
+
```
|
|
737
|
+
|
|
738
|
+
### Multi-Tenancy Setup
|
|
739
|
+
|
|
740
|
+
```python
|
|
741
|
+
import chromadb
|
|
742
|
+
|
|
743
|
+
client = chromadb.HttpClient(host="localhost", port=8000)
|
|
744
|
+
|
|
745
|
+
# Create a new tenant and database
|
|
746
|
+
client.create_tenant(name="acme_corp")
|
|
747
|
+
client.create_database(name="production", tenant="acme_corp")
|
|
748
|
+
|
|
749
|
+
# Connect to specific tenant/database
|
|
750
|
+
tenant_client = chromadb.HttpClient(
|
|
751
|
+
host="localhost",
|
|
752
|
+
port=8000,
|
|
753
|
+
tenant="acme_corp",
|
|
754
|
+
database="production"
|
|
755
|
+
)
|
|
756
|
+
```
|
|
757
|
+
|
|
758
|
+
---
|
|
759
|
+
|
|
760
|
+
## Complete Example: Document Search System
|
|
761
|
+
|
|
762
|
+
```python
|
|
763
|
+
import chromadb
|
|
764
|
+
|
|
765
|
+
def main():
|
|
766
|
+
# Initialize client
|
|
767
|
+
client = chromadb.PersistentClient(path="./search_db")
|
|
768
|
+
|
|
769
|
+
# Create or get collection
|
|
770
|
+
collection = client.get_or_create_collection(
|
|
771
|
+
name="knowledge_base",
|
|
772
|
+
metadata={"hnsw:space": "cosine"}
|
|
773
|
+
)
|
|
774
|
+
|
|
775
|
+
# Add documents
|
|
776
|
+
collection.add(
|
|
777
|
+
ids=["doc1", "doc2", "doc3", "doc4"],
|
|
778
|
+
documents=[
|
|
779
|
+
"The quick brown fox jumps over the lazy dog",
|
|
780
|
+
"Machine learning is a subset of artificial intelligence",
|
|
781
|
+
"Python is a popular programming language",
|
|
782
|
+
"ChromaDB is a vector database for AI applications"
|
|
783
|
+
],
|
|
784
|
+
metadatas=[
|
|
785
|
+
{"category": "phrases", "language": "english"},
|
|
786
|
+
{"category": "ai", "language": "english"},
|
|
787
|
+
{"category": "programming", "language": "english"},
|
|
788
|
+
{"category": "database", "language": "english"}
|
|
789
|
+
]
|
|
790
|
+
)
|
|
791
|
+
|
|
792
|
+
# Query the collection
|
|
793
|
+
results = collection.query(
|
|
794
|
+
query_texts=["What is AI?"],
|
|
795
|
+
n_results=2,
|
|
796
|
+
where={"category": "ai"}
|
|
797
|
+
)
|
|
798
|
+
|
|
799
|
+
print("Search Results:")
|
|
800
|
+
print(results["documents"][0])
|
|
801
|
+
print(results["metadatas"][0])
|
|
802
|
+
print(results["distances"][0])
|
|
803
|
+
|
|
804
|
+
# Get document count
|
|
805
|
+
count = collection.count()
|
|
806
|
+
print(f"Total documents: {count}")
|
|
807
|
+
|
|
808
|
+
# Update a document
|
|
809
|
+
collection.update(
|
|
810
|
+
ids=["doc2"],
|
|
811
|
+
documents=["Machine learning is a powerful subset of artificial intelligence"],
|
|
812
|
+
metadatas=[{"category": "ai", "language": "english", "updated": True}]
|
|
813
|
+
)
|
|
814
|
+
|
|
815
|
+
# Delete documents
|
|
816
|
+
collection.delete(ids=["doc4"])
|
|
817
|
+
|
|
818
|
+
if __name__ == "__main__":
|
|
819
|
+
main()
|
|
820
|
+
```
|
|
821
|
+
|
|
822
|
+
---
|
|
823
|
+
|
|
824
|
+
## Complete Example: Semantic Search with OpenAI
|
|
825
|
+
|
|
826
|
+
```python
|
|
827
|
+
import chromadb
|
|
828
|
+
from chromadb.utils import embedding_functions
|
|
829
|
+
import os
|
|
830
|
+
from dotenv import load_dotenv
|
|
831
|
+
|
|
832
|
+
load_dotenv()
|
|
833
|
+
|
|
834
|
+
def semantic_search():
    """Demonstrate semantic search over a small article corpus.

    Opens (or creates) a persistent ChromaDB collection that embeds
    documents via the OpenAI embedding API, indexes three sample
    articles, runs one natural-language query, and prints every hit
    with its metadata and cosine distance.
    """
    # Persistent storage keeps the index across runs.
    db = chromadb.PersistentClient(path="./semantic_db")

    # Embedding function; requires OPENAI_API_KEY in the environment.
    embedder = embedding_functions.OpenAIEmbeddingFunction(
        api_key=os.getenv("OPENAI_API_KEY"),
        model_name="text-embedding-3-small"
    )

    articles = db.get_or_create_collection(
        name="articles",
        embedding_function=embedder,
        metadata={"hnsw:space": "cosine"}
    )

    # Seed the collection with sample articles.
    sample_ids = ["art1", "art2", "art3"]
    sample_docs = [
        "Climate change is affecting global weather patterns",
        "New breakthrough in quantum computing announced",
        "The future of renewable energy looks promising"
    ]
    sample_meta = [
        {"topic": "environment", "date": "2024-01-15"},
        {"topic": "technology", "date": "2024-01-16"},
        {"topic": "energy", "date": "2024-01-17"}
    ]
    articles.add(ids=sample_ids, documents=sample_docs, metadatas=sample_meta)

    # One query text; results are nested one level per query.
    results = articles.query(
        query_texts=["Tell me about environmental issues"],
        n_results=3
    )

    # Smaller distance means a closer match under the cosine space.
    for idx, doc in enumerate(results["documents"][0]):
        print(f"Result {idx + 1}:")
        print(f"Document: {doc}")
        print(f"Metadata: {results['metadatas'][0][idx]}")
        print(f"Distance: {results['distances'][0][idx]}")
        print("---")


if __name__ == "__main__":
    semantic_search()
|
|
878
|
+
```
|
|
879
|
+
|
|
880
|
+
---
|
|
881
|
+
|
|
882
|
+
## Complete Example: RAG (Retrieval-Augmented Generation)
|
|
883
|
+
|
|
884
|
+
```python
|
|
885
|
+
import chromadb
|
|
886
|
+
from openai import OpenAI
|
|
887
|
+
import os
|
|
888
|
+
from dotenv import load_dotenv
|
|
889
|
+
|
|
890
|
+
load_dotenv()
|
|
891
|
+
|
|
892
|
+
def rag_example():
    """Answer a question with retrieval-augmented generation (RAG).

    Retrieves the most relevant company policies from a persistent
    ChromaDB collection, then passes them as context to an OpenAI chat
    completion and prints the grounded answer.
    """
    store = chromadb.PersistentClient(path="./rag_db")
    docs = store.get_or_create_collection(
        name="company_docs",
        metadata={"hnsw:space": "cosine"}
    )

    # Seed the knowledge base with a few HR policies.
    docs.add(
        ids=["policy1", "policy2", "policy3"],
        documents=[
            "Our company offers 20 days of paid vacation per year",
            "Remote work is available 3 days per week",
            "Health insurance includes dental and vision coverage"
        ],
        metadatas=[
            {"type": "policy", "category": "time-off"},
            {"type": "policy", "category": "work-arrangement"},
            {"type": "policy", "category": "benefits"}
        ]
    )

    question = "How many vacation days do I get?"

    # Retrieval step: the two policies most similar to the question.
    hits = docs.query(query_texts=[question], n_results=2)
    context = "\n".join(hits["documents"][0])

    # Generation step: the model is instructed to answer from context only.
    llm = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    completion = llm.chat.completions.create(
        model="gpt-4",
        messages=[
            {
                "role": "system",
                "content": "Answer the question based on the context provided."
            },
            {
                "role": "user",
                "content": f"Context:\n{context}\n\nQuestion: {question}"
            }
        ]
    )

    print("Answer:", completion.choices[0].message.content)


if __name__ == "__main__":
    rag_example()
|
|
947
|
+
```
|
|
948
|
+
|
|
949
|
+
---
|
|
950
|
+
|
|
951
|
+
## Complete Example: Streaming Large Dataset
|
|
952
|
+
|
|
953
|
+
```python
|
|
954
|
+
import chromadb
|
|
955
|
+
from typing import List, Generator
|
|
956
|
+
|
|
957
|
+
def document_generator(file_path: str) -> Generator[str, None, None]:
    """Yield one whitespace-stripped document per line of *file_path*."""
    with open(file_path, 'r') as handle:
        # Lazy: the file is read line by line, never fully in memory.
        yield from (raw.strip() for raw in handle)
|
|
962
|
+
|
|
963
|
+
def stream_add_documents(collection, doc_gen: Generator, batch_size: int = 1000):
    """Insert documents from *doc_gen* into *collection* in batches.

    Sequential IDs "doc0", "doc1", ... are assigned in arrival order.
    A progress line is printed after every full batch and once more for
    the trailing partial batch, if any.
    """
    pending_ids, batch_docs = [], []
    counter = 0

    for doc in doc_gen:
        pending_ids.append(f"doc{counter}")
        batch_docs.append(doc)
        counter += 1

        # Flush whenever a full batch has accumulated.
        if len(batch_docs) >= batch_size:
            collection.add(ids=pending_ids, documents=batch_docs)
            print(f"Added {counter} documents...")
            pending_ids, batch_docs = [], []

    # Flush whatever is left over after the generator is exhausted.
    if batch_docs:
        collection.add(ids=pending_ids, documents=batch_docs)
        print(f"Added final {len(batch_docs)} documents. Total: {counter}")
|
|
990
|
+
|
|
991
|
+
def main():
    """Stream a large text corpus from disk into a persistent collection."""
    store = chromadb.PersistentClient(path="./large_db")
    collection = store.get_or_create_collection(name="large_collection")

    # Lazily read the dataset so the whole file never sits in memory.
    stream_add_documents(
        collection,
        document_generator("large_dataset.txt"),
        batch_size=5000,
    )

    print(f"Total documents in collection: {collection.count()}")


if __name__ == "__main__":
    main()
|
|
1003
|
+
```
|
|
1004
|
+
|
|
1005
|
+
---
|
|
1006
|
+
|
|
1007
|
+
## Environment Variables
|
|
1008
|
+
|
|
1009
|
+
### .env File Setup
|
|
1010
|
+
|
|
1011
|
+
```bash
|
|
1012
|
+
# ChromaDB Server
|
|
1013
|
+
CHROMA_HOST=localhost
|
|
1014
|
+
CHROMA_PORT=8000
|
|
1015
|
+
|
|
1016
|
+
# Authentication (if required)
|
|
1017
|
+
CHROMA_AUTH_TOKEN=your-auth-token
|
|
1018
|
+
|
|
1019
|
+
# Embedding API Keys
|
|
1020
|
+
OPENAI_API_KEY=sk-...
|
|
1021
|
+
COHERE_API_KEY=...
|
|
1022
|
+
HF_API_KEY=...
|
|
1023
|
+
```
|
|
1024
|
+
|
|
1025
|
+
### Using Environment Variables
|
|
1026
|
+
|
|
1027
|
+
```python
|
|
1028
|
+
import chromadb
|
|
1029
|
+
import os
|
|
1030
|
+
from dotenv import load_dotenv
|
|
1031
|
+
|
|
1032
|
+
load_dotenv()
|
|
1033
|
+
|
|
1034
|
+
client = chromadb.HttpClient(
|
|
1035
|
+
host=os.getenv("CHROMA_HOST", "localhost"),
|
|
1036
|
+
port=int(os.getenv("CHROMA_PORT", "8000")),
|
|
1037
|
+
headers={"Authorization": f"Bearer {os.getenv('CHROMA_AUTH_TOKEN')}"}
|
|
1038
|
+
if os.getenv("CHROMA_AUTH_TOKEN") else None
|
|
1039
|
+
)
|
|
1040
|
+
```
|
|
1041
|
+
|
|
1042
|
+
---
|
|
1043
|
+
|
|
1044
|
+
## Type Hints
|
|
1045
|
+
|
|
1046
|
+
### Query Results Type
|
|
1047
|
+
|
|
1048
|
+
```python
|
|
1049
|
+
from typing import TypedDict, List, Optional
|
|
1050
|
+
|
|
1051
|
+
class QueryResult(TypedDict):
    """Shape of ``collection.query(...)``: one inner list per query text."""

    ids: List[List[str]]                           # matched IDs, per query
    embeddings: Optional[List[List[List[float]]]]  # present only when requested via ``include``
    documents: List[List[Optional[str]]]           # matched document texts
    metadatas: List[List[Optional[dict]]]          # matched metadata dicts
    distances: Optional[List[List[float]]]         # similarity distances (smaller = closer)
|
|
1057
|
+
```
|
|
1058
|
+
|
|
1059
|
+
### Get Results Type
|
|
1060
|
+
|
|
1061
|
+
```python
|
|
1062
|
+
from typing import TypedDict, List, Optional
|
|
1063
|
+
|
|
1064
|
+
class GetResult(TypedDict):
    """Shape of ``collection.get(...)``: flat lists, one entry per document."""

    ids: List[str]                           # document IDs
    embeddings: Optional[List[List[float]]]  # present only when requested via ``include``
    documents: List[Optional[str]]           # document texts
    metadatas: List[Optional[dict]]          # metadata dicts
|
|
1069
|
+
```
|
|
1070
|
+
|
|
1071
|
+
### Type-Safe Collection Operations
|
|
1072
|
+
|
|
1073
|
+
```python
|
|
1074
|
+
from typing import List, Dict, Optional
|
|
1075
|
+
|
|
1076
|
+
from typing import Any


def add_typed_documents(
    collection,
    ids: List[str],
    documents: List[str],
    metadatas: Optional[List[Dict[str, Any]]] = None
) -> None:
    """Add documents to *collection* with fully typed arguments.

    Args:
        collection: A ChromaDB collection (anything exposing ``add``).
        ids: Unique document IDs, parallel to ``documents``.
        documents: Raw document texts to index.
        metadatas: Optional per-document metadata dicts, parallel to
            ``documents``; ``None`` means no metadata at all.
    """
    # The metadata value type is ``typing.Any`` — the builtin ``any()``
    # function is not a type and made the previous annotation meaningless.
    collection.add(
        ids=ids,
        documents=documents,
        metadatas=metadatas
    )
|
|
1087
|
+
```
|
|
1088
|
+
|
|
1089
|
+
---
|
|
1090
|
+
|
|
1091
|
+
## Error Handling
|
|
1092
|
+
|
|
1093
|
+
### Handle Collection Not Found
|
|
1094
|
+
|
|
1095
|
+
```python
|
|
1096
|
+
import chromadb
|
|
1097
|
+
|
|
1098
|
+
client = chromadb.PersistentClient()
|
|
1099
|
+
|
|
1100
|
+
try:
|
|
1101
|
+
collection = client.get_collection(name="nonexistent_collection")
|
|
1102
|
+
except ValueError as e:
|
|
1103
|
+
if "does not exist" in str(e):
|
|
1104
|
+
print("Collection not found, creating...")
|
|
1105
|
+
collection = client.create_collection(name="nonexistent_collection")
|
|
1106
|
+
else:
|
|
1107
|
+
raise
|
|
1108
|
+
```
|
|
1109
|
+
|
|
1110
|
+
### Handle Duplicate IDs
|
|
1111
|
+
|
|
1112
|
+
```python
|
|
1113
|
+
try:
|
|
1114
|
+
collection.add(
|
|
1115
|
+
ids=["id1"],
|
|
1116
|
+
documents=["Document"]
|
|
1117
|
+
)
|
|
1118
|
+
|
|
1119
|
+
# This will fail - ID already exists
|
|
1120
|
+
collection.add(
|
|
1121
|
+
ids=["id1"],
|
|
1122
|
+
documents=["Another document"]
|
|
1123
|
+
)
|
|
1124
|
+
except Exception as e:
|
|
1125
|
+
print(f"ID already exists. Use update() or upsert() instead. Error: {e}")
|
|
1126
|
+
|
|
1127
|
+
# Use upsert to add or update
|
|
1128
|
+
collection.upsert(
|
|
1129
|
+
ids=["id1"],
|
|
1130
|
+
documents=["Another document"]
|
|
1131
|
+
)
|
|
1132
|
+
```
|
|
1133
|
+
|
|
1134
|
+
### Handle Connection Errors
|
|
1135
|
+
|
|
1136
|
+
```python
|
|
1137
|
+
import chromadb
|
|
1138
|
+
from requests.exceptions import ConnectionError
|
|
1139
|
+
|
|
1140
|
+
try:
|
|
1141
|
+
client = chromadb.HttpClient(host="localhost", port=8000)
|
|
1142
|
+
collections = client.list_collections()
|
|
1143
|
+
except ConnectionError:
|
|
1144
|
+
print("Cannot connect to ChromaDB server. Make sure it's running.")
|
|
1145
|
+
print("Start server with: chroma run --path ./data")
|
|
1146
|
+
except Exception as e:
|
|
1147
|
+
print(f"Error: {e}")
|
|
1148
|
+
```
|
|
1149
|
+
|
|
1150
|
+
### Handle Empty Results
|
|
1151
|
+
|
|
1152
|
+
```python
|
|
1153
|
+
results = collection.query(
|
|
1154
|
+
query_texts=["Very specific query"],
|
|
1155
|
+
n_results=5
|
|
1156
|
+
)
|
|
1157
|
+
|
|
1158
|
+
if not results["ids"][0]:
|
|
1159
|
+
print("No results found. Try a broader query.")
|
|
1160
|
+
else:
|
|
1161
|
+
print(f"Found {len(results['ids'][0])} results")
|
|
1162
|
+
```
|
|
1163
|
+
|
|
1164
|
+
---
|
|
1165
|
+
|
|
1166
|
+
## Performance Optimization
|
|
1167
|
+
|
|
1168
|
+
### Batch Operations
|
|
1169
|
+
|
|
1170
|
+
```python
|
|
1171
|
+
# Batch add for large datasets
|
|
1172
|
+
chunk_size = 5000
|
|
1173
|
+
for i in range(0, len(documents), chunk_size):
|
|
1174
|
+
collection.add(
|
|
1175
|
+
ids=ids[i:i + chunk_size],
|
|
1176
|
+
documents=documents[i:i + chunk_size],
|
|
1177
|
+
metadatas=metadatas[i:i + chunk_size]
|
|
1178
|
+
)
|
|
1179
|
+
```
|
|
1180
|
+
|
|
1181
|
+
### Parallel Queries
|
|
1182
|
+
|
|
1183
|
+
```python
|
|
1184
|
+
import concurrent.futures
|
|
1185
|
+
|
|
1186
|
+
queries = [
|
|
1187
|
+
"Query about topic A",
|
|
1188
|
+
"Query about topic B",
|
|
1189
|
+
"Query about topic C"
|
|
1190
|
+
]
|
|
1191
|
+
|
|
1192
|
+
def run_query(query):
    """Fetch the top 5 matches for *query* from the shared ``collection``."""
    # NOTE(review): relies on a module-level ``collection`` defined earlier
    # in this example — thread-safe here because queries are read-only.
    return collection.query(query_texts=[query], n_results=5)
|
|
1197
|
+
|
|
1198
|
+
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
|
|
1199
|
+
results = list(executor.map(run_query, queries))
|
|
1200
|
+
```
|
|
1201
|
+
|
|
1202
|
+
### Limit Included Fields
|
|
1203
|
+
|
|
1204
|
+
```python
|
|
1205
|
+
# Exclude embeddings for better performance
|
|
1206
|
+
results = collection.query(
|
|
1207
|
+
query_texts=["Search query"],
|
|
1208
|
+
n_results=10,
|
|
1209
|
+
include=["documents", "metadatas", "distances"]
|
|
1210
|
+
# Don't include embeddings unless needed
|
|
1211
|
+
)
|
|
1212
|
+
```
|
|
1213
|
+
|
|
1214
|
+
### Reuse Embedding Function
|
|
1215
|
+
|
|
1216
|
+
```python
|
|
1217
|
+
from chromadb.utils import embedding_functions
|
|
1218
|
+
|
|
1219
|
+
# Create once, reuse for multiple collections
|
|
1220
|
+
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
|
|
1221
|
+
api_key="your-api-key",
|
|
1222
|
+
model_name="text-embedding-3-small"
|
|
1223
|
+
)
|
|
1224
|
+
|
|
1225
|
+
collection1 = client.create_collection(
|
|
1226
|
+
name="collection1",
|
|
1227
|
+
embedding_function=openai_ef
|
|
1228
|
+
)
|
|
1229
|
+
|
|
1230
|
+
collection2 = client.create_collection(
|
|
1231
|
+
name="collection2",
|
|
1232
|
+
embedding_function=openai_ef
|
|
1233
|
+
)
|
|
1234
|
+
```
|
|
1235
|
+
|
|
1236
|
+
---
|
|
1237
|
+
|
|
1238
|
+
## Common Patterns
|
|
1239
|
+
|
|
1240
|
+
### Incremental Updates
|
|
1241
|
+
|
|
1242
|
+
```python
|
|
1243
|
+
from datetime import datetime
from typing import List


def add_daily_documents(collection, new_docs: List[str]) -> None:
    """Append *new_docs* to *collection* with sequential IDs and a timestamp.

    IDs continue from the current document count ("doc<count>", ...), and
    every document in the batch is stamped with the same ``added_date``
    ISO-8601 timestamp.

    NOTE(review): count-based IDs can collide with existing IDs once any
    document has been deleted — confirm documents are never removed, or
    switch to UUIDs.
    """
    # ``List`` and ``datetime`` were previously unimported in this snippet
    # (NameError at definition time) / imported mid-function.
    count = collection.count()
    new_ids = [f"doc{count + i}" for i in range(len(new_docs))]

    # One timestamp for the whole batch, so all entries agree.
    added_at = datetime.now().isoformat()
    collection.add(
        ids=new_ids,
        documents=new_docs,
        metadatas=[{"added_date": added_at} for _ in new_docs]
    )
|
|
1254
|
+
```
|
|
1255
|
+
|
|
1256
|
+
### Search with Fallback
|
|
1257
|
+
|
|
1258
|
+
```python
|
|
1259
|
+
def search_with_fallback(collection, query: str):
    """Search "premium"-category documents first; widen to all on a miss."""
    hits = collection.query(
        query_texts=[query],
        n_results=5,
        where={"category": "premium"}
    )
    if hits["ids"][0]:
        return hits

    # Premium search came back empty — retry without the metadata filter.
    return collection.query(query_texts=[query], n_results=5)
|
|
1276
|
+
```
|
|
1277
|
+
|
|
1278
|
+
### Deduplicate Documents
|
|
1279
|
+
|
|
1280
|
+
```python
|
|
1281
|
+
def add_unique(collection, doc_id: str, document: str, metadata: dict):
    """Upsert by hand: update when *doc_id* already exists, otherwise add."""
    payload = dict(ids=[doc_id], documents=[document], metadatas=[metadata])
    try:
        if collection.get(ids=[doc_id])["ids"]:
            print("Document already exists, updating...")
            collection.update(**payload)
        else:
            collection.add(**payload)
    except Exception:
        # Any failure above (lookup or write) falls back to a plain add.
        collection.add(**payload)
|
|
1304
|
+
```
|
|
1305
|
+
|
|
1306
|
+
### Pagination
|
|
1307
|
+
|
|
1308
|
+
```python
|
|
1309
|
+
def paginate_collection(collection, page_size: int = 100):
    """Yield ``collection.get`` pages of at most *page_size* documents each."""
    # Snapshot the count once; pages are fetched lazily as the caller iterates.
    total = collection.count()
    for start in range(0, total, page_size):
        yield collection.get(limit=page_size, offset=start)
|
|
1319
|
+
|
|
1320
|
+
# Usage
|
|
1321
|
+
for page in paginate_collection(collection, page_size=100):
|
|
1322
|
+
print(f"Processing {len(page['ids'])} documents...")
|
|
1323
|
+
# Process documents
|
|
1324
|
+
```
|
|
1325
|
+
|
|
1326
|
+
---
|
|
1327
|
+
|
|
1328
|
+
## Metadata Filter Operators
|
|
1329
|
+
|
|
1330
|
+
### Comparison Operators
|
|
1331
|
+
|
|
1332
|
+
```python
|
|
1333
|
+
# Equal
|
|
1334
|
+
where = {"category": "tech"}
|
|
1335
|
+
where = {"category": {"$eq": "tech"}}
|
|
1336
|
+
|
|
1337
|
+
# Not equal
|
|
1338
|
+
where = {"category": {"$ne": "tech"}}
|
|
1339
|
+
|
|
1340
|
+
# Greater than
|
|
1341
|
+
where = {"price": {"$gt": 100}}
|
|
1342
|
+
|
|
1343
|
+
# Greater than or equal
|
|
1344
|
+
where = {"price": {"$gte": 100}}
|
|
1345
|
+
|
|
1346
|
+
# Less than
|
|
1347
|
+
where = {"price": {"$lt": 100}}
|
|
1348
|
+
|
|
1349
|
+
# Less than or equal
|
|
1350
|
+
where = {"price": {"$lte": 100}}
|
|
1351
|
+
```
|
|
1352
|
+
|
|
1353
|
+
### Logical Operators
|
|
1354
|
+
|
|
1355
|
+
```python
|
|
1356
|
+
# AND
|
|
1357
|
+
where = {
|
|
1358
|
+
"$and": [
|
|
1359
|
+
{"category": "tech"},
|
|
1360
|
+
{"price": {"$lt": 1000}}
|
|
1361
|
+
]
|
|
1362
|
+
}
|
|
1363
|
+
|
|
1364
|
+
# OR
|
|
1365
|
+
where = {
|
|
1366
|
+
"$or": [
|
|
1367
|
+
{"category": "tech"},
|
|
1368
|
+
{"category": "science"}
|
|
1369
|
+
]
|
|
1370
|
+
}
|
|
1371
|
+
|
|
1372
|
+
# NOT
|
|
1373
|
+
where = {
|
|
1374
|
+
"$not": {"category": "archived"}
|
|
1375
|
+
}
|
|
1376
|
+
```
|
|
1377
|
+
|
|
1378
|
+
### Set Operators
|
|
1379
|
+
|
|
1380
|
+
```python
|
|
1381
|
+
# In array
|
|
1382
|
+
where = {
|
|
1383
|
+
"category": {"$in": ["tech", "science", "health"]}
|
|
1384
|
+
}
|
|
1385
|
+
|
|
1386
|
+
# Not in array
|
|
1387
|
+
where = {
|
|
1388
|
+
"category": {"$nin": ["archived", "deleted"]}
|
|
1389
|
+
}
|
|
1390
|
+
```
|
|
1391
|
+
|
|
1392
|
+
---
|
|
1393
|
+
|
|
1394
|
+
## Document Filter Operators
|
|
1395
|
+
|
|
1396
|
+
### Contains
|
|
1397
|
+
|
|
1398
|
+
```python
|
|
1399
|
+
where_document = {"$contains": "machine learning"}
|
|
1400
|
+
```
|
|
1401
|
+
|
|
1402
|
+
### Not Contains
|
|
1403
|
+
|
|
1404
|
+
```python
|
|
1405
|
+
where_document = {"$not_contains": "deprecated"}
|
|
1406
|
+
```
|
|
1407
|
+
|
|
1408
|
+
### Combined with Metadata
|
|
1409
|
+
|
|
1410
|
+
```python
|
|
1411
|
+
results = collection.query(
|
|
1412
|
+
query_texts=["AI research"],
|
|
1413
|
+
where={"category": "research"},
|
|
1414
|
+
where_document={"$contains": "neural network"},
|
|
1415
|
+
n_results=10
|
|
1416
|
+
)
|
|
1417
|
+
```
|
|
1418
|
+
|
|
1419
|
+
---
|
|
1420
|
+
|
|
1421
|
+
## Working with Different Data Types
|
|
1422
|
+
|
|
1423
|
+
### Numeric Metadata
|
|
1424
|
+
|
|
1425
|
+
```python
|
|
1426
|
+
collection.add(
|
|
1427
|
+
ids=["p1", "p2", "p3"],
|
|
1428
|
+
documents=["Product A", "Product B", "Product C"],
|
|
1429
|
+
metadatas=[
|
|
1430
|
+
{"price": 29.99, "stock": 100},
|
|
1431
|
+
{"price": 49.99, "stock": 50},
|
|
1432
|
+
{"price": 19.99, "stock": 200}
|
|
1433
|
+
]
|
|
1434
|
+
)
|
|
1435
|
+
|
|
1436
|
+
# Query by price range
|
|
1437
|
+
results = collection.query(
|
|
1438
|
+
query_texts=["affordable products"],
|
|
1439
|
+
where={
|
|
1440
|
+
"$and": [
|
|
1441
|
+
{"price": {"$lt": 50}},
|
|
1442
|
+
{"stock": {"$gt": 75}}
|
|
1443
|
+
]
|
|
1444
|
+
},
|
|
1445
|
+
n_results=5
|
|
1446
|
+
)
|
|
1447
|
+
```
|
|
1448
|
+
|
|
1449
|
+
### Boolean Metadata
|
|
1450
|
+
|
|
1451
|
+
```python
|
|
1452
|
+
collection.add(
|
|
1453
|
+
ids=["u1", "u2", "u3"],
|
|
1454
|
+
documents=["User profile A", "User profile B", "User profile C"],
|
|
1455
|
+
metadatas=[
|
|
1456
|
+
{"active": True, "premium": False},
|
|
1457
|
+
{"active": True, "premium": True},
|
|
1458
|
+
{"active": False, "premium": False}
|
|
1459
|
+
]
|
|
1460
|
+
)
|
|
1461
|
+
|
|
1462
|
+
# Query active premium users
|
|
1463
|
+
results = collection.query(
|
|
1464
|
+
query_texts=["find users"],
|
|
1465
|
+
where={
|
|
1466
|
+
"$and": [
|
|
1467
|
+
{"active": True},
|
|
1468
|
+
{"premium": True}
|
|
1469
|
+
]
|
|
1470
|
+
},
|
|
1471
|
+
n_results=10
|
|
1472
|
+
)
|
|
1473
|
+
```
|
|
1474
|
+
|
|
1475
|
+
### List Metadata
|
|
1476
|
+
|
|
1477
|
+
```python
|
|
1478
|
+
# NOTE: ChromaDB metadata values must be strings, numbers, or booleans —
# list values are rejected by add(). Encode list-like tags as a JSON
# string (decode with json.loads on read), or use one field per tag.
collection.add(
    ids=["a1", "a2"],
    documents=["Article about AI and ML", "Article about databases"],
    metadatas=[
        {"tags": '["ai", "machine-learning", "neural-networks"]'},
        {"tags": '["database", "vector-search"]'}
    ]
)
|
|
1489
|
+
```
|
|
1490
|
+
|
|
1491
|
+
---
|
|
1492
|
+
|
|
1493
|
+
## Context Manager Pattern
|
|
1494
|
+
|
|
1495
|
+
```python
|
|
1496
|
+
import chromadb
|
|
1497
|
+
from contextlib import contextmanager
|
|
1498
|
+
|
|
1499
|
+
@contextmanager
def get_collection(collection_name: str, path: str = "./chroma_db"):
    """Yield a persistent collection named *collection_name*, creating it if missing.

    ChromaDB clients need no explicit teardown today; the ``finally``
    block is an intentional no-op placeholder for future cleanup.
    """
    store = chromadb.PersistentClient(path=path)
    try:
        yield store.get_or_create_collection(name=collection_name)
    finally:
        pass
|
|
1509
|
+
|
|
1510
|
+
# Usage
|
|
1511
|
+
with get_collection("my_collection") as collection:
|
|
1512
|
+
collection.add(
|
|
1513
|
+
ids=["id1"],
|
|
1514
|
+
documents=["Document"]
|
|
1515
|
+
)
|
|
1516
|
+
results = collection.query(
|
|
1517
|
+
query_texts=["Query"],
|
|
1518
|
+
n_results=5
|
|
1519
|
+
)
|
|
1520
|
+
print(results)
|
|
1521
|
+
```
|
|
1522
|
+
|
|
1523
|
+
---
|
|
1524
|
+
|
|
1525
|
+
## Testing with ChromaDB
|
|
1526
|
+
|
|
1527
|
+
### Using EphemeralClient for Tests
|
|
1528
|
+
|
|
1529
|
+
```python
|
|
1530
|
+
import unittest
|
|
1531
|
+
import chromadb
|
|
1532
|
+
|
|
1533
|
+
class TestDocumentSearch(unittest.TestCase):
    """Exercise add/query behaviour against an in-memory ChromaDB."""

    def setUp(self):
        """Give every test a fresh in-memory client and an empty collection."""
        self.client = chromadb.EphemeralClient()
        self.collection = self.client.create_collection(name="test_collection")

    def test_add_and_query(self):
        """Both stored documents come back for a related query."""
        self.collection.add(
            ids=["test1", "test2"],
            documents=["Document about cats", "Document about dogs"]
        )
        hits = self.collection.query(query_texts=["pets"], n_results=2)
        self.assertEqual(len(hits["ids"][0]), 2)

    def test_metadata_filtering(self):
        """A ``where`` filter restricts results to the matching category."""
        self.collection.add(
            ids=["test1", "test2"],
            documents=["Doc 1", "Doc 2"],
            metadatas=[{"category": "A"}, {"category": "B"}]
        )
        hits = self.collection.query(
            query_texts=["search"],
            where={"category": "A"},
            n_results=5
        )
        self.assertEqual(len(hits["ids"][0]), 1)


if __name__ == "__main__":
    unittest.main()
|
|
1571
|
+
```
|
|
1572
|
+
|
|
1573
|
+
---
|
|
1574
|
+
|
|
1575
|
+
## Monitoring and Debugging
|
|
1576
|
+
|
|
1577
|
+
### Enable Logging
|
|
1578
|
+
|
|
1579
|
+
```python
|
|
1580
|
+
import logging
|
|
1581
|
+
import chromadb
|
|
1582
|
+
|
|
1583
|
+
# Enable ChromaDB logging
|
|
1584
|
+
logging.basicConfig(level=logging.DEBUG)
|
|
1585
|
+
logger = logging.getLogger("chromadb")
|
|
1586
|
+
logger.setLevel(logging.DEBUG)
|
|
1587
|
+
|
|
1588
|
+
client = chromadb.PersistentClient(path="./debug_db")
|
|
1589
|
+
```
|
|
1590
|
+
|
|
1591
|
+
### Inspect Collection Details
|
|
1592
|
+
|
|
1593
|
+
```python
|
|
1594
|
+
collection = client.get_collection(name="my_collection")
|
|
1595
|
+
|
|
1596
|
+
print(f"Collection name: {collection.name}")
|
|
1597
|
+
print(f"Collection metadata: {collection.metadata}")
|
|
1598
|
+
print(f"Document count: {collection.count()}")
|
|
1599
|
+
|
|
1600
|
+
# Peek at first documents
|
|
1601
|
+
first_docs = collection.peek(limit=3)
|
|
1602
|
+
print(f"First documents: {first_docs}")
|
|
1603
|
+
```
|
|
1604
|
+
|
|
1605
|
+
### Performance Profiling
|
|
1606
|
+
|
|
1607
|
+
```python
|
|
1608
|
+
import time
|
|
1609
|
+
|
|
1610
|
+
def profile_query(collection, query_text: str, n_results: int = 10):
    """Run a query, print its wall-clock latency and hit count, return it."""
    start = time.time()
    results = collection.query(query_texts=[query_text], n_results=n_results)
    elapsed = time.time() - start

    report = (
        f"Query: {query_text}",
        f"Time: {elapsed:.3f}s",
        f"Results: {len(results['ids'][0])}",
    )
    for line in report:
        print(line)
    return results
|
|
1623
|
+
|
|
1624
|
+
# Usage
|
|
1625
|
+
profile_query(collection, "machine learning", n_results=100)
|
|
1626
|
+
```
|
|
1627
|
+
|
|
1628
|
+
---
|
|
1629
|
+
|
|
1630
|
+
## Migration and Backup
|
|
1631
|
+
|
|
1632
|
+
### Export Collection to JSON
|
|
1633
|
+
|
|
1634
|
+
```python
|
|
1635
|
+
import json
|
|
1636
|
+
|
|
1637
|
+
def export_collection(collection, output_file: str):
    """Dump every document (ids/documents/metadatas/embeddings) to a JSON file."""
    all_data = collection.get()

    # Pull the always-present fields; embeddings may be absent (-> None).
    snapshot = {key: all_data[key] for key in ("ids", "documents", "metadatas")}
    snapshot["embeddings"] = all_data.get("embeddings")

    with open(output_file, 'w') as sink:
        json.dump(snapshot, sink)

    print(f"Exported {len(all_data['ids'])} documents to {output_file}")
|
|
1652
|
+
|
|
1653
|
+
# Usage
|
|
1654
|
+
export_collection(collection, "backup.json")
|
|
1655
|
+
```
|
|
1656
|
+
|
|
1657
|
+
### Import Collection from JSON
|
|
1658
|
+
|
|
1659
|
+
```python
|
|
1660
|
+
import json
|
|
1661
|
+
|
|
1662
|
+
def import_collection(collection, input_file: str):
    """Load a JSON export produced by ``export_collection`` into *collection*."""
    with open(input_file, 'r') as source:
        import_data = json.load(source)

    payload = {
        "ids": import_data["ids"],
        "documents": import_data["documents"],
        "metadatas": import_data["metadatas"],
        # Older exports may lack embeddings entirely.
        "embeddings": import_data.get("embeddings"),
    }
    collection.add(**payload)

    print(f"Imported {len(import_data['ids'])} documents from {input_file}")
|
|
1675
|
+
|
|
1676
|
+
# Usage
|
|
1677
|
+
import_collection(collection, "backup.json")
|
|
1678
|
+
```
|
|
1679
|
+
|
|
1680
|
+
### Copy Collection
|
|
1681
|
+
|
|
1682
|
+
```python
|
|
1683
|
+
def copy_collection(source_collection, dest_collection, batch_size: int = 1000):
    """Replicate every document from *source_collection* into *dest_collection*.

    Data moves in batches of *batch_size* to bound memory use; a progress
    line is printed after each batch.
    """
    total = source_collection.count()
    wanted = ["documents", "metadatas", "embeddings"]

    for offset in range(0, total, batch_size):
        data = source_collection.get(limit=batch_size, offset=offset, include=wanted)
        dest_collection.add(
            ids=data["ids"],
            documents=data["documents"],
            metadatas=data["metadatas"],
            embeddings=data.get("embeddings"),
        )
        print(f"Copied {offset + len(data['ids'])}/{total} documents")
|
|
1702
|
+
|
|
1703
|
+
# Usage
|
|
1704
|
+
source = client.get_collection(name="original")
|
|
1705
|
+
dest = client.create_collection(name="backup")
|
|
1706
|
+
copy_collection(source, dest)
|
|
1707
|
+
```
|