voctar 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -4
- package/docs/API.md +10 -0
- package/docs/EMBEDDINGS.md +109 -0
- package/docs/README.md +1 -1
- package/docs/STORAGE_BACKENDS.md +67 -52
- package/package.json +9 -1
- package/docs/CUSTOM_PROVIDERS.md +0 -101
- /package/docs/assets/{vectar.png → voctar.png} +0 -0
package/README.md
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
<p align="center">
|
|
2
|
-
<img src="
|
|
2
|
+
<img src="https://github.com/marvinified/voctar/blob/e0ca3d3d1d609020e9139530aea9c8e60eca92ae/docs/assets/vectar.png" alt="Voctar logo" width="180" />
|
|
3
3
|
</p>
|
|
4
4
|
|
|
5
5
|
<h1 align="center">Voctar</h1>
|
|
6
6
|
|
|
7
7
|
<p align="center">
|
|
8
|
-
Simple TypeScript library with RAG primitives for embeddings, chunking, storage, and
|
|
8
|
+
Simple TypeScript library with RAG primitives for embeddings, chunking, storage, and retrieval.
|
|
9
9
|
</p>
|
|
10
10
|
|
|
11
11
|
<p align="center">
|
|
@@ -16,11 +16,10 @@
|
|
|
16
16
|
</p>
|
|
17
17
|
|
|
18
18
|
## Features
|
|
19
|
-
|
|
19
|
+
- Simple primitives: `embed` and `search`
|
|
20
20
|
- Supports multiple vector stores: SQLite, Qdrant, in-memory, or custom store providers
|
|
21
21
|
- Automatic chunking for long documents with multiple strategies (`fixed`, `recursive`, `sentence`, `paragraph`, `semantic`)
|
|
22
22
|
- Semantic search with score thresholds and metadata filtering
|
|
23
|
-
- Simple primitives: `embed`, `search` and more
|
|
24
23
|
- TypeScript-first.
|
|
25
24
|
|
|
26
25
|
## Quick Start
|
|
@@ -97,6 +96,7 @@ Each result includes:
|
|
|
97
96
|
## Documentation
|
|
98
97
|
|
|
99
98
|
- [Docs Index](./docs/README.md)
|
|
99
|
+
- [Embeddings](./docs/EMBEDDINGS.md)
|
|
100
100
|
- [Storage Backends](./docs/STORAGE_BACKENDS.md)
|
|
101
101
|
- [Chunking](./docs/CHUNKING.md)
|
|
102
102
|
|
package/docs/API.md
CHANGED
|
@@ -280,6 +280,16 @@ type RuntimeEmbeddingConfig =
|
|
|
280
280
|
};
|
|
281
281
|
```
|
|
282
282
|
|
|
283
|
+
The built-in OpenAI provider defaults to:
|
|
284
|
+
|
|
285
|
+
- `model`: `text-embedding-3-small`
|
|
286
|
+
- `dimension`: `1536`
|
|
287
|
+
- `maxRetries`: `3`
|
|
288
|
+
|
|
289
|
+
Set `model` to any OpenAI embedding model supported by your OpenAI account. Set `dimension` when the model supports configurable embedding dimensions or when your vector store collection expects a specific dimension. A collection can only contain vectors of a single, fixed dimension, so changing `model` or `dimension` usually requires a new collection.
|
|
290
|
+
|
|
291
|
+
Use `{ type: 'custom', provider }` for local models, hosted non-OpenAI models, or any other embedding service. Custom providers must implement `EmbeddingProvider`.
|
|
292
|
+
|
|
283
293
|
### `RuntimeStoreConfig`
|
|
284
294
|
|
|
285
295
|
```typescript
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# Voctar Embeddings
|
|
2
|
+
|
|
3
|
+
This guide covers embedding model configuration in Voctar.
|
|
4
|
+
|
|
5
|
+
Voctar is config-first:
|
|
6
|
+
|
|
7
|
+
- your app chooses the embedding provider,
|
|
8
|
+
- your app reads env vars or secrets,
|
|
9
|
+
- your app passes explicit config to `new Voctar(...)`.
|
|
10
|
+
|
|
11
|
+
## Available Providers
|
|
12
|
+
|
|
13
|
+
Voctar supports:
|
|
14
|
+
|
|
15
|
+
- `openai`
|
|
16
|
+
- `custom`
|
|
17
|
+
|
|
18
|
+
## OpenAI Provider
|
|
19
|
+
|
|
20
|
+
The built-in OpenAI provider is the default path for most apps.
|
|
21
|
+
|
|
22
|
+
Defaults:
|
|
23
|
+
|
|
24
|
+
- `model`: `text-embedding-3-small`
|
|
25
|
+
- `dimension`: `1536`
|
|
26
|
+
- `maxRetries`: `3`
|
|
27
|
+
|
|
28
|
+
Example:
|
|
29
|
+
|
|
30
|
+
```typescript
|
|
31
|
+
import { Voctar } from 'voctar';
|
|
32
|
+
|
|
33
|
+
const vector = new Voctar({
|
|
34
|
+
embedding: {
|
|
35
|
+
type: 'openai',
|
|
36
|
+
apiKey: process.env.OPENAI_API_KEY!,
|
|
37
|
+
model: 'text-embedding-3-small',
|
|
38
|
+
dimension: 1536,
|
|
39
|
+
},
|
|
40
|
+
store: {
|
|
41
|
+
type: 'sqlite',
|
|
42
|
+
path: './data/vector.db',
|
|
43
|
+
},
|
|
44
|
+
});
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
You can pass any OpenAI embedding model supported by the OpenAI API. If the model supports configurable embedding dimensions, set `dimension` to the vector size you want to store.
|
|
48
|
+
|
|
49
|
+
## Model and Dimension Notes
|
|
50
|
+
|
|
51
|
+
The embedding dimension must match the vector store collection dimension. Existing collections cannot mix vectors with different dimensions, so changing `model` or `dimension` usually requires a new collection.
|
|
52
|
+
|
|
53
|
+
Voctar uses the provider token limit to decide when documents should be chunked automatically. The built-in OpenAI provider uses:
|
|
54
|
+
|
|
55
|
+
- `8192` tokens for `text-embedding-3-small` and `text-embedding-3-large`
|
|
56
|
+
- `8191` tokens for `text-embedding-ada-002`
|
|
57
|
+
- `8192` tokens for other OpenAI embedding model names
|
|
58
|
+
|
|
59
|
+
## Custom Embedding Provider
|
|
60
|
+
|
|
61
|
+
Use a custom embedding provider for local models, hosted non-OpenAI models, or any embedding service with your own client.
|
|
62
|
+
|
|
63
|
+
Example:
|
|
64
|
+
|
|
65
|
+
```typescript
|
|
66
|
+
import { Voctar, type EmbeddingProvider } from 'voctar';
|
|
67
|
+
|
|
68
|
+
class MyEmbeddingProvider implements EmbeddingProvider {
|
|
69
|
+
async embed(text: string): Promise<number[]> {
|
|
70
|
+
// Return one embedding vector for one text.
|
|
71
|
+
return [/* ... */];
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
async embedBatch(texts: string[]): Promise<number[][]> {
|
|
75
|
+
// Return one vector per input text in the same order.
|
|
76
|
+
return texts.map(() => [/* ... */]);
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
getDimension(): number {
|
|
80
|
+
return 1536;
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
getModelName(): string {
|
|
84
|
+
return 'my-embedding-model';
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
getTokenLimit(): number {
|
|
88
|
+
return 8192;
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
const vector = new Voctar({
|
|
93
|
+
embedding: {
|
|
94
|
+
type: 'custom',
|
|
95
|
+
provider: new MyEmbeddingProvider(),
|
|
96
|
+
},
|
|
97
|
+
store: {
|
|
98
|
+
type: 'sqlite',
|
|
99
|
+
path: './data/vector.db',
|
|
100
|
+
},
|
|
101
|
+
});
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Integration tips:
|
|
105
|
+
|
|
106
|
+
- Keep `embedBatch()` output order stable with input order.
|
|
107
|
+
- Ensure `getDimension()` matches vectors returned by `embed()` and `embedBatch()`.
|
|
108
|
+
- Return a realistic `getTokenLimit()` so automatic chunking can split long documents before embedding.
|
|
109
|
+
- Normalize errors with useful messages so callers can debug provider failures quickly.
|
package/docs/README.md
CHANGED
|
@@ -5,7 +5,7 @@ The canonical getting-started guide now lives in the root [`README.md`](../READM
|
|
|
5
5
|
Use this folder for focused topics:
|
|
6
6
|
|
|
7
7
|
- [API Reference](./API.md)
|
|
8
|
-
- [
|
|
8
|
+
- [Embeddings](./EMBEDDINGS.md)
|
|
9
9
|
- [Storage Backends](./STORAGE_BACKENDS.md)
|
|
10
10
|
- [Chunking](./CHUNKING.md)
|
|
11
11
|
|
package/docs/STORAGE_BACKENDS.md
CHANGED
|
@@ -2,12 +2,6 @@
|
|
|
2
2
|
|
|
3
3
|
This guide covers the available storage backends in Voctar and when to use each one.
|
|
4
4
|
|
|
5
|
-
Voctar is config-first:
|
|
6
|
-
|
|
7
|
-
- your app chooses the backend,
|
|
8
|
-
- your app reads env vars (if any),
|
|
9
|
-
- your app passes explicit config to `new Vectar(...)`.
|
|
10
|
-
|
|
11
5
|
## Available Backends
|
|
12
6
|
|
|
13
7
|
Voctar supports:
|
|
@@ -19,9 +13,9 @@ Voctar supports:
|
|
|
19
13
|
|
|
20
14
|
## Quick Selection Guide
|
|
21
15
|
|
|
16
|
+
- Use `memory` for tests and short-lived demos only.
|
|
22
17
|
- Use `sqlite` for local dev and simple production workloads.
|
|
23
18
|
- Use `qdrant` for larger datasets, higher throughput, or multi-instance deployments.
|
|
24
|
-
- Use `memory` for tests and short-lived demos only.
|
|
25
19
|
- Use `custom` when integrating an internal or third-party vector store.
|
|
26
20
|
|
|
27
21
|
## SQLite Backend
|
|
@@ -70,24 +64,17 @@ store: {
|
|
|
70
64
|
}
|
|
71
65
|
```
|
|
72
66
|
|
|
73
|
-
## Qdrant Backend
|
|
67
|
+
## In-Memory Backend
|
|
74
68
|
|
|
75
69
|
Best for:
|
|
76
70
|
|
|
77
|
-
-
|
|
78
|
-
-
|
|
79
|
-
- distributed deployments.
|
|
80
|
-
|
|
81
|
-
Pros:
|
|
82
|
-
|
|
83
|
-
- purpose-built vector DB,
|
|
84
|
-
- strong scale characteristics,
|
|
85
|
-
- rich filtering support.
|
|
71
|
+
- unit tests,
|
|
72
|
+
- quick local examples.
|
|
86
73
|
|
|
87
74
|
Trade-offs:
|
|
88
75
|
|
|
89
|
-
-
|
|
90
|
-
-
|
|
76
|
+
- data is lost on restart,
|
|
77
|
+
- unsuitable for production persistence.
|
|
91
78
|
|
|
92
79
|
Example:
|
|
93
80
|
|
|
@@ -100,27 +87,29 @@ const vector = new Voctar({
|
|
|
100
87
|
apiKey: process.env.OPENAI_API_KEY!,
|
|
101
88
|
},
|
|
102
89
|
store: {
|
|
103
|
-
type: 'qdrant',
|
|
104
|
-
url: process.env.QDRANT_URL!,
|
|
105
|
-
port: process.env.QDRANT_PORT ? Number(process.env.QDRANT_PORT) : 6333,
|
|
106
|
-
apiKey: process.env.QDRANT_API_KEY || undefined,
|
|
107
|
-
timeout: 30000,
|
|
108
|
-
checkCompatibility: false,
|
|
90
|
+
type: 'memory',
|
|
109
91
|
},
|
|
110
92
|
});
|
|
111
93
|
```
|
|
112
94
|
|
|
113
|
-
## In-Memory Backend
|
|
95
|
+
## Qdrant Backend
|
|
114
96
|
|
|
115
97
|
Best for:
|
|
116
98
|
|
|
117
|
-
-
|
|
118
|
-
-
|
|
99
|
+
- medium and large datasets,
|
|
100
|
+
- high query volume,
|
|
101
|
+
- distributed deployments.
|
|
102
|
+
|
|
103
|
+
Pros:
|
|
104
|
+
|
|
105
|
+
- purpose-built vector DB,
|
|
106
|
+
- strong scale characteristics,
|
|
107
|
+
- rich filtering support.
|
|
119
108
|
|
|
120
109
|
Trade-offs:
|
|
121
110
|
|
|
122
|
-
-
|
|
123
|
-
-
|
|
111
|
+
- extra service to operate,
|
|
112
|
+
- network hop adds operational complexity.
|
|
124
113
|
|
|
125
114
|
Example:
|
|
126
115
|
|
|
@@ -133,11 +122,17 @@ const vector = new Voctar({
|
|
|
133
122
|
apiKey: process.env.OPENAI_API_KEY!,
|
|
134
123
|
},
|
|
135
124
|
store: {
|
|
136
|
-
type: 'memory',
|
|
125
|
+
type: 'qdrant',
|
|
126
|
+
url: process.env.QDRANT_URL!,
|
|
127
|
+
port: process.env.QDRANT_PORT ? Number(process.env.QDRANT_PORT) : 6333,
|
|
128
|
+
apiKey: process.env.QDRANT_API_KEY || undefined,
|
|
129
|
+
timeout: 30000,
|
|
130
|
+
checkCompatibility: false,
|
|
137
131
|
},
|
|
138
132
|
});
|
|
139
133
|
```
|
|
140
134
|
|
|
135
|
+
|
|
141
136
|
## Custom Backend
|
|
142
137
|
|
|
143
138
|
Use this when you need full control over storage behavior.
|
|
@@ -161,29 +156,49 @@ const vector = new Voctar({
|
|
|
161
156
|
});
|
|
162
157
|
```
|
|
163
158
|
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
## Environment Variable Pattern (App-Owned)
|
|
167
|
-
|
|
168
|
-
Voctar does not auto-load env vars, but many apps use a selector like this:
|
|
169
|
-
|
|
170
|
-
```bash
|
|
171
|
-
VECTOR_STORE=sqlite # sqlite | qdrant | memory
|
|
172
|
-
SQLITE_PATH=./data/vector.db
|
|
173
|
-
QDRANT_URL=http://localhost
|
|
174
|
-
QDRANT_PORT=6333
|
|
175
|
-
QDRANT_API_KEY=your_api_key
|
|
176
|
-
```
|
|
177
|
-
|
|
178
|
-
Then in app bootstrap:
|
|
159
|
+
Full interface example:
|
|
179
160
|
|
|
180
161
|
```typescript
|
|
181
|
-
|
|
162
|
+
import type {
|
|
163
|
+
CollectionConfig,
|
|
164
|
+
SearchOptions,
|
|
165
|
+
SearchResult,
|
|
166
|
+
VectorPoint,
|
|
167
|
+
VectorStoreProvider,
|
|
168
|
+
} from 'voctar';
|
|
169
|
+
|
|
170
|
+
export class MyVectorStoreProvider implements VectorStoreProvider {
|
|
171
|
+
async ensureCollection(name: string, dimension: number, config?: CollectionConfig): Promise<void> {
|
|
172
|
+
// Create collection/index if missing.
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
async upsert(collection: string, points: VectorPoint[]): Promise<void> {
|
|
176
|
+
// Insert or update vectors.
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
async search(collection: string, vector: number[], options: SearchOptions): Promise<SearchResult[]> {
|
|
180
|
+
// Return scored results in descending relevance.
|
|
181
|
+
return [];
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
async delete(collection: string, ids: string[]): Promise<void> {
|
|
185
|
+
// Delete matching IDs.
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
async deleteCollection(collection: string): Promise<void> {
|
|
189
|
+
// Drop collection/index.
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
async getIdsByFilter(collection: string, filter: Record<string, any>, limit?: number): Promise<string[]> {
|
|
193
|
+
// Return IDs that match filter.
|
|
194
|
+
return [];
|
|
195
|
+
}
|
|
196
|
+
}
|
|
182
197
|
```
|
|
183
198
|
|
|
184
|
-
|
|
199
|
+
Integration tips:
|
|
185
200
|
|
|
186
|
-
-
|
|
187
|
-
-
|
|
188
|
-
-
|
|
189
|
-
-
|
|
201
|
+
- Ensure `ensureCollection()` respects the embedding provider dimension.
|
|
202
|
+
- Implement filter behavior consistently in `search()` and `getIdsByFilter()`.
|
|
203
|
+
- Return search results in descending relevance order.
|
|
204
|
+
- Normalize storage errors with useful messages so callers can debug quickly.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "voctar",
|
|
3
|
-
"version": "0.1.0",
|
|
3
|
+
"version": "0.1.2",
|
|
4
4
|
"description": "TypeScript library with RAG primitives for vector embeddings, chunking, storing and retrieval.",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"types": "dist/index.d.ts",
|
|
@@ -26,6 +26,14 @@
|
|
|
26
26
|
"qdrant",
|
|
27
27
|
"sqlite"
|
|
28
28
|
],
|
|
29
|
+
"repository": {
|
|
30
|
+
"type": "git",
|
|
31
|
+
"url": "git+https://github.com/marvinified/voctar.git"
|
|
32
|
+
},
|
|
33
|
+
"bugs": {
|
|
34
|
+
"url": "https://github.com/marvinified/voctar/issues"
|
|
35
|
+
},
|
|
36
|
+
"homepage": "https://github.com/marvinified/voctar#readme",
|
|
29
37
|
"license": "MIT",
|
|
30
38
|
"engines": {
|
|
31
39
|
"node": ">=18"
|
package/docs/CUSTOM_PROVIDERS.md
DELETED
|
@@ -1,101 +0,0 @@
|
|
|
1
|
-
# Custom Providers
|
|
2
|
-
|
|
3
|
-
Voctar supports custom providers for embeddings and storage.
|
|
4
|
-
|
|
5
|
-
## Use Custom Providers
|
|
6
|
-
|
|
7
|
-
```typescript
|
|
8
|
-
import { Voctar } from 'voctar';
|
|
9
|
-
|
|
10
|
-
const vector = new Voctar({
|
|
11
|
-
embedding: {
|
|
12
|
-
type: 'custom',
|
|
13
|
-
provider: myEmbeddingProvider,
|
|
14
|
-
},
|
|
15
|
-
store: {
|
|
16
|
-
type: 'custom',
|
|
17
|
-
provider: myVectorStoreProvider,
|
|
18
|
-
},
|
|
19
|
-
});
|
|
20
|
-
```
|
|
21
|
-
|
|
22
|
-
## Custom Embedding Provider
|
|
23
|
-
|
|
24
|
-
Implement the `EmbeddingProvider` interface:
|
|
25
|
-
|
|
26
|
-
```typescript
|
|
27
|
-
import type { EmbeddingProvider } from 'voctar';
|
|
28
|
-
|
|
29
|
-
export class MyEmbeddingProvider implements EmbeddingProvider {
|
|
30
|
-
async embed(text: string): Promise<number[]> {
|
|
31
|
-
// Return one embedding vector for one text
|
|
32
|
-
return [/* ... */];
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
async embedBatch(texts: string[]): Promise<number[][]> {
|
|
36
|
-
// Return one vector per input text (same order)
|
|
37
|
-
return texts.map(() => [/* ... */]);
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
getDimension(): number {
|
|
41
|
-
return 1536;
|
|
42
|
-
}
|
|
43
|
-
|
|
44
|
-
getModelName(): string {
|
|
45
|
-
return 'my-embedding-model';
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
getTokenLimit(): number {
|
|
49
|
-
return 8192;
|
|
50
|
-
}
|
|
51
|
-
}
|
|
52
|
-
```
|
|
53
|
-
|
|
54
|
-
## Custom Store Provider
|
|
55
|
-
|
|
56
|
-
Implement the `VectorStoreProvider` interface:
|
|
57
|
-
|
|
58
|
-
```typescript
|
|
59
|
-
import type {
|
|
60
|
-
VectorStoreProvider,
|
|
61
|
-
VectorPoint,
|
|
62
|
-
SearchOptions,
|
|
63
|
-
SearchResult,
|
|
64
|
-
CollectionConfig,
|
|
65
|
-
} from 'voctar';
|
|
66
|
-
|
|
67
|
-
export class MyVectorStoreProvider implements VectorStoreProvider {
|
|
68
|
-
async ensureCollection(name: string, dimension: number, config?: CollectionConfig): Promise<void> {
|
|
69
|
-
// Create collection/index if missing
|
|
70
|
-
}
|
|
71
|
-
|
|
72
|
-
async upsert(collection: string, points: VectorPoint[]): Promise<void> {
|
|
73
|
-
// Insert or update vectors
|
|
74
|
-
}
|
|
75
|
-
|
|
76
|
-
async search(collection: string, vector: number[], options: SearchOptions): Promise<SearchResult[]> {
|
|
77
|
-
// Return scored results in descending relevance
|
|
78
|
-
return [];
|
|
79
|
-
}
|
|
80
|
-
|
|
81
|
-
async delete(collection: string, ids: string[]): Promise<void> {
|
|
82
|
-
// Delete matching IDs
|
|
83
|
-
}
|
|
84
|
-
|
|
85
|
-
async deleteCollection(collection: string): Promise<void> {
|
|
86
|
-
// Drop collection/index
|
|
87
|
-
}
|
|
88
|
-
|
|
89
|
-
async getIdsByFilter(collection: string, filter: Record<string, any>, limit?: number): Promise<string[]> {
|
|
90
|
-
// Return IDs that match filter
|
|
91
|
-
return [];
|
|
92
|
-
}
|
|
93
|
-
}
|
|
94
|
-
```
|
|
95
|
-
|
|
96
|
-
## Integration Tips
|
|
97
|
-
|
|
98
|
-
- Keep `embedBatch()` order stable with input order.
|
|
99
|
-
- Ensure `getDimension()` matches vectors returned by `embed()`/`embedBatch()`.
|
|
100
|
-
- Normalize errors with useful messages so callers can debug quickly.
|
|
101
|
-
- Implement filter behavior consistently in `search()` and `getIdsByFilter()`.
|
|
File without changes
|