voctar 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -5,7 +5,7 @@
5
5
  <h1 align="center">Voctar</h1>
6
6
 
7
7
  <p align="center">
8
- Simple TypeScript library with RAG primitives for embeddings, chunking, storage, and semantic retrieval.
8
+ Simple TypeScript library with RAG primitives for embeddings, chunking, storage, and retrieval.
9
9
  </p>
10
10
 
11
11
  <p align="center">
@@ -16,11 +16,10 @@
16
16
  </p>
17
17
 
18
18
  ## Features
19
-
19
+ - Simple primitives: `embed` and `search`
20
20
  - Supports multiple vector stores: SQLite, Qdrant, in-memory, or custom store providers
21
21
  - Automatic chunking for long documents with multiple strategies (`fixed`, `recursive`, `sentence`, `paragraph`, `semantic`)
22
22
  - Semantic search with score thresholds and metadata filtering
23
- - Simple primitives: `embed`, `search` and more
24
23
  - TypeScript-first.
25
24
 
26
25
  ## Quick Start
@@ -97,6 +96,7 @@ Each result includes:
97
96
  ## Documentation
98
97
 
99
98
  - [Docs Index](./docs/README.md)
99
+ - [Embeddings](./docs/EMBEDDINGS.md)
100
100
  - [Storage Backends](./docs/STORAGE_BACKENDS.md)
101
101
  - [Chunking](./docs/CHUNKING.md)
102
102
 
package/docs/API.md CHANGED
@@ -280,6 +280,16 @@ type RuntimeEmbeddingConfig =
280
280
  };
281
281
  ```
282
282
 
283
+ The built-in OpenAI provider defaults to:
284
+
285
+ - `model`: `text-embedding-3-small`
286
+ - `dimension`: `1536`
287
+ - `maxRetries`: `3`
288
+
289
+ Set `model` to any OpenAI embedding model supported by your OpenAI account. Set `dimension` when the model supports configurable embedding dimensions or when your vector store collection expects a specific dimension. A collection can only contain vectors of a single dimension, so changing `model` or `dimension` usually requires a new collection.
290
+
291
+ Use `{ type: 'custom', provider }` for local models, hosted non-OpenAI models, or any other embedding service. Custom providers must implement `EmbeddingProvider`.
292
+
283
293
  ### `RuntimeStoreConfig`
284
294
 
285
295
  ```typescript
@@ -0,0 +1,109 @@
1
+ # Voctar Embeddings
2
+
3
+ This guide covers embedding model configuration in Voctar.
4
+
5
+ Voctar is config-first:
6
+
7
+ - your app chooses the embedding provider,
8
+ - your app reads env vars or secrets,
9
+ - your app passes explicit config to `new Voctar(...)`.
10
+
11
+ ## Available Providers
12
+
13
+ Voctar supports:
14
+
15
+ - `openai`
16
+ - `custom`
17
+
18
+ ## OpenAI Provider
19
+
20
+ The built-in OpenAI provider is the default path for most apps.
21
+
22
+ Defaults:
23
+
24
+ - `model`: `text-embedding-3-small`
25
+ - `dimension`: `1536`
26
+ - `maxRetries`: `3`
27
+
28
+ Example:
29
+
30
+ ```typescript
31
+ import { Voctar } from 'voctar';
32
+
33
+ const vector = new Voctar({
34
+ embedding: {
35
+ type: 'openai',
36
+ apiKey: process.env.OPENAI_API_KEY!,
37
+ model: 'text-embedding-3-small',
38
+ dimension: 1536,
39
+ },
40
+ store: {
41
+ type: 'sqlite',
42
+ path: './data/vector.db',
43
+ },
44
+ });
45
+ ```
46
+
47
+ You can pass any OpenAI embedding model supported by the OpenAI API. If the model supports configurable embedding dimensions, set `dimension` to the vector size you want to store.
48
+
49
+ ## Model and Dimension Notes
50
+
51
+ The embedding dimension must match the vector store collection dimension. Existing collections cannot mix vectors with different dimensions, so changing `model` or `dimension` usually requires a new collection.
52
+
53
+ Voctar uses the embedding provider's token limit to decide when documents should be chunked automatically. The built-in OpenAI provider uses:
54
+
55
+ - `8192` tokens for `text-embedding-3-small` and `text-embedding-3-large`
56
+ - `8191` tokens for `text-embedding-ada-002`
57
+ - `8192` tokens for other OpenAI embedding model names
58
+
59
+ ## Custom Embedding Provider
60
+
61
+ Use a custom embedding provider for local models, hosted non-OpenAI models, or any other embedding service that you call through your own client.
62
+
63
+ Example:
64
+
65
+ ```typescript
66
+ import { Voctar, type EmbeddingProvider } from 'voctar';
67
+
68
+ class MyEmbeddingProvider implements EmbeddingProvider {
69
+ async embed(text: string): Promise<number[]> {
70
+ // Return one embedding vector for one text.
71
+ return [/* ... */];
72
+ }
73
+
74
+ async embedBatch(texts: string[]): Promise<number[][]> {
75
+ // Return one vector per input text in the same order.
76
+ return texts.map(() => [/* ... */]);
77
+ }
78
+
79
+ getDimension(): number {
80
+ return 1536;
81
+ }
82
+
83
+ getModelName(): string {
84
+ return 'my-embedding-model';
85
+ }
86
+
87
+ getTokenLimit(): number {
88
+ return 8192;
89
+ }
90
+ }
91
+
92
+ const vector = new Voctar({
93
+ embedding: {
94
+ type: 'custom',
95
+ provider: new MyEmbeddingProvider(),
96
+ },
97
+ store: {
98
+ type: 'sqlite',
99
+ path: './data/vector.db',
100
+ },
101
+ });
102
+ ```
103
+
104
+ Integration tips:
105
+
106
+ - Keep `embedBatch()` output order stable with input order.
107
+ - Ensure `getDimension()` matches vectors returned by `embed()` and `embedBatch()`.
108
+ - Return a realistic `getTokenLimit()` so automatic chunking can split long documents before embedding.
109
+ - Normalize errors with useful messages so callers can debug provider failures quickly.
package/docs/README.md CHANGED
@@ -5,7 +5,7 @@ The canonical getting-started guide now lives in the root [`README.md`](../READM
5
5
  Use this folder for focused topics:
6
6
 
7
7
  - [API Reference](./API.md)
8
- - [Custom Providers](./CUSTOM_PROVIDERS.md)
8
+ - [Embeddings](./EMBEDDINGS.md)
9
9
  - [Storage Backends](./STORAGE_BACKENDS.md)
10
10
  - [Chunking](./CHUNKING.md)
11
11
 
@@ -2,12 +2,6 @@
2
2
 
3
3
  This guide covers the available storage backends in Voctar and when to use each one.
4
4
 
5
- Voctar is config-first:
6
-
7
- - your app chooses the backend,
8
- - your app reads env vars (if any),
9
- - your app passes explicit config to `new Voctar(...)`.
10
-
11
5
  ## Available Backends
12
6
 
13
7
  Voctar supports:
@@ -19,9 +13,9 @@ Voctar supports:
19
13
 
20
14
  ## Quick Selection Guide
21
15
 
16
+ - Use `memory` for tests and short-lived demos only.
22
17
  - Use `sqlite` for local dev and simple production workloads.
23
18
  - Use `qdrant` for larger datasets, higher throughput, or multi-instance deployments.
24
- - Use `memory` for tests and short-lived demos only.
25
19
  - Use `custom` when integrating an internal or third-party vector store.
26
20
 
27
21
  ## SQLite Backend
@@ -70,24 +64,17 @@ store: {
70
64
  }
71
65
  ```
72
66
 
73
- ## Qdrant Backend
67
+ ## In-Memory Backend
74
68
 
75
69
  Best for:
76
70
 
77
- - medium and large datasets,
78
- - high query volume,
79
- - distributed deployments.
80
-
81
- Pros:
82
-
83
- - purpose-built vector DB,
84
- - strong scale characteristics,
85
- - rich filtering support.
71
+ - unit tests,
72
+ - quick local examples.
86
73
 
87
74
  Trade-offs:
88
75
 
89
- - extra service to operate,
90
- - network hop adds operational complexity.
76
+ - data is lost on restart,
77
+ - unsuitable for production persistence.
91
78
 
92
79
  Example:
93
80
 
@@ -100,27 +87,29 @@ const vector = new Voctar({
100
87
  apiKey: process.env.OPENAI_API_KEY!,
101
88
  },
102
89
  store: {
103
- type: 'qdrant',
104
- url: process.env.QDRANT_URL!,
105
- port: process.env.QDRANT_PORT ? Number(process.env.QDRANT_PORT) : 6333,
106
- apiKey: process.env.QDRANT_API_KEY || undefined,
107
- timeout: 30000,
108
- checkCompatibility: false,
90
+ type: 'memory',
109
91
  },
110
92
  });
111
93
  ```
112
94
 
113
- ## In-Memory Backend
95
+ ## Qdrant Backend
114
96
 
115
97
  Best for:
116
98
 
117
- - unit tests,
118
- - quick local examples.
99
+ - medium and large datasets,
100
+ - high query volume,
101
+ - distributed deployments.
102
+
103
+ Pros:
104
+
105
+ - purpose-built vector DB,
106
+ - strong scale characteristics,
107
+ - rich filtering support.
119
108
 
120
109
  Trade-offs:
121
110
 
122
- - data is lost on restart,
123
- - unsuitable for production persistence.
111
+ - extra service to operate,
112
+ - network hop adds operational complexity.
124
113
 
125
114
  Example:
126
115
 
@@ -133,11 +122,17 @@ const vector = new Voctar({
133
122
  apiKey: process.env.OPENAI_API_KEY!,
134
123
  },
135
124
  store: {
136
- type: 'memory',
125
+ type: 'qdrant',
126
+ url: process.env.QDRANT_URL!,
127
+ port: process.env.QDRANT_PORT ? Number(process.env.QDRANT_PORT) : 6333,
128
+ apiKey: process.env.QDRANT_API_KEY || undefined,
129
+ timeout: 30000,
130
+ checkCompatibility: false,
137
131
  },
138
132
  });
139
133
  ```
140
134
 
135
+
141
136
  ## Custom Backend
142
137
 
143
138
  Use this when you need full control over storage behavior.
@@ -161,29 +156,49 @@ const vector = new Voctar({
161
156
  });
162
157
  ```
163
158
 
164
- See [`CUSTOM_PROVIDERS.md`](./CUSTOM_PROVIDERS.md) for full interface details.
165
-
166
- ## Environment Variable Pattern (App-Owned)
167
-
168
- Voctar does not auto-load env vars, but many apps use a selector like this:
169
-
170
- ```bash
171
- VECTOR_STORE=sqlite # sqlite | qdrant | memory
172
- SQLITE_PATH=./data/vector.db
173
- QDRANT_URL=http://localhost
174
- QDRANT_PORT=6333
175
- QDRANT_API_KEY=your_api_key
176
- ```
177
-
178
- Then in app bootstrap:
159
+ Full interface example:
179
160
 
180
161
  ```typescript
181
- const storeType = process.env.VECTOR_STORE ?? 'sqlite';
162
+ import type {
163
+ CollectionConfig,
164
+ SearchOptions,
165
+ SearchResult,
166
+ VectorPoint,
167
+ VectorStoreProvider,
168
+ } from 'voctar';
169
+
170
+ export class MyVectorStoreProvider implements VectorStoreProvider {
171
+ async ensureCollection(name: string, dimension: number, config?: CollectionConfig): Promise<void> {
172
+ // Create collection/index if missing.
173
+ }
174
+
175
+ async upsert(collection: string, points: VectorPoint[]): Promise<void> {
176
+ // Insert or update vectors.
177
+ }
178
+
179
+ async search(collection: string, vector: number[], options: SearchOptions): Promise<SearchResult[]> {
180
+ // Return scored results in descending relevance.
181
+ return [];
182
+ }
183
+
184
+ async delete(collection: string, ids: string[]): Promise<void> {
185
+ // Delete matching IDs.
186
+ }
187
+
188
+ async deleteCollection(collection: string): Promise<void> {
189
+ // Drop collection/index.
190
+ }
191
+
192
+ async getIdsByFilter(collection: string, filter: Record<string, any>, limit?: number): Promise<string[]> {
193
+ // Return IDs that match filter.
194
+ return [];
195
+ }
196
+ }
182
197
  ```
183
198
 
184
- ## Migration and Operations Notes
199
+ Integration tips:
185
200
 
186
- - Start with `sqlite` if you are early-stage.
187
- - Move to `qdrant` when dataset size, traffic, or deployment topology requires it.
188
- - Back up SQLite database files regularly.
189
- - For Qdrant, use snapshots/backups supported by your Qdrant setup.
201
+ - Ensure `ensureCollection()` respects the embedding provider dimension.
202
+ - Implement filter behavior consistently in `search()` and `getIdsByFilter()`.
203
+ - Return search results in descending relevance order.
204
+ - Normalize storage errors with useful messages so callers can debug quickly.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "voctar",
3
- "version": "0.1.1",
3
+ "version": "0.1.2",
4
4
  "description": "TypeScript library with RAG primitives for vector embeddings, chunking, storing and retrieval.",
5
5
  "main": "dist/index.js",
6
6
  "types": "dist/index.d.ts",
@@ -26,6 +26,14 @@
26
26
  "qdrant",
27
27
  "sqlite"
28
28
  ],
29
+ "repository": {
30
+ "type": "git",
31
+ "url": "git+https://github.com/marvinified/voctar.git"
32
+ },
33
+ "bugs": {
34
+ "url": "https://github.com/marvinified/voctar/issues"
35
+ },
36
+ "homepage": "https://github.com/marvinified/voctar#readme",
29
37
  "license": "MIT",
30
38
  "engines": {
31
39
  "node": ">=18"
package/docs/.DS_Store DELETED
Binary file
@@ -1,101 +0,0 @@
1
- # Custom Providers
2
-
3
- Voctar supports custom providers for embeddings and storage.
4
-
5
- ## Use Custom Providers
6
-
7
- ```typescript
8
- import { Voctar } from 'voctar';
9
-
10
- const vector = new Voctar({
11
- embedding: {
12
- type: 'custom',
13
- provider: myEmbeddingProvider,
14
- },
15
- store: {
16
- type: 'custom',
17
- provider: myVectorStoreProvider,
18
- },
19
- });
20
- ```
21
-
22
- ## Custom Embedding Provider
23
-
24
- Implement the `EmbeddingProvider` interface:
25
-
26
- ```typescript
27
- import type { EmbeddingProvider } from 'voctar';
28
-
29
- export class MyEmbeddingProvider implements EmbeddingProvider {
30
- async embed(text: string): Promise<number[]> {
31
- // Return one embedding vector for one text
32
- return [/* ... */];
33
- }
34
-
35
- async embedBatch(texts: string[]): Promise<number[][]> {
36
- // Return one vector per input text (same order)
37
- return texts.map(() => [/* ... */]);
38
- }
39
-
40
- getDimension(): number {
41
- return 1536;
42
- }
43
-
44
- getModelName(): string {
45
- return 'my-embedding-model';
46
- }
47
-
48
- getTokenLimit(): number {
49
- return 8192;
50
- }
51
- }
52
- ```
53
-
54
- ## Custom Store Provider
55
-
56
- Implement the `VectorStoreProvider` interface:
57
-
58
- ```typescript
59
- import type {
60
- VectorStoreProvider,
61
- VectorPoint,
62
- SearchOptions,
63
- SearchResult,
64
- CollectionConfig,
65
- } from 'voctar';
66
-
67
- export class MyVectorStoreProvider implements VectorStoreProvider {
68
- async ensureCollection(name: string, dimension: number, config?: CollectionConfig): Promise<void> {
69
- // Create collection/index if missing
70
- }
71
-
72
- async upsert(collection: string, points: VectorPoint[]): Promise<void> {
73
- // Insert or update vectors
74
- }
75
-
76
- async search(collection: string, vector: number[], options: SearchOptions): Promise<SearchResult[]> {
77
- // Return scored results in descending relevance
78
- return [];
79
- }
80
-
81
- async delete(collection: string, ids: string[]): Promise<void> {
82
- // Delete matching IDs
83
- }
84
-
85
- async deleteCollection(collection: string): Promise<void> {
86
- // Drop collection/index
87
- }
88
-
89
- async getIdsByFilter(collection: string, filter: Record<string, any>, limit?: number): Promise<string[]> {
90
- // Return IDs that match filter
91
- return [];
92
- }
93
- }
94
- ```
95
-
96
- ## Integration Tips
97
-
98
- - Keep `embedBatch()` order stable with input order.
99
- - Ensure `getDimension()` matches vectors returned by `embed()`/`embedBatch()`.
100
- - Normalize errors with useful messages so callers can debug quickly.
101
- - Implement filter behavior consistently in `search()` and `getIdsByFilter()`.