retriv 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +282 -0
- package/dist/_chunks/split-text.mjs +85 -0
- package/dist/db/cloudflare.d.mts +36 -0
- package/dist/db/cloudflare.mjs +55 -0
- package/dist/db/libsql.d.mts +30 -0
- package/dist/db/libsql.mjs +87 -0
- package/dist/db/pgvector.d.mts +30 -0
- package/dist/db/pgvector.mjs +80 -0
- package/dist/db/sqlite-fts.d.mts +23 -0
- package/dist/db/sqlite-fts.mjs +68 -0
- package/dist/db/sqlite-vec.d.mts +27 -0
- package/dist/db/sqlite-vec.mjs +108 -0
- package/dist/db/upstash.d.mts +28 -0
- package/dist/db/upstash.mjs +56 -0
- package/dist/embeddings/cohere.d.mts +28 -0
- package/dist/embeddings/cohere.mjs +39 -0
- package/dist/embeddings/google.d.mts +28 -0
- package/dist/embeddings/google.mjs +39 -0
- package/dist/embeddings/mistral.d.mts +28 -0
- package/dist/embeddings/mistral.mjs +39 -0
- package/dist/embeddings/ollama.d.mts +26 -0
- package/dist/embeddings/ollama.mjs +37 -0
- package/dist/embeddings/openai.d.mts +28 -0
- package/dist/embeddings/openai.mjs +39 -0
- package/dist/embeddings/resolve.d.mts +10 -0
- package/dist/embeddings/resolve.mjs +4 -0
- package/dist/embeddings/transformers.d.mts +24 -0
- package/dist/embeddings/transformers.mjs +26 -0
- package/dist/index.d.mts +3 -0
- package/dist/index.mjs +2 -0
- package/dist/retriv.d.mts +9 -0
- package/dist/retriv.mjs +112 -0
- package/dist/types.d.mts +203 -0
- package/dist/types.mjs +1 -0
- package/dist/utils/split-text.d.mts +23 -0
- package/dist/utils/split-text.mjs +2 -0
- package/package.json +167 -0
package/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2026-present Harlan Wilton

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
package/README.md
ADDED
@@ -0,0 +1,282 @@
<h1>retriv</h1>

[![npm version][npm-version-src]][npm-version-href]
[![npm downloads][npm-downloads-src]][npm-downloads-href]
[![License][license-src]][license-href]

Index and retrieve Markdown documents with [up to 30% better recall](https://ragaboutit.com/hybrid-retrieval-for-enterprise-rag-when-to-use-bm25-vectors-or-both/) using hybrid search.

Keyword search (BM25) finds exact matches but misses synonyms. Semantic search understands meaning but struggles with names, codes, and precise terminology. Hybrid search combines both using [Reciprocal Rank Fusion](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) - [research shows up to 5.8x improvement](https://www.researchgate.net/publication/399428523_Hybrid_Dense-Sparse_Retrieval_for_High-Recall_Information_Retrieval) on standard benchmarks.
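
Reciprocal Rank Fusion itself is simple: a document's fused score is the sum of `1 / (k + rank)` over every result list it appears in, so documents ranked by both BM25 and vectors bubble to the top. A minimal sketch (illustrative, not the package's internal implementation; `k = 60` is the constant from the RRF paper):

```ts
// Fuse ranked id lists (e.g. BM25 results and vector results) with RRF.
function rrf(rankings: string[][], k = 60): Map<string, number> {
  const scores = new Map<string, number>()
  for (const ranking of rankings) {
    ranking.forEach((id, rank) => {
      // rank is zero-based here, so the first hit contributes 1 / (k + 1)
      scores.set(id, (scores.get(id) ?? 0) + 1 / (k + rank + 1))
    })
  }
  return scores
}

rrf([['1', '2'], ['2', '3']]) // '2' appears in both lists, so it scores highest
```
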
<p align="center">
<table>
<tbody>
<td align="center">
<sub>Made possible by my <a href="https://github.com/sponsors/harlan-zw">Sponsor Program 💖</a><br> Follow me <a href="https://twitter.com/harlan_zw">@harlan_zw</a> 🐦 • Join <a href="https://discord.gg/275MBUBvgP">Discord</a> for help</sub><br>
</td>
</tbody>
</table>
</p>

## Features

- 🔀 **[Hybrid search](#local-first-sqlite)** - BM25 + vectors with [RRF fusion](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) in a single SQLite file
- 🔌 **[Swappable backends](#drivers)** - SQLite, LibSQL/Turso, pgvector, Upstash, Cloudflare Vectorize
- 🧠 **[Any embedding provider](#embedding-providers)** - OpenAI, Google, Mistral, Cohere, Ollama, or local [Transformers.js](https://huggingface.co/docs/transformers.js)
- ✂️ **[Automatic chunking](#with-chunking)** - Split large documents with configurable overlap
- 📦 **[Unified interface](#api)** - Same `SearchProvider` API across all drivers

## Installation

```bash
pnpm add retriv
```

## Usage

### Local-First (SQLite)

Single file with BM25 + vector search. No external services needed.

```bash
pnpm add @huggingface/transformers sqlite-vec
```

```ts
import { createRetriv } from 'retriv'
import sqlite from 'retriv/db/sqlite'
import { transformers } from 'retriv/embeddings/transformers'

const search = await createRetriv({
  driver: sqlite({
    path: './search.db',
    embeddings: transformers(), // runs locally, no API key
  }),
})

await search.index([
  {
    id: '1',
    content: 'How to mass delete Gmail emails using filters',
    metadata: { source: 'https://support.google.com/mail', title: 'Gmail Help' },
  },
  {
    id: '2',
    content: 'Setting up email forwarding rules in Outlook',
    metadata: { source: 'https://support.microsoft.com', title: 'Outlook Help' },
  },
])

const results = await search.search('bulk remove messages', { returnMetadata: true })
// Finds #1 via semantic similarity even without keyword overlap
// results[0].metadata.source → 'https://support.google.com/mail'
```

### Swap to Cloud Embeddings

Same hybrid driver, better embeddings:

```bash
pnpm add @ai-sdk/openai ai sqlite-vec
```

```ts
import { createRetriv } from 'retriv'
import sqlite from 'retriv/db/sqlite'
import { openai } from 'retriv/embeddings/openai'

const search = await createRetriv({
  driver: sqlite({
    path: './search.db',
    embeddings: openai(), // uses OPENAI_API_KEY env
  }),
})
```

### Swap to Cloud Vector DB

For serverless or edge deployments:

```bash
pnpm add @libsql/client better-sqlite3 @ai-sdk/openai ai
```

```ts
import { createRetriv } from 'retriv'
import libsql from 'retriv/db/libsql'
import sqliteFts from 'retriv/db/sqlite-fts'
import { openai } from 'retriv/embeddings/openai'

const search = await createRetriv({
  driver: {
    // Turso for vectors
    vector: libsql({
      url: 'libsql://your-db.turso.io',
      authToken: process.env.TURSO_AUTH_TOKEN,
      embeddings: openai(),
    }),
    // Local SQLite for BM25
    keyword: sqliteFts({ path: './search.db' }),
  },
})
```

### With Chunking

Automatically split large documents:

```bash
pnpm add @huggingface/transformers sqlite-vec
```

```ts
import { createRetriv } from 'retriv'
import sqlite from 'retriv/db/sqlite'
import { transformers } from 'retriv/embeddings/transformers'

const search = await createRetriv({
  driver: sqlite({
    path: './search.db',
    embeddings: transformers(),
  }),
  chunking: {
    chunkSize: 1000,
    chunkOverlap: 200,
  },
})

await search.index([
  { id: 'doc-1', content: veryLongArticle },
])

const results = await search.search('specific topic')
// Results include _chunk: { parentId, index, range }
```

## Drivers

### Hybrid (Recommended)

| Driver | Import | Peer Dependencies |
|--------|--------|-------------------|
| SQLite | `retriv/db/sqlite` | `sqlite-vec` (Node.js >= 22.5) |

### Vector-Only (for composed hybrid)

| Driver | Import | Peer Dependencies |
|--------|--------|-------------------|
| LibSQL | `retriv/db/libsql` | `@libsql/client` |
| Upstash | `retriv/db/upstash` | `@upstash/vector` |
| Cloudflare | `retriv/db/cloudflare` | — (uses Cloudflare bindings) |
| pgvector | `retriv/db/pgvector` | `pg` |
| sqlite-vec | `retriv/db/sqlite-vec` | `sqlite-vec` (Node.js >= 22.5) |

### Keyword-Only (for composed hybrid)

| Driver | Import | Peer Dependencies |
|--------|--------|-------------------|
| SQLite FTS5 | `retriv/db/sqlite-fts` | `better-sqlite3` |

## Embedding Providers

All vector drivers accept an `embeddings` config:

| Provider | Import | Peer Dependencies |
|----------|--------|-------------------|
| OpenAI | `retriv/embeddings/openai` | `@ai-sdk/openai ai` |
| Google | `retriv/embeddings/google` | `@ai-sdk/google ai` |
| Mistral | `retriv/embeddings/mistral` | `@ai-sdk/mistral ai` |
| Cohere | `retriv/embeddings/cohere` | `@ai-sdk/cohere ai` |
| Ollama | `retriv/embeddings/ollama` | `ollama-ai-provider-v2 ai` |
| Transformers | `retriv/embeddings/transformers` | `@huggingface/transformers` |

```ts
// Cloud providers (require API keys)
openai({ model: 'text-embedding-3-small' })
google({ model: 'text-embedding-004' })
mistral({ model: 'mistral-embed' })
cohere({ model: 'embed-english-v3.0' })

// Local (no API key)
ollama({ model: 'nomic-embed-text' })
transformers({ model: 'Xenova/all-MiniLM-L6-v2' })
```

## API

### SearchProvider Interface

All drivers implement the same interface:

```ts
interface SearchProvider {
  index: (docs: Document[]) => Promise<{ count: number }>
  search: (query: string, options?: SearchOptions) => Promise<SearchResult[]>
  remove?: (ids: string[]) => Promise<{ count: number }>
  clear?: () => Promise<void>
  close?: () => Promise<void>
}
```
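
The optional maintenance methods are not shown in the usage examples above; a minimal sketch of calling them on the provider returned by `createRetriv` (the optional-chaining guards are only needed for drivers that omit a method):

```ts
await search.index([{ id: 'doc-3', content: 'Draft notes to clean up later' }])

await search.remove?.(['doc-3']) // delete specific documents by id
await search.clear?.()           // wipe the whole index
await search.close?.()           // release the underlying database handles
```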

### Search Options

```ts
interface SearchOptions {
  limit?: number // Max results (default varies by driver)
  returnContent?: boolean // Include original content in results
  returnMetadata?: boolean // Include metadata in results
  returnMeta?: boolean // Include driver-specific _meta
}
```

### Search Result

```ts
interface SearchResult {
  id: string // Document ID
  score: number // 0-1, higher is better
  content?: string // If returnContent: true
  metadata?: Record<string, any> // If returnMetadata: true
  _chunk?: ChunkInfo // When chunking enabled
  _meta?: SearchMeta // If returnMeta: true (driver-specific extras)
}
```

## Benchmarks

Retrieval accuracy on Nuxt documentation (639 docs):

| Test Type | FTS | Vector | Hybrid |
|-----------|-----|--------|--------|
| Exact terminology (ports, config names) | 3/3 | 2/3 | 3/3 |
| Doc retrieval (keyword overlap) | 3/3 | 2/3 | 3/3 |
| Semantic queries (synonyms, no overlap) | 0/3 | 3/3 | 3/3 |
| **Total** | **6/9 (67%)** | **7/9 (78%)** | **9/9 (100%)** |

- **FTS** excels at exact terms but fails semantic queries ("reuse logic" → composables)
- **Vector** understands meaning but misses precise terminology ("port 3000")
- **Hybrid** combines both - never worse than either method alone

Run locally: `pnpm test:eval`

## Sponsors

<p align="center">
<a href="https://raw.githubusercontent.com/harlan-zw/static/main/sponsors.svg">
<img src='https://raw.githubusercontent.com/harlan-zw/static/main/sponsors.svg'/>
</a>
</p>

## License

Licensed under the [MIT license](https://github.com/harlan-zw/retriv/blob/main/LICENSE).

<!-- Badges -->
[npm-version-src]: https://img.shields.io/npm/v/retriv/latest.svg?style=flat&colorA=18181B&colorB=28CF8D
[npm-version-href]: https://npmjs.com/package/retriv

[npm-downloads-src]: https://img.shields.io/npm/dm/retriv.svg?style=flat&colorA=18181B&colorB=28CF8D
[npm-downloads-href]: https://npmjs.com/package/retriv

[license-src]: https://img.shields.io/github/license/harlan-zw/retriv.svg?style=flat&colorA=18181B&colorB=28CF8D
[license-href]: https://github.com/harlan-zw/retriv/blob/main/LICENSE
package/dist/_chunks/split-text.mjs
ADDED
@@ -0,0 +1,85 @@
const MARKDOWN_SEPARATORS = [
  "\n## ",
  "\n### ",
  "\n#### ",
  "\n##### ",
  "\n###### ",
  "```\n\n",
  "\n\n***\n\n",
  "\n\n---\n\n",
  "\n\n___\n\n",
  "\n\n",
  "\n",
  " ",
  ""
];
function offsetToLine(text, offset) {
  let line = 1;
  for (let i = 0; i < offset && i < text.length; i++) if (text[i] === "\n") line++;
  return line;
}
function splitText(text, options = {}) {
  const { chunkSize = 1e3, chunkOverlap = 200, separators = MARKDOWN_SEPARATORS } = options;
  if (text.length <= chunkSize) {
    const endLine = offsetToLine(text, text.length);
    return [{
      text,
      index: 0,
      range: [0, text.length],
      lines: [1, endLine]
    }];
  }
  return mergeChunks(splitRecursive(text, chunkSize, separators), chunkSize, chunkOverlap, text);
}
function splitRecursive(text, chunkSize, separators) {
  if (text.length <= chunkSize || separators.length === 0) return [text];
  const separator = separators.find((sep) => sep === "" || text.includes(sep));
  if (!separator && separator !== "") return [text];
  const parts = separator === "" ? [...text] : text.split(separator);
  const results = [];
  for (let i = 0; i < parts.length; i++) {
    const part = parts[i];
    const withSep = i < parts.length - 1 && separator !== "" ? part + separator : part;
    if (withSep.length <= chunkSize) results.push(withSep);
    else {
      const subParts = splitRecursive(withSep, chunkSize, separators.slice(1));
      results.push(...subParts);
    }
  }
  return results;
}
function mergeChunks(parts, chunkSize, chunkOverlap, originalText) {
  const chunks = [];
  let current = "";
  let currentStart = 0;
  for (const part of parts) if (current.length + part.length <= chunkSize) current += part;
  else {
    if (current) {
      const start = originalText.indexOf(current, currentStart);
      const actualStart = start >= 0 ? start : currentStart;
      const actualEnd = actualStart + current.length;
      chunks.push({
        text: current,
        index: chunks.length,
        range: [actualStart, actualEnd],
        lines: [offsetToLine(originalText, actualStart), offsetToLine(originalText, actualEnd)]
      });
      currentStart = Math.max(0, actualStart + current.length - chunkOverlap);
    }
    if (chunkOverlap > 0 && current.length > chunkOverlap) current = current.slice(-chunkOverlap) + part;
    else current = part;
  }
  if (current) {
    const start = originalText.indexOf(current, currentStart);
    const actualStart = start >= 0 ? start : currentStart;
    const actualEnd = start >= 0 ? start + current.length : originalText.length;
    chunks.push({
      text: current,
      index: chunks.length,
      range: [actualStart, actualEnd],
      lines: [offsetToLine(originalText, actualStart), offsetToLine(originalText, actualEnd)]
    });
  }
  return chunks;
}
export { splitText as t };
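
For orientation, a quick sketch of the chunk shape this helper produces, assuming `splitText` above is in scope (it is re-exported here as `t`; the package also ships `dist/utils/split-text.mjs`, presumably as the public wrapper):

```ts
const markdown = '# Guide\n\n## Install\n\n' + 'Step one. '.repeat(80)
  + '\n\n## Usage\n\n' + 'Step two. '.repeat(80)

// Splits on the Markdown separators above, then merges parts up to chunkSize
// with chunkOverlap characters carried between adjacent chunks.
const chunks = splitText(markdown, { chunkSize: 400, chunkOverlap: 50 })

// Each entry: { text, index, range: [startOffset, endOffset], lines: [startLine, endLine] }
console.log(chunks.map(c => [c.index, c.range, c.lines]))
```
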
package/dist/db/cloudflare.d.mts
ADDED
@@ -0,0 +1,36 @@
import { BaseDriverConfig, EmbeddingConfig, SearchProvider } from "../types.mjs";

//#region src/db/cloudflare.d.ts
interface VectorizeIndexBinding {
  query: (vector: number[], options?: any) => Promise<{
    matches: any[];
    count?: number;
  }>;
  insert: (vectors: any[]) => Promise<void>;
  upsert: (vectors: any[]) => Promise<void>;
  deleteByIds: (ids: string[]) => Promise<void>;
}
interface CloudflareConfig extends BaseDriverConfig {
  /** Cloudflare Vectorize binding instance */
  binding: VectorizeIndexBinding;
  /** Embedding provider from retriv/embeddings/ */
  embeddings: EmbeddingConfig;
}
/**
 * Create a Cloudflare Vectorize search provider
 * For use in Cloudflare Workers at runtime
 *
 * @example
 * ```ts
 * import { cloudflare } from 'retriv/db/cloudflare'
 * import { openai } from 'retriv/embeddings/openai'
 *
 * const db = await cloudflare({
 *   binding: env.VECTORIZE,
 *   embeddings: openai({ model: 'text-embedding-3-small' }),
 * })
 * ```
 */
declare function cloudflare(config: CloudflareConfig): Promise<SearchProvider>;
//#endregion
export { CloudflareConfig, cloudflare, cloudflare as default };
package/dist/db/cloudflare.mjs
ADDED
@@ -0,0 +1,55 @@
import { resolveEmbedding } from "../embeddings/resolve.mjs";
async function cloudflare(config) {
  const { binding } = config;
  if (!binding) throw new Error("[cloudflare] binding is required");
  if (!config.embeddings) throw new Error("[cloudflare] embeddings is required");
  const { embedder } = await resolveEmbedding(config.embeddings);
  return {
    async index(docs) {
      if (docs.length === 0) return { count: 0 };
      const embeddings = await embedder(docs.map((d) => d.content));
      if (embeddings.length !== docs.length) throw new Error(`Embedding count mismatch: expected ${docs.length}, got ${embeddings.length}`);
      const vectors = docs.map((doc, i) => ({
        id: doc.id,
        values: embeddings[i],
        metadata: {
          ...doc.metadata,
          _content: doc.content
        }
      }));
      await binding.upsert(vectors);
      return { count: docs.length };
    },
    async search(query, options = {}) {
      const { limit = 10, returnContent = false, returnMetadata = true } = options;
      const [embedding] = await embedder([query]);
      if (!embedding) throw new Error("Failed to generate query embedding");
      return ((await binding.query(embedding, {
        topK: limit,
        returnValues: false,
        returnMetadata: true
      })).matches || []).map((m) => {
        const result = {
          id: m.id,
          score: Math.max(0, Math.min(1, m.score))
        };
        if (returnContent && m.metadata?._content) result.content = m.metadata._content;
        if (returnMetadata && m.metadata) {
          const { _content, ...rest } = m.metadata;
          if (Object.keys(rest).length > 0) result.metadata = rest;
        }
        return result;
      });
    },
    async remove(ids) {
      await binding.deleteByIds(ids);
      return { count: ids.length };
    },
    async clear() {
      throw new Error("[cloudflare] clear() is not supported - use wrangler CLI instead");
    },
    async close() {}
  };
}
var cloudflare_default = cloudflare;
export { cloudflare, cloudflare_default as default };
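
Wiring this driver into a Worker might look like the sketch below, built from the `@example` in the type declarations and the `SearchProvider` interface. The `VECTORIZE` binding name, the query-string handling, and the API-key wiring for `openai()` are assumptions, not part of the package:

```ts
import { cloudflare } from 'retriv/db/cloudflare'
import { openai } from 'retriv/embeddings/openai'

export default {
  async fetch(request: Request, env: { VECTORIZE: any }) {
    // env.VECTORIZE: a Vectorize index binding configured in wrangler.toml (assumed name);
    // the OpenAI key must also be made available to the embedding provider.
    const db = await cloudflare({
      binding: env.VECTORIZE,
      embeddings: openai({ model: 'text-embedding-3-small' }),
    })
    const q = new URL(request.url).searchParams.get('q') ?? ''
    const results = await db.search(q, { limit: 5, returnContent: true })
    return new Response(JSON.stringify(results), {
      headers: { 'content-type': 'application/json' },
    })
  },
}
```
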
package/dist/db/libsql.d.mts
ADDED
@@ -0,0 +1,30 @@
import { BaseDriverConfig, EmbeddingConfig, SearchProvider } from "../types.mjs";

//#region src/db/libsql.d.ts
interface LibsqlConfig extends BaseDriverConfig {
  /** Database URL (file:path.db for local, libsql://... for remote) */
  url?: string;
  /** Auth token for remote LibSQL/Turso */
  authToken?: string;
  /** Embedding provider from retriv/embeddings/ */
  embeddings: EmbeddingConfig;
}
/**
 * Create a LibSQL/Turso vector search provider
 * Supports local SQLite files and remote Turso databases
 *
 * @example
 * ```ts
 * import { libsql } from 'retriv/db/libsql'
 * import { openai } from 'retriv/embeddings/openai'
 *
 * const db = await libsql({
 *   url: 'libsql://your-db.turso.io',
 *   authToken: process.env.TURSO_AUTH_TOKEN,
 *   embeddings: openai({ model: 'text-embedding-3-small' }),
 * })
 * ```
 */
declare function libsql(config: LibsqlConfig): Promise<SearchProvider>;
//#endregion
export { LibsqlConfig, libsql as default, libsql };
package/dist/db/libsql.mjs
ADDED
@@ -0,0 +1,87 @@
import { resolveEmbedding } from "../embeddings/resolve.mjs";
import { createClient } from "@libsql/client";
async function libsql(config) {
  const url = config.url || config.path || "file:vectors.db";
  const { authToken } = config;
  if (!config.embeddings) throw new Error("[libsql] embeddings is required");
  const { embedder, dimensions } = await resolveEmbedding(config.embeddings);
  const client = createClient({
    url,
    ...authToken && { authToken }
  });
  await client.execute(`
    CREATE TABLE IF NOT EXISTS vectors (
      id TEXT PRIMARY KEY,
      content TEXT,
      metadata TEXT,
      embedding F32_BLOB(${dimensions})
    )
  `);
  return {
    async index(docs) {
      if (docs.length === 0) return { count: 0 };
      const embeddings = await embedder(docs.map((d) => d.content));
      if (embeddings.length !== docs.length) throw new Error(`Embedding count mismatch: expected ${docs.length}, got ${embeddings.length}`);
      for (let i = 0; i < docs.length; i++) {
        const doc = docs[i];
        const vector = embeddings[i];
        const vectorStr = JSON.stringify(vector);
        await client.execute({
          sql: `
            INSERT OR REPLACE INTO vectors (id, content, metadata, embedding)
            VALUES (?, ?, ?, vector(?))
          `,
          args: [
            doc.id,
            doc.content,
            doc.metadata ? JSON.stringify(doc.metadata) : null,
            vectorStr
          ]
        });
      }
      return { count: docs.length };
    },
    async search(query, options = {}) {
      const { limit = 10, returnContent = false, returnMetadata = true } = options;
      const [embedding] = await embedder([query]);
      if (!embedding) throw new Error("Failed to generate query embedding");
      const vectorStr = JSON.stringify(embedding);
      return ((await client.execute({
        sql: `
          SELECT
            id,
            content,
            metadata,
            vector_distance_cos(embedding, vector32(?)) as distance
          FROM vectors
          ORDER BY distance
          LIMIT ?
        `,
        args: [vectorStr, limit]
      })).rows || []).map((row) => {
        const result = {
          id: row.id,
          score: Math.max(0, 1 - row.distance)
        };
        if (returnContent && row.content) result.content = row.content;
        if (returnMetadata && row.metadata) result.metadata = JSON.parse(row.metadata);
        return result;
      });
    },
    async remove(ids) {
      for (const id of ids) await client.execute({
        sql: "DELETE FROM vectors WHERE id = ?",
        args: [id]
      });
      return { count: ids.length };
    },
    async clear() {
      await client.execute("DELETE FROM vectors");
    },
    async close() {
      client.close();
    }
  };
}
var libsql_default = libsql;
export { libsql_default as default, libsql };
package/dist/db/pgvector.d.mts
ADDED
@@ -0,0 +1,30 @@
import { BaseDriverConfig, EmbeddingConfig, SearchProvider } from "../types.mjs";

//#region src/db/pgvector.d.ts
interface PgvectorConfig extends BaseDriverConfig {
  /** PostgreSQL connection URL */
  url: string;
  /** Table name for vectors */
  table?: string;
  /** Embedding provider from retriv/embeddings/ */
  embeddings: EmbeddingConfig;
  /** Distance metric */
  metric?: 'cosine' | 'euclidean' | 'inner_product';
}
/**
 * Create a PostgreSQL pgvector search provider
 *
 * @example
 * ```ts
 * import { pgvector } from 'retriv/db/pgvector'
 * import { openai } from 'retriv/embeddings/openai'
 *
 * const db = await pgvector({
 *   url: process.env.DATABASE_URL,
 *   embeddings: openai({ model: 'text-embedding-3-small' }),
 * })
 * ```
 */
declare function pgvector(config: PgvectorConfig): Promise<SearchProvider>;
//#endregion
export { PgvectorConfig, pgvector as default, pgvector };