retriv 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -7
- package/dist/_chunks/extract-snippet.mjs +122 -0
- package/dist/db/cloudflare.mjs +9 -1
- package/dist/db/libsql.mjs +9 -1
- package/dist/db/pgvector.mjs +9 -1
- package/dist/db/sqlite-fts.mjs +9 -1
- package/dist/db/sqlite-vec.mjs +9 -1
- package/dist/db/upstash.mjs +9 -1
- package/dist/embeddings/model-info.mjs +2 -2
- package/dist/embeddings/transformers-js.mjs +1 -1
- package/package.json +8 -6
package/README.md
CHANGED
```diff
@@ -244,18 +244,18 @@ interface SearchResult {
 
 ## Benchmarks
 
-Retrieval accuracy on Nuxt documentation (
+Retrieval accuracy on Nuxt documentation (2,817 chunks):
 
 | Test Type | FTS | Vector | Hybrid |
 |-----------|-----|--------|--------|
-| Exact terminology (ports, config names) |
-| Doc retrieval (keyword overlap) |
-| Semantic queries (synonyms, no overlap) |
-| **Total** | **
+| Exact terminology (ports, config names) | 7/7 | 5/7 | 7/7 |
+| Doc retrieval (keyword overlap) | 0/7 | 5/7 | 5/7 |
+| Semantic queries (synonyms, no overlap) | 1/6 | 5/6 | 5/6 |
+| **Total** | **8/20 (40%)** | **15/20 (75%)** | **17/20 (85%)** |
 
 - **FTS** excels at exact terms but fails semantic queries ("reuse logic" → composables)
-- **Vector** understands meaning but misses precise terminology ("
-- **Hybrid** combines both -
+- **Vector** understands meaning but misses precise terminology (".global" suffix)
+- **Hybrid** combines both - best overall recall across query types
 
 Run locally: `pnpm test:eval`
 
```
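Every storage adapter in this release follows the same pattern when `returnContent` is requested: the stored chunk is run through the new `extractSnippet` helper, the snippet replaces the full content, and the ranked query terms are attached under `_meta.highlights`. Roughly, the result shape implied by the adapter diffs below looks like the sketch that follows (field optionality is assumed; the README's own `SearchResult` interface remains the authoritative definition):

```ts
// Sketch reconstructed from the db adapter diffs below, not the package's published typing.
interface SearchResultSketch {
  id: string;
  score: number;                // each adapter normalizes its raw score into the 0..1 range
  content?: string;             // now a snippet around the best-matching line, not the full chunk
  metadata?: Record<string, unknown>;
  _meta?: {
    highlights?: string[];      // top query terms found in the content, best-scoring first
  };
}
```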
package/dist/_chunks/extract-snippet.mjs
ADDED
```diff
@@ -0,0 +1,122 @@
+const STOPWORDS = new Set([
+  "the",
+  "a",
+  "an",
+  "and",
+  "or",
+  "but",
+  "in",
+  "on",
+  "at",
+  "to",
+  "for",
+  "of",
+  "with",
+  "by",
+  "from",
+  "as",
+  "is",
+  "was",
+  "are",
+  "were",
+  "been",
+  "be",
+  "have",
+  "has",
+  "had",
+  "do",
+  "does",
+  "did",
+  "will",
+  "would",
+  "could",
+  "should",
+  "may",
+  "might",
+  "must",
+  "can",
+  "this",
+  "that",
+  "these",
+  "those",
+  "it",
+  "its",
+  "they",
+  "them",
+  "their",
+  "we",
+  "our",
+  "you",
+  "your",
+  "what",
+  "which",
+  "who",
+  "how",
+  "when",
+  "where",
+  "why",
+  "all",
+  "each",
+  "every",
+  "both",
+  "few",
+  "more",
+  "most",
+  "other",
+  "some",
+  "such",
+  "no",
+  "not",
+  "only"
+]);
+function scoreTerms(terms, content) {
+  const contentLower = content.toLowerCase();
+  const contentLen = content.length;
+  return terms.filter((t) => contentLower.includes(t)).map((term) => {
+    const regex = new RegExp(term, "gi");
+    const matches = contentLower.match(regex);
+    const tf = matches ? matches.length : 0;
+    const k1 = 1.2;
+    const tfNorm = tf * (k1 + 1) / (tf + k1 * (.25 + .75 * (contentLen / 500)));
+    const stopwordPenalty = STOPWORDS.has(term) ? .1 : 1;
+    const lengthBoost = Math.min(term.length / 5, 1.5);
+    return {
+      term,
+      score: tfNorm * stopwordPenalty * lengthBoost
+    };
+  }).sort((a, b) => b.score - a.score);
+}
+function extractSnippet(content, query, contextLines = 2) {
+  const lines = content.split("\n");
+  const totalContext = contextLines * 2 + 1;
+  const queryWords = query.toLowerCase().split(/\s+/).filter((w) => w.length > 2);
+  const scoredTerms = scoreTerms(queryWords, content);
+  const highlights = scoredTerms.slice(0, 5).map((t) => t.term);
+  if (lines.length <= totalContext) return {
+    snippet: content,
+    highlights
+  };
+  const termScores = new Map(scoredTerms.map((t) => [t.term, t.score]));
+  let bestIdx = 0;
+  let bestScore = 0;
+  for (let i = 0; i < lines.length; i++) {
+    const lineLower = lines[i].toLowerCase();
+    let score = 0;
+    for (const word of queryWords) if (lineLower.includes(word)) score += termScores.get(word) || 1;
+    if (score > bestScore) {
+      bestScore = score;
+      bestIdx = i;
+    }
+  }
+  if (bestScore === 0) return {
+    snippet: lines.slice(0, totalContext).join("\n"),
+    highlights
+  };
+  const start = Math.max(0, bestIdx - contextLines);
+  const end = Math.min(lines.length, bestIdx + contextLines + 1);
+  return {
+    snippet: lines.slice(start, end).join("\n"),
+    highlights
+  };
+}
+export { extractSnippet as t };
```
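The chunk exposes a single helper. `extractSnippet(content, query, contextLines = 2)` scores each query term with a BM25-style term frequency (k1 = 1.2, soft length normalization against a 500-character reference), penalizes stopwords, down-weights very short terms, picks the content line that accumulates the highest score, and returns that line plus `contextLines` lines of context on each side, together with up to five highlight terms. A minimal usage sketch (the import path and sample data are illustrative; the chunk is an internal module consumed by the db adapters):

```ts
// Illustrative only: the chunk is published under dist/_chunks/ and re-exports extractSnippet as `t`.
import { t as extractSnippet } from "retriv/dist/_chunks/extract-snippet.mjs";

const content = [
  "## Runtime config",
  "The server reads its port from the runtime config.",
  "Set NITRO_PORT or the `port` option to override it.",
  "Other deployment notes follow.",
  "More unrelated lines here.",
  "And a few more."
].join("\n");

const { snippet, highlights } = extractSnippet(content, "nitro port");
// snippet: the best-scoring line plus up to two lines of context above and below
// highlights: up to five matched query terms, best-scoring first (e.g. ["port", "nitro"])
```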
package/dist/db/cloudflare.mjs
CHANGED
```diff
@@ -1,4 +1,5 @@
 import { resolveEmbedding } from "../embeddings/resolve.mjs";
+import { t as extractSnippet } from "../_chunks/extract-snippet.mjs";
 async function cloudflare(config) {
   const { binding } = config;
   if (!binding) throw new Error("[cloudflare] binding is required");
@@ -33,7 +34,14 @@ async function cloudflare(config) {
   id: m.id,
   score: Math.max(0, Math.min(1, m.score))
 };
-if (returnContent && m.metadata?._content)
+if (returnContent && m.metadata?._content) {
+  const { snippet, highlights } = extractSnippet(m.metadata._content, query);
+  result.content = snippet;
+  if (highlights.length) result._meta = {
+    ...result._meta,
+    highlights
+  };
+}
 if (returnMetadata && m.metadata) {
   const { _content, ...rest } = m.metadata;
   if (Object.keys(rest).length > 0) result.metadata = rest;
```
package/dist/db/libsql.mjs
CHANGED
```diff
@@ -1,4 +1,5 @@
 import { resolveEmbedding } from "../embeddings/resolve.mjs";
+import { t as extractSnippet } from "../_chunks/extract-snippet.mjs";
 import { createClient } from "@libsql/client";
 async function libsql(config) {
   const url = config.url || config.path || "file:vectors.db";
@@ -63,7 +64,14 @@ async function libsql(config) {
   id: row.id,
   score: Math.max(0, 1 - row.distance)
 };
-if (returnContent && row.content)
+if (returnContent && row.content) {
+  const { snippet, highlights } = extractSnippet(row.content, query);
+  result.content = snippet;
+  if (highlights.length) result._meta = {
+    ...result._meta,
+    highlights
+  };
+}
 if (returnMetadata && row.metadata) result.metadata = JSON.parse(row.metadata);
 return result;
 });
```
package/dist/db/pgvector.mjs
CHANGED
```diff
@@ -1,4 +1,5 @@
 import { resolveEmbedding } from "../embeddings/resolve.mjs";
+import { t as extractSnippet } from "../_chunks/extract-snippet.mjs";
 import pg from "pg";
 async function pgvector(config) {
   const { url, table = "vectors", metric = "cosine" } = config;
@@ -59,7 +60,14 @@ async function pgvector(config) {
   id: row.id,
   score
 };
-if (returnContent && row.content)
+if (returnContent && row.content) {
+  const { snippet, highlights } = extractSnippet(row.content, query);
+  searchResult.content = snippet;
+  if (highlights.length) searchResult._meta = {
+    ...searchResult._meta,
+    highlights
+  };
+}
 if (returnMetadata && row.metadata) searchResult.metadata = row.metadata;
 return searchResult;
 });
```
package/dist/db/sqlite-fts.mjs
CHANGED
```diff
@@ -1,3 +1,4 @@
+import { t as extractSnippet } from "../_chunks/extract-snippet.mjs";
 import { mkdirSync } from "node:fs";
 import { dirname } from "node:path";
 async function sqliteFts(config = {}) {
@@ -49,7 +50,14 @@ async function sqliteFts(config = {}) {
   id: row.id,
   score: normalizedScore
 };
-if (returnContent && row.content)
+if (returnContent && row.content) {
+  const { snippet, highlights } = extractSnippet(row.content, query);
+  result.content = snippet;
+  if (highlights.length) result._meta = {
+    ...result._meta,
+    highlights
+  };
+}
 if (returnMetadata && row.metadata) result.metadata = JSON.parse(row.metadata);
 return result;
 });
```
package/dist/db/sqlite-vec.mjs
CHANGED
```diff
@@ -1,4 +1,5 @@
 import { resolveEmbedding } from "../embeddings/resolve.mjs";
+import { t as extractSnippet } from "../_chunks/extract-snippet.mjs";
 import { mkdirSync } from "node:fs";
 import { dirname } from "node:path";
 import * as sqliteVecExt from "sqlite-vec";
@@ -73,7 +74,14 @@ async function sqliteVec(config) {
   id: meta.id,
   score: 1 / (1 + row.distance)
 };
-if (returnContent && meta.content)
+if (returnContent && meta.content) {
+  const { snippet, highlights } = extractSnippet(meta.content, query);
+  result.content = snippet;
+  if (highlights.length) result._meta = {
+    ...result._meta,
+    highlights
+  };
+}
 if (returnMetadata && meta.metadata) result.metadata = JSON.parse(meta.metadata);
 return result;
 }).filter(Boolean);
```
package/dist/db/upstash.mjs
CHANGED
```diff
@@ -1,3 +1,4 @@
+import { t as extractSnippet } from "../_chunks/extract-snippet.mjs";
 import { Index } from "@upstash/vector";
 async function upstash(config) {
   const { url, token, namespace } = config;
@@ -34,7 +35,14 @@ async function upstash(config) {
   id: m.id,
   score: Math.max(0, Math.min(1, m.score))
 };
-if (returnContent && m.metadata?._content)
+if (returnContent && m.metadata?._content) {
+  const { snippet, highlights } = extractSnippet(m.metadata._content, query);
+  result.content = snippet;
+  if (highlights.length) result._meta = {
+    ...result._meta,
+    highlights
+  };
+}
 if (returnMetadata && m.metadata) {
   const { _content, ...rest } = m.metadata;
   if (Object.keys(rest).length > 0) result.metadata = rest;
```
package/dist/embeddings/transformers-js.mjs
CHANGED
```diff
@@ -11,7 +11,7 @@ async function clearCorruptedCache(error, model) {
   return true;
 }
 function transformersJs(options = {}) {
-  const model = resolveModelForPreset(options.model ?? "bge-
+  const model = resolveModelForPreset(options.model ?? "bge-small-en-v1.5", "transformers.js");
   let cached = null;
   return { async resolve() {
     if (cached) return cached;
```
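The embeddings change is a default swap: when no model option is passed, the transformers.js provider now resolves the `bge-small-en-v1.5` preset. A hedged sketch of the behavior (the import path is illustrative; `transformersJs` and `resolveModelForPreset` are the functions visible in the hunk above):

```ts
// Illustrative call only; transformersJs is the factory defined in dist/embeddings/transformers-js.mjs.
import { transformersJs } from "retriv/dist/embeddings/transformers-js.mjs";

const embedder = transformersJs();
// internally resolves roughly to: resolveModelForPreset("bge-small-en-v1.5", "transformers.js")
// passing an options object, e.g. transformersJs({ model: "<preset or model id>" }), overrides the default
```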
package/package.json
CHANGED
```diff
@@ -1,7 +1,7 @@
 {
   "name": "retriv",
   "type": "module",
-  "version": "0.1.0",
+  "version": "0.3.0",
   "description": "Unified document retrieval for search - semantic, full-text, and fuzzy.",
   "author": {
     "name": "Harlan Wilton",
@@ -139,17 +139,18 @@
     "@huggingface/transformers": "^3.8.1",
     "@libsql/client": "^0.17.0",
     "@types/node": "^25.2.0",
-    "@types/pg": "^8.
-    "@upstash/vector": "^1.
+    "@types/pg": "^8.16.0",
+    "@upstash/vector": "^1.2.2",
     "@vitest/coverage-v8": "^4.0.18",
     "ai": "^6.0.67",
-    "
+    "ai-sdk-provider-gemini-cli": "^2.0.1",
+    "bumpp": "^10.4.0",
     "eslint": "^9.39.2",
-    "obuild": "^0.4.
+    "obuild": "^0.4.22",
     "pg": "^8.18.0",
     "sqlite-vec": "^0.1.7-alpha.2",
     "typescript": "^5.9.3",
-    "vitest": "^4.0.
+    "vitest": "^4.0.18",
     "zod": "^4.3.6"
   },
   "scripts": {
@@ -158,6 +159,7 @@
     "lint": "eslint .",
     "typecheck": "tsc --noEmit",
     "test": "vitest --project unit",
+    "test:e2e": "vitest run --project e2e",
     "test:eval": "vitest run --project eval",
     "test:attw": "attw --pack",
     "release": "pnpm build && bumpp --output=CHANGELOG.md"
```