retriv 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
// Common English function words that carry little search signal.
// Terms in this set receive a heavy scoring penalty (not outright exclusion,
// so exact matches on them can still surface when nothing else matches).
const STOPWORDS = new Set([
	"the", "a", "an", "and", "or", "but", "in", "on", "at", "to",
	"for", "of", "with", "by", "from", "as", "is", "was", "are", "were",
	"been", "be", "have", "has", "had", "do", "does", "did", "will", "would",
	"could", "should", "may", "might", "must", "can", "this", "that", "these", "those",
	"it", "its", "they", "them", "their", "we", "our", "you", "your", "what",
	"which", "who", "how", "when", "where", "why", "all", "each", "every", "both",
	"few", "more", "most", "other", "some", "such", "no", "not", "only"
]);
/**
 * Escape regex metacharacters so a user-supplied term can be embedded in a
 * RegExp literally. Without this, terms like "c++" or "a(b" throw a
 * SyntaxError and terms like "a.b" over-match.
 * @param {string} s - raw term
 * @returns {string} regex-safe term
 */
const escapeRegExp = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
/**
 * Score query terms against a document using a BM25-style term-frequency
 * normalization, a stopword penalty, and a short-term dampener.
 * @param {string[]} terms - lowercased query terms
 * @param {string} content - document text to score against
 * @returns {{term: string, score: number}[]} terms found in the content,
 *   sorted by descending score
 */
function scoreTerms(terms, content) {
	const contentLower = content.toLowerCase();
	const contentLen = content.length;
	return terms.filter((t) => contentLower.includes(t)).map((term) => {
		// Escaped so arbitrary query text cannot break or inject into the pattern.
		const regex = new RegExp(escapeRegExp(term), "gi");
		const matches = contentLower.match(regex);
		const tf = matches ? matches.length : 0;
		// BM25 saturation: k1 controls how quickly repeated occurrences
		// stop adding score; 500 chars acts as the "average document length".
		const k1 = 1.2;
		const tfNorm = tf * (k1 + 1) / (tf + k1 * (.25 + .75 * (contentLen / 500)));
		// Stopwords keep only 10% of their score.
		const stopwordPenalty = STOPWORDS.has(term) ? .1 : 1;
		// Longer terms are more discriminative; capped at 1.5x.
		const lengthBoost = Math.min(term.length / 5, 1.5);
		return {
			term,
			score: tfNorm * stopwordPenalty * lengthBoost
		};
	}).sort((a, b) => b.score - a.score);
}
|
|
89
|
+
/**
 * Extract the most relevant snippet of `content` for a search `query`.
 * Scores every line by the summed weight of the query terms it contains,
 * then returns a window of `contextLines` lines on each side of the best
 * line, plus the top-5 scoring terms as highlights.
 * @param {string} content - full document text
 * @param {string} query - raw user query
 * @param {number} [contextLines=2] - lines of context on each side
 * @returns {{snippet: string, highlights: string[]}}
 */
function extractSnippet(content, query, contextLines = 2) {
	// Window size: the best-matching line plus context on both sides.
	const windowSize = contextLines * 2 + 1;
	const contentLines = content.split("\n");
	// Query words of 1-2 chars are mostly noise; drop them.
	const words = query.toLowerCase().split(/\s+/).filter((w) => w.length > 2);
	const ranked = scoreTerms(words, content);
	const highlights = ranked.slice(0, 5).map(({ term }) => term);
	// Short documents fit inside one window — return them whole.
	if (contentLines.length <= windowSize) {
		return { snippet: content, highlights };
	}
	const weightOf = new Map(ranked.map(({ term, score }) => [term, score]));
	// Find the line with the highest total term weight (first wins on ties).
	let topLine = 0;
	let topWeight = 0;
	contentLines.forEach((line, idx) => {
		const haystack = line.toLowerCase();
		let weight = 0;
		for (const w of words) {
			// Terms not ranked (absent from content as a whole) get weight 1.
			if (haystack.includes(w)) weight += weightOf.get(w) || 1;
		}
		if (weight > topWeight) {
			topWeight = weight;
			topLine = idx;
		}
	});
	// No line matched any term: fall back to the head of the document.
	if (topWeight === 0) {
		return { snippet: contentLines.slice(0, windowSize).join("\n"), highlights };
	}
	const from = Math.max(0, topLine - contextLines);
	const to = Math.min(contentLines.length, topLine + contextLines + 1);
	return { snippet: contentLines.slice(from, to).join("\n"), highlights };
}
|
|
122
|
+
export { extractSnippet as t };
|
package/dist/db/cloudflare.mjs
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { resolveEmbedding } from "../embeddings/resolve.mjs";
|
|
2
|
+
import { t as extractSnippet } from "../_chunks/extract-snippet.mjs";
|
|
2
3
|
async function cloudflare(config) {
|
|
3
4
|
const { binding } = config;
|
|
4
5
|
if (!binding) throw new Error("[cloudflare] binding is required");
|
|
@@ -33,7 +34,14 @@ async function cloudflare(config) {
|
|
|
33
34
|
id: m.id,
|
|
34
35
|
score: Math.max(0, Math.min(1, m.score))
|
|
35
36
|
};
|
|
36
|
-
if (returnContent && m.metadata?._content)
|
|
37
|
+
if (returnContent && m.metadata?._content) {
|
|
38
|
+
const { snippet, highlights } = extractSnippet(m.metadata._content, query);
|
|
39
|
+
result.content = snippet;
|
|
40
|
+
if (highlights.length) result._meta = {
|
|
41
|
+
...result._meta,
|
|
42
|
+
highlights
|
|
43
|
+
};
|
|
44
|
+
}
|
|
37
45
|
if (returnMetadata && m.metadata) {
|
|
38
46
|
const { _content, ...rest } = m.metadata;
|
|
39
47
|
if (Object.keys(rest).length > 0) result.metadata = rest;
|
package/dist/db/libsql.mjs
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { resolveEmbedding } from "../embeddings/resolve.mjs";
|
|
2
|
+
import { t as extractSnippet } from "../_chunks/extract-snippet.mjs";
|
|
2
3
|
import { createClient } from "@libsql/client";
|
|
3
4
|
async function libsql(config) {
|
|
4
5
|
const url = config.url || config.path || "file:vectors.db";
|
|
@@ -63,7 +64,14 @@ async function libsql(config) {
|
|
|
63
64
|
id: row.id,
|
|
64
65
|
score: Math.max(0, 1 - row.distance)
|
|
65
66
|
};
|
|
66
|
-
if (returnContent && row.content)
|
|
67
|
+
if (returnContent && row.content) {
|
|
68
|
+
const { snippet, highlights } = extractSnippet(row.content, query);
|
|
69
|
+
result.content = snippet;
|
|
70
|
+
if (highlights.length) result._meta = {
|
|
71
|
+
...result._meta,
|
|
72
|
+
highlights
|
|
73
|
+
};
|
|
74
|
+
}
|
|
67
75
|
if (returnMetadata && row.metadata) result.metadata = JSON.parse(row.metadata);
|
|
68
76
|
return result;
|
|
69
77
|
});
|
package/dist/db/pgvector.mjs
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { resolveEmbedding } from "../embeddings/resolve.mjs";
|
|
2
|
+
import { t as extractSnippet } from "../_chunks/extract-snippet.mjs";
|
|
2
3
|
import pg from "pg";
|
|
3
4
|
async function pgvector(config) {
|
|
4
5
|
const { url, table = "vectors", metric = "cosine" } = config;
|
|
@@ -59,7 +60,14 @@ async function pgvector(config) {
|
|
|
59
60
|
id: row.id,
|
|
60
61
|
score
|
|
61
62
|
};
|
|
62
|
-
if (returnContent && row.content)
|
|
63
|
+
if (returnContent && row.content) {
|
|
64
|
+
const { snippet, highlights } = extractSnippet(row.content, query);
|
|
65
|
+
searchResult.content = snippet;
|
|
66
|
+
if (highlights.length) searchResult._meta = {
|
|
67
|
+
...searchResult._meta,
|
|
68
|
+
highlights
|
|
69
|
+
};
|
|
70
|
+
}
|
|
63
71
|
if (returnMetadata && row.metadata) searchResult.metadata = row.metadata;
|
|
64
72
|
return searchResult;
|
|
65
73
|
});
|
package/dist/db/sqlite-fts.mjs
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { t as extractSnippet } from "../_chunks/extract-snippet.mjs";
|
|
1
2
|
import { mkdirSync } from "node:fs";
|
|
2
3
|
import { dirname } from "node:path";
|
|
3
4
|
async function sqliteFts(config = {}) {
|
|
@@ -49,7 +50,14 @@ async function sqliteFts(config = {}) {
|
|
|
49
50
|
id: row.id,
|
|
50
51
|
score: normalizedScore
|
|
51
52
|
};
|
|
52
|
-
if (returnContent && row.content)
|
|
53
|
+
if (returnContent && row.content) {
|
|
54
|
+
const { snippet, highlights } = extractSnippet(row.content, query);
|
|
55
|
+
result.content = snippet;
|
|
56
|
+
if (highlights.length) result._meta = {
|
|
57
|
+
...result._meta,
|
|
58
|
+
highlights
|
|
59
|
+
};
|
|
60
|
+
}
|
|
53
61
|
if (returnMetadata && row.metadata) result.metadata = JSON.parse(row.metadata);
|
|
54
62
|
return result;
|
|
55
63
|
});
|
package/dist/db/sqlite-vec.mjs
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { resolveEmbedding } from "../embeddings/resolve.mjs";
|
|
2
|
+
import { t as extractSnippet } from "../_chunks/extract-snippet.mjs";
|
|
2
3
|
import { mkdirSync } from "node:fs";
|
|
3
4
|
import { dirname } from "node:path";
|
|
4
5
|
import * as sqliteVecExt from "sqlite-vec";
|
|
@@ -73,7 +74,14 @@ async function sqliteVec(config) {
|
|
|
73
74
|
id: meta.id,
|
|
74
75
|
score: 1 / (1 + row.distance)
|
|
75
76
|
};
|
|
76
|
-
if (returnContent && meta.content)
|
|
77
|
+
if (returnContent && meta.content) {
|
|
78
|
+
const { snippet, highlights } = extractSnippet(meta.content, query);
|
|
79
|
+
result.content = snippet;
|
|
80
|
+
if (highlights.length) result._meta = {
|
|
81
|
+
...result._meta,
|
|
82
|
+
highlights
|
|
83
|
+
};
|
|
84
|
+
}
|
|
77
85
|
if (returnMetadata && meta.metadata) result.metadata = JSON.parse(meta.metadata);
|
|
78
86
|
return result;
|
|
79
87
|
}).filter(Boolean);
|
package/dist/db/upstash.mjs
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { t as extractSnippet } from "../_chunks/extract-snippet.mjs";
|
|
1
2
|
import { Index } from "@upstash/vector";
|
|
2
3
|
async function upstash(config) {
|
|
3
4
|
const { url, token, namespace } = config;
|
|
@@ -34,7 +35,14 @@ async function upstash(config) {
|
|
|
34
35
|
id: m.id,
|
|
35
36
|
score: Math.max(0, Math.min(1, m.score))
|
|
36
37
|
};
|
|
37
|
-
if (returnContent && m.metadata?._content)
|
|
38
|
+
if (returnContent && m.metadata?._content) {
|
|
39
|
+
const { snippet, highlights } = extractSnippet(m.metadata._content, query);
|
|
40
|
+
result.content = snippet;
|
|
41
|
+
if (highlights.length) result._meta = {
|
|
42
|
+
...result._meta,
|
|
43
|
+
highlights
|
|
44
|
+
};
|
|
45
|
+
}
|
|
38
46
|
if (returnMetadata && m.metadata) {
|
|
39
47
|
const { _content, ...rest } = m.metadata;
|
|
40
48
|
if (Object.keys(rest).length > 0) result.metadata = rest;
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "retriv",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.
|
|
4
|
+
"version": "0.3.0",
|
|
5
5
|
"description": "Unified document retrieval for search - semantic, full-text, and fuzzy.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -159,6 +159,7 @@
|
|
|
159
159
|
"lint": "eslint .",
|
|
160
160
|
"typecheck": "tsc --noEmit",
|
|
161
161
|
"test": "vitest --project unit",
|
|
162
|
+
"test:e2e": "vitest run --project e2e",
|
|
162
163
|
"test:eval": "vitest run --project eval",
|
|
163
164
|
"test:attw": "attw --pack",
|
|
164
165
|
"release": "pnpm build && bumpp --output=CHANGELOG.md"
|