retriv 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -244,18 +244,18 @@ interface SearchResult {
244
244
 
245
245
  ## Benchmarks
246
246
 
247
- Retrieval accuracy on Nuxt documentation (639 docs):
247
+ Retrieval accuracy on Nuxt documentation (2,817 chunks):
248
248
 
249
249
  | Test Type | FTS | Vector | Hybrid |
250
250
  |-----------|-----|--------|--------|
251
- | Exact terminology (ports, config names) | 3/3 | 2/3 | 3/3 |
252
- | Doc retrieval (keyword overlap) | 3/3 | 2/3 | 3/3 |
253
- | Semantic queries (synonyms, no overlap) | 0/3 | 3/3 | 3/3 |
254
- | **Total** | **6/9 (67%)** | **7/9 (78%)** | **9/9 (100%)** |
251
+ | Exact terminology (ports, config names) | 7/7 | 5/7 | 7/7 |
252
+ | Doc retrieval (keyword overlap) | 0/7 | 5/7 | 5/7 |
253
+ | Semantic queries (synonyms, no overlap) | 1/6 | 5/6 | 5/6 |
254
+ | **Total** | **8/20 (40%)** | **15/20 (75%)** | **17/20 (85%)** |
255
255
 
256
256
  - **FTS** excels at exact terms but fails semantic queries ("reuse logic" → composables)
257
- - **Vector** understands meaning but misses precise terminology ("port 3000")
258
- - **Hybrid** combines both - never worse than either method alone
257
+ - **Vector** understands meaning but misses precise terminology (".global" suffix)
258
+ - **Hybrid** combines both - best overall recall across query types
259
259
 
260
260
  Run locally: `pnpm test:eval`
261
261
 
@@ -0,0 +1,122 @@
1
// Common English stopwords. Query terms found in this set are heavily
// down-weighted when ranking (see scoreTerms), so snippets centre on
// content-bearing words instead of glue words.
const STOPWORDS = new Set([
  "the", "a", "an", "and", "or", "but", "in", "on", "at", "to",
  "for", "of", "with", "by", "from", "as", "is", "was", "are", "were",
  "been", "be", "have", "has", "had", "do", "does", "did", "will", "would",
  "could", "should", "may", "might", "must", "can", "this", "that", "these", "those",
  "it", "its", "they", "them", "their", "we", "our", "you", "your", "what",
  "which", "who", "how", "when", "where", "why", "all", "each", "every", "both",
  "few", "more", "most", "other", "some", "such", "no", "not", "only"
]);

/**
 * Escape regex metacharacters so a raw query term can be embedded in
 * `new RegExp` safely. Without this, a term such as "c++" throws
 * (SyntaxError: nothing to repeat) and terms containing ".", "(", "["
 * etc. match the wrong text.
 * @param {string} s
 * @returns {string} the term with all regex specials backslash-escaped
 */
function escapeRegExp(s) {
  return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/**
 * Score query terms against a document using a BM25-style term-frequency
 * saturation, a stopword penalty, and a term-length boost.
 * @param {string[]} terms - query terms (expected lowercase)
 * @param {string} content - document text to score against
 * @returns {{term: string, score: number}[]} only terms present in
 *   content, sorted highest score first
 */
function scoreTerms(terms, content) {
  const contentLower = content.toLowerCase();
  const contentLen = content.length;
  return terms
    .filter((t) => contentLower.includes(t))
    .map((term) => {
      // Escape before compiling: query words are untrusted input and may
      // contain regex metacharacters.
      const regex = new RegExp(escapeRegExp(term), "gi");
      const matches = contentLower.match(regex);
      const tf = matches ? matches.length : 0;
      // BM25-ish saturation: k1 controls how quickly repeated occurrences
      // stop adding score; contentLen/500 is a crude length normalization
      // (500 chars treated as the "average" chunk length).
      const k1 = 1.2;
      const tfNorm = (tf * (k1 + 1)) / (tf + k1 * (0.25 + 0.75 * (contentLen / 500)));
      const stopwordPenalty = STOPWORDS.has(term) ? 0.1 : 1;
      // Longer terms are usually more discriminative; cap the boost at 1.5.
      const lengthBoost = Math.min(term.length / 5, 1.5);
      return { term, score: tfNorm * stopwordPenalty * lengthBoost };
    })
    .sort((a, b) => b.score - a.score);
}
89
/**
 * Extract a window of lines from `content` centred on the single line
 * that best matches `query`, along with up to five highlight terms.
 * @param {string} content - full document text
 * @param {string} query - user search query
 * @param {number} [contextLines=2] - lines of context on each side of
 *   the best line (default yields a 5-line window)
 * @returns {{snippet: string, highlights: string[]}}
 */
function extractSnippet(content, query, contextLines = 2) {
  const windowSize = contextLines * 2 + 1;
  const allLines = content.split("\n");
  // Keep only meaningful query words (3+ chars), lowercased.
  const words = query
    .toLowerCase()
    .split(/\s+/)
    .filter((w) => w.length > 2);
  const ranked = scoreTerms(words, content);
  const highlights = ranked.slice(0, 5).map(({ term }) => term);

  // Document already fits inside one window: return it whole.
  if (allLines.length <= windowSize) {
    return { snippet: content, highlights };
  }

  // Pick the line with the highest combined term weight (first line wins ties).
  const weightOf = new Map(ranked.map(({ term, score }) => [term, score]));
  let topLine = 0;
  let topScore = 0;
  allLines.forEach((line, idx) => {
    const haystack = line.toLowerCase();
    let lineScore = 0;
    for (const w of words) {
      if (haystack.includes(w)) lineScore += weightOf.get(w) || 1;
    }
    if (lineScore > topScore) {
      topScore = lineScore;
      topLine = idx;
    }
  });

  // No line matched at all: fall back to the top of the document.
  if (topScore === 0) {
    return { snippet: allLines.slice(0, windowSize).join("\n"), highlights };
  }

  const first = Math.max(0, topLine - contextLines);
  const last = Math.min(allLines.length, topLine + contextLines + 1);
  return { snippet: allLines.slice(first, last).join("\n"), highlights };
}
122
+ export { extractSnippet as t };
@@ -1,4 +1,5 @@
1
1
  import { resolveEmbedding } from "../embeddings/resolve.mjs";
2
+ import { t as extractSnippet } from "../_chunks/extract-snippet.mjs";
2
3
  async function cloudflare(config) {
3
4
  const { binding } = config;
4
5
  if (!binding) throw new Error("[cloudflare] binding is required");
@@ -33,7 +34,14 @@ async function cloudflare(config) {
33
34
  id: m.id,
34
35
  score: Math.max(0, Math.min(1, m.score))
35
36
  };
36
- if (returnContent && m.metadata?._content) result.content = m.metadata._content;
37
+ if (returnContent && m.metadata?._content) {
38
+ const { snippet, highlights } = extractSnippet(m.metadata._content, query);
39
+ result.content = snippet;
40
+ if (highlights.length) result._meta = {
41
+ ...result._meta,
42
+ highlights
43
+ };
44
+ }
37
45
  if (returnMetadata && m.metadata) {
38
46
  const { _content, ...rest } = m.metadata;
39
47
  if (Object.keys(rest).length > 0) result.metadata = rest;
@@ -1,4 +1,5 @@
1
1
  import { resolveEmbedding } from "../embeddings/resolve.mjs";
2
+ import { t as extractSnippet } from "../_chunks/extract-snippet.mjs";
2
3
  import { createClient } from "@libsql/client";
3
4
  async function libsql(config) {
4
5
  const url = config.url || config.path || "file:vectors.db";
@@ -63,7 +64,14 @@ async function libsql(config) {
63
64
  id: row.id,
64
65
  score: Math.max(0, 1 - row.distance)
65
66
  };
66
- if (returnContent && row.content) result.content = row.content;
67
+ if (returnContent && row.content) {
68
+ const { snippet, highlights } = extractSnippet(row.content, query);
69
+ result.content = snippet;
70
+ if (highlights.length) result._meta = {
71
+ ...result._meta,
72
+ highlights
73
+ };
74
+ }
67
75
  if (returnMetadata && row.metadata) result.metadata = JSON.parse(row.metadata);
68
76
  return result;
69
77
  });
@@ -1,4 +1,5 @@
1
1
  import { resolveEmbedding } from "../embeddings/resolve.mjs";
2
+ import { t as extractSnippet } from "../_chunks/extract-snippet.mjs";
2
3
  import pg from "pg";
3
4
  async function pgvector(config) {
4
5
  const { url, table = "vectors", metric = "cosine" } = config;
@@ -59,7 +60,14 @@ async function pgvector(config) {
59
60
  id: row.id,
60
61
  score
61
62
  };
62
- if (returnContent && row.content) searchResult.content = row.content;
63
+ if (returnContent && row.content) {
64
+ const { snippet, highlights } = extractSnippet(row.content, query);
65
+ searchResult.content = snippet;
66
+ if (highlights.length) searchResult._meta = {
67
+ ...searchResult._meta,
68
+ highlights
69
+ };
70
+ }
63
71
  if (returnMetadata && row.metadata) searchResult.metadata = row.metadata;
64
72
  return searchResult;
65
73
  });
@@ -1,3 +1,4 @@
1
+ import { t as extractSnippet } from "../_chunks/extract-snippet.mjs";
1
2
  import { mkdirSync } from "node:fs";
2
3
  import { dirname } from "node:path";
3
4
  async function sqliteFts(config = {}) {
@@ -49,7 +50,14 @@ async function sqliteFts(config = {}) {
49
50
  id: row.id,
50
51
  score: normalizedScore
51
52
  };
52
- if (returnContent && row.content) result.content = row.content;
53
+ if (returnContent && row.content) {
54
+ const { snippet, highlights } = extractSnippet(row.content, query);
55
+ result.content = snippet;
56
+ if (highlights.length) result._meta = {
57
+ ...result._meta,
58
+ highlights
59
+ };
60
+ }
53
61
  if (returnMetadata && row.metadata) result.metadata = JSON.parse(row.metadata);
54
62
  return result;
55
63
  });
@@ -1,4 +1,5 @@
1
1
  import { resolveEmbedding } from "../embeddings/resolve.mjs";
2
+ import { t as extractSnippet } from "../_chunks/extract-snippet.mjs";
2
3
  import { mkdirSync } from "node:fs";
3
4
  import { dirname } from "node:path";
4
5
  import * as sqliteVecExt from "sqlite-vec";
@@ -73,7 +74,14 @@ async function sqliteVec(config) {
73
74
  id: meta.id,
74
75
  score: 1 / (1 + row.distance)
75
76
  };
76
- if (returnContent && meta.content) result.content = meta.content;
77
+ if (returnContent && meta.content) {
78
+ const { snippet, highlights } = extractSnippet(meta.content, query);
79
+ result.content = snippet;
80
+ if (highlights.length) result._meta = {
81
+ ...result._meta,
82
+ highlights
83
+ };
84
+ }
77
85
  if (returnMetadata && meta.metadata) result.metadata = JSON.parse(meta.metadata);
78
86
  return result;
79
87
  }).filter(Boolean);
@@ -1,3 +1,4 @@
1
+ import { t as extractSnippet } from "../_chunks/extract-snippet.mjs";
1
2
  import { Index } from "@upstash/vector";
2
3
  async function upstash(config) {
3
4
  const { url, token, namespace } = config;
@@ -34,7 +35,14 @@ async function upstash(config) {
34
35
  id: m.id,
35
36
  score: Math.max(0, Math.min(1, m.score))
36
37
  };
37
- if (returnContent && m.metadata?._content) result.content = m.metadata._content;
38
+ if (returnContent && m.metadata?._content) {
39
+ const { snippet, highlights } = extractSnippet(m.metadata._content, query);
40
+ result.content = snippet;
41
+ if (highlights.length) result._meta = {
42
+ ...result._meta,
43
+ highlights
44
+ };
45
+ }
38
46
  if (returnMetadata && m.metadata) {
39
47
  const { _content, ...rest } = m.metadata;
40
48
  if (Object.keys(rest).length > 0) result.metadata = rest;
@@ -20,8 +20,8 @@ const DEFAULT_MODELS = {
20
20
  dimensions: 768
21
21
  },
22
22
  "transformers.js": {
23
- model: "Xenova/bge-base-en-v1.5",
24
- dimensions: 768
23
+ model: "Xenova/bge-small-en-v1.5",
24
+ dimensions: 384
25
25
  }
26
26
  };
27
27
  const MODEL_DIMENSIONS = {
@@ -11,7 +11,7 @@ async function clearCorruptedCache(error, model) {
11
11
  return true;
12
12
  }
13
13
  function transformersJs(options = {}) {
14
- const model = resolveModelForPreset(options.model ?? "bge-base-en-v1.5", "transformers.js");
14
+ const model = resolveModelForPreset(options.model ?? "bge-small-en-v1.5", "transformers.js");
15
15
  let cached = null;
16
16
  return { async resolve() {
17
17
  if (cached) return cached;
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "retriv",
3
3
  "type": "module",
4
- "version": "0.1.0",
4
+ "version": "0.3.0",
5
5
  "description": "Unified document retrieval for search - semantic, full-text, and fuzzy.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -139,17 +139,18 @@
139
139
  "@huggingface/transformers": "^3.8.1",
140
140
  "@libsql/client": "^0.17.0",
141
141
  "@types/node": "^25.2.0",
142
- "@types/pg": "^8.11.0",
143
- "@upstash/vector": "^1.1.0",
142
+ "@types/pg": "^8.16.0",
143
+ "@upstash/vector": "^1.2.2",
144
144
  "@vitest/coverage-v8": "^4.0.18",
145
145
  "ai": "^6.0.67",
146
- "bumpp": "^10.3.2",
146
+ "ai-sdk-provider-gemini-cli": "^2.0.1",
147
+ "bumpp": "^10.4.0",
147
148
  "eslint": "^9.39.2",
148
- "obuild": "^0.4.14",
149
+ "obuild": "^0.4.22",
149
150
  "pg": "^8.18.0",
150
151
  "sqlite-vec": "^0.1.7-alpha.2",
151
152
  "typescript": "^5.9.3",
152
- "vitest": "^4.0.16",
153
+ "vitest": "^4.0.18",
153
154
  "zod": "^4.3.6"
154
155
  },
155
156
  "scripts": {
@@ -158,6 +159,7 @@
158
159
  "lint": "eslint .",
159
160
  "typecheck": "tsc --noEmit",
160
161
  "test": "vitest --project unit",
162
+ "test:e2e": "vitest run --project e2e",
161
163
  "test:eval": "vitest run --project eval",
162
164
  "test:attw": "attw --pack",
163
165
  "release": "pnpm build && bumpp --output=CHANGELOG.md"