retriv 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,122 @@
1
/**
 * Common English function words.
 *
 * Query terms found in this set are heavily down-weighted when terms are
 * scored, so they rarely drive snippet selection or end up as highlights.
 * Insertion order matches the original list.
 */
const STOPWORDS = new Set([
  // Articles, conjunctions and prepositions
  "the", "a", "an", "and", "or", "but",
  "in", "on", "at", "to", "for", "of", "with", "by", "from", "as",
  // Auxiliary and modal verbs
  "is", "was", "are", "were", "been", "be",
  "have", "has", "had", "do", "does", "did",
  "will", "would", "could", "should", "may", "might", "must", "can",
  // Demonstratives and pronouns
  "this", "that", "these", "those",
  "it", "its", "they", "them", "their", "we", "our", "you", "your",
  // Interrogatives
  "what", "which", "who", "how", "when", "where", "why",
  // Quantifiers and negation
  "all", "each", "every", "both", "few", "more", "most", "other", "some", "such",
  "no", "not", "only",
]);
72
/**
 * Count non-overlapping literal occurrences of `needle` in `haystack`.
 *
 * An empty needle is counted once per position (haystack.length + 1), which
 * is what a global empty-pattern regex match would report.
 *
 * @param {string} haystack - Text to scan.
 * @param {string} needle - Literal substring to look for.
 * @returns {number} Number of occurrences.
 */
function countOccurrences(haystack, needle) {
  // Always advance by at least one position so an empty needle cannot loop forever.
  const step = Math.max(needle.length, 1);
  let count = 0;
  let from = 0;
  while (from <= haystack.length) {
    const idx = haystack.indexOf(needle, from);
    if (idx === -1) break;
    count++;
    from = idx + step;
  }
  return count;
}

/**
 * Rank terms by how strongly they occur in `content`.
 *
 * Only terms that appear (as literal substrings) in the lower-cased content
 * are returned. Each score combines:
 *  - a BM25-style saturating term frequency (k1 = 1.2, b = 0.75, with the
 *    content length normalised against a nominal 500 characters),
 *  - a 0.1 multiplier for terms listed in STOPWORDS,
 *  - a length boost of `term.length / 5`, capped at 1.5.
 *
 * Terms are matched literally. Building a RegExp straight from the term would
 * throw on input such as "c++" or "(foo" and would let "." match any
 * character, so occurrences are counted with plain substring search instead.
 *
 * @param {string[]} terms - Candidate terms; expected to be lower-case already.
 * @param {string} content - Text to score the terms against.
 * @returns {{ term: string, score: number }[]} Matching terms, highest score first.
 */
function scoreTerms(terms, content) {
  const contentLower = content.toLowerCase();
  const contentLen = content.length;
  const k1 = 1.2;
  // Loop-invariant denominator part of the saturation formula.
  const lengthNorm = k1 * (.25 + .75 * (contentLen / 500));
  return terms
    .filter((term) => contentLower.includes(term))
    .map((term) => {
      const tf = countOccurrences(contentLower, term);
      const tfNorm = tf * (k1 + 1) / (tf + lengthNorm);
      const stopwordPenalty = STOPWORDS.has(term) ? .1 : 1;
      const lengthBoost = Math.min(term.length / 5, 1.5);
      return {
        term,
        score: tfNorm * stopwordPenalty * lengthBoost
      };
    })
    .sort((a, b) => b.score - a.score);
}
89
/**
 * Pick the window of lines in `content` that best matches `query`.
 *
 * The query is lower-cased, split on whitespace, stripped of words of two
 * characters or fewer, and de-duplicated. The remaining words are scored
 * against the whole content via `scoreTerms`; the (up to) five best-scoring
 * terms are returned as `highlights`.
 *
 * Snippet selection:
 *  - content with at most `2 * contextLines + 1` lines is returned unchanged;
 *  - otherwise each line is scored by summing the scores of the query words
 *    it contains, and the snippet is the best line plus `contextLines` lines
 *    on each side (clamped to the content bounds);
 *  - if no line matches, the first `2 * contextLines + 1` lines are returned.
 *
 * @param {string} content - Full text to extract from.
 * @param {string} query - Search query; a null/undefined query is treated as empty.
 * @param {number} [contextLines=2] - Lines of context kept on each side of the best line.
 * @returns {{ snippet: string, highlights: string[] }}
 */
function extractSnippet(content, query, contextLines = 2) {
  const lines = content.split("\n");
  const totalContext = contextLines * 2 + 1;
  // De-duplicate so a repeated query word is neither highlighted twice nor
  // counted twice when scoring a line. Set preserves first-seen order.
  const queryWords = [...new Set(
    String(query ?? "").toLowerCase().split(/\s+/).filter((w) => w.length > 2)
  )];
  const scoredTerms = scoreTerms(queryWords, content);
  const highlights = scoredTerms.slice(0, 5).map((t) => t.term);

  // Short content: nothing to trim.
  if (lines.length <= totalContext) {
    return { snippet: content, highlights };
  }

  const termScores = new Map(scoredTerms.map((t) => [t.term, t.score]));
  let bestIdx = 0;
  let bestScore = 0;
  for (let i = 0; i < lines.length; i++) {
    const lineLower = lines[i].toLowerCase();
    let score = 0;
    for (const word of queryWords) {
      if (lineLower.includes(word)) score += termScores.get(word) || 1;
    }
    // Strict comparison keeps the earliest line on ties.
    if (score > bestScore) {
      bestScore = score;
      bestIdx = i;
    }
  }

  // No line matched any query word: fall back to the head of the content.
  if (bestScore === 0) {
    return {
      snippet: lines.slice(0, totalContext).join("\n"),
      highlights
    };
  }

  const start = Math.max(0, bestIdx - contextLines);
  const end = Math.min(lines.length, bestIdx + contextLines + 1);
  return {
    snippet: lines.slice(start, end).join("\n"),
    highlights
  };
}
122
+ export { extractSnippet as t };
@@ -1,4 +1,5 @@
1
1
  import { resolveEmbedding } from "../embeddings/resolve.mjs";
2
+ import { t as extractSnippet } from "../_chunks/extract-snippet.mjs";
2
3
  async function cloudflare(config) {
3
4
  const { binding } = config;
4
5
  if (!binding) throw new Error("[cloudflare] binding is required");
@@ -33,7 +34,14 @@ async function cloudflare(config) {
33
34
  id: m.id,
34
35
  score: Math.max(0, Math.min(1, m.score))
35
36
  };
36
- if (returnContent && m.metadata?._content) result.content = m.metadata._content;
37
+ if (returnContent && m.metadata?._content) {
38
+ const { snippet, highlights } = extractSnippet(m.metadata._content, query);
39
+ result.content = snippet;
40
+ if (highlights.length) result._meta = {
41
+ ...result._meta,
42
+ highlights
43
+ };
44
+ }
37
45
  if (returnMetadata && m.metadata) {
38
46
  const { _content, ...rest } = m.metadata;
39
47
  if (Object.keys(rest).length > 0) result.metadata = rest;
@@ -1,4 +1,5 @@
1
1
  import { resolveEmbedding } from "../embeddings/resolve.mjs";
2
+ import { t as extractSnippet } from "../_chunks/extract-snippet.mjs";
2
3
  import { createClient } from "@libsql/client";
3
4
  async function libsql(config) {
4
5
  const url = config.url || config.path || "file:vectors.db";
@@ -63,7 +64,14 @@ async function libsql(config) {
63
64
  id: row.id,
64
65
  score: Math.max(0, 1 - row.distance)
65
66
  };
66
- if (returnContent && row.content) result.content = row.content;
67
+ if (returnContent && row.content) {
68
+ const { snippet, highlights } = extractSnippet(row.content, query);
69
+ result.content = snippet;
70
+ if (highlights.length) result._meta = {
71
+ ...result._meta,
72
+ highlights
73
+ };
74
+ }
67
75
  if (returnMetadata && row.metadata) result.metadata = JSON.parse(row.metadata);
68
76
  return result;
69
77
  });
@@ -1,4 +1,5 @@
1
1
  import { resolveEmbedding } from "../embeddings/resolve.mjs";
2
+ import { t as extractSnippet } from "../_chunks/extract-snippet.mjs";
2
3
  import pg from "pg";
3
4
  async function pgvector(config) {
4
5
  const { url, table = "vectors", metric = "cosine" } = config;
@@ -59,7 +60,14 @@ async function pgvector(config) {
59
60
  id: row.id,
60
61
  score
61
62
  };
62
- if (returnContent && row.content) searchResult.content = row.content;
63
+ if (returnContent && row.content) {
64
+ const { snippet, highlights } = extractSnippet(row.content, query);
65
+ searchResult.content = snippet;
66
+ if (highlights.length) searchResult._meta = {
67
+ ...searchResult._meta,
68
+ highlights
69
+ };
70
+ }
63
71
  if (returnMetadata && row.metadata) searchResult.metadata = row.metadata;
64
72
  return searchResult;
65
73
  });
@@ -1,3 +1,4 @@
1
+ import { t as extractSnippet } from "../_chunks/extract-snippet.mjs";
1
2
  import { mkdirSync } from "node:fs";
2
3
  import { dirname } from "node:path";
3
4
  async function sqliteFts(config = {}) {
@@ -49,7 +50,14 @@ async function sqliteFts(config = {}) {
49
50
  id: row.id,
50
51
  score: normalizedScore
51
52
  };
52
- if (returnContent && row.content) result.content = row.content;
53
+ if (returnContent && row.content) {
54
+ const { snippet, highlights } = extractSnippet(row.content, query);
55
+ result.content = snippet;
56
+ if (highlights.length) result._meta = {
57
+ ...result._meta,
58
+ highlights
59
+ };
60
+ }
53
61
  if (returnMetadata && row.metadata) result.metadata = JSON.parse(row.metadata);
54
62
  return result;
55
63
  });
@@ -1,4 +1,5 @@
1
1
  import { resolveEmbedding } from "../embeddings/resolve.mjs";
2
+ import { t as extractSnippet } from "../_chunks/extract-snippet.mjs";
2
3
  import { mkdirSync } from "node:fs";
3
4
  import { dirname } from "node:path";
4
5
  import * as sqliteVecExt from "sqlite-vec";
@@ -73,7 +74,14 @@ async function sqliteVec(config) {
73
74
  id: meta.id,
74
75
  score: 1 / (1 + row.distance)
75
76
  };
76
- if (returnContent && meta.content) result.content = meta.content;
77
+ if (returnContent && meta.content) {
78
+ const { snippet, highlights } = extractSnippet(meta.content, query);
79
+ result.content = snippet;
80
+ if (highlights.length) result._meta = {
81
+ ...result._meta,
82
+ highlights
83
+ };
84
+ }
77
85
  if (returnMetadata && meta.metadata) result.metadata = JSON.parse(meta.metadata);
78
86
  return result;
79
87
  }).filter(Boolean);
@@ -1,3 +1,4 @@
1
+ import { t as extractSnippet } from "../_chunks/extract-snippet.mjs";
1
2
  import { Index } from "@upstash/vector";
2
3
  async function upstash(config) {
3
4
  const { url, token, namespace } = config;
@@ -34,7 +35,14 @@ async function upstash(config) {
34
35
  id: m.id,
35
36
  score: Math.max(0, Math.min(1, m.score))
36
37
  };
37
- if (returnContent && m.metadata?._content) result.content = m.metadata._content;
38
+ if (returnContent && m.metadata?._content) {
39
+ const { snippet, highlights } = extractSnippet(m.metadata._content, query);
40
+ result.content = snippet;
41
+ if (highlights.length) result._meta = {
42
+ ...result._meta,
43
+ highlights
44
+ };
45
+ }
38
46
  if (returnMetadata && m.metadata) {
39
47
  const { _content, ...rest } = m.metadata;
40
48
  if (Object.keys(rest).length > 0) result.metadata = rest;
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "retriv",
3
3
  "type": "module",
4
- "version": "0.2.0",
4
+ "version": "0.3.0",
5
5
  "description": "Unified document retrieval for search - semantic, full-text, and fuzzy.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -159,6 +159,7 @@
159
159
  "lint": "eslint .",
160
160
  "typecheck": "tsc --noEmit",
161
161
  "test": "vitest --project unit",
162
+ "test:e2e": "vitest run --project e2e",
162
163
  "test:eval": "vitest run --project eval",
163
164
  "test:attw": "attw --pack",
164
165
  "release": "pnpm build && bumpp --output=CHANGELOG.md"