hybrid-search-pgvector 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,41 @@
1
+ # hybrid-search-pgvector
2
+
3
+ Hybrid semantic and keyword search for Postgres with `pgvector`.
4
+
5
+ ## Features
6
+
7
+ - Combines vector similarity and full-text search
8
+ - Uses Reciprocal Rank Fusion
9
+ - Supports metadata and tag filters
10
+ - Includes a simple upsert helper
11
+
12
+ ## Install
13
+
14
+ ```bash
15
+ npm install hybrid-search-pgvector pg
16
+ ```
17
+
18
+ ## Usage
19
+
20
+ ```ts
21
+ import { createHybridSearch } from "hybrid-search-pgvector";
22
+
23
+ const search = createHybridSearch({
24
+ pool,
25
+ embedFn: (text) => embed(text),
26
+ table: "documents",
27
+ });
28
+
29
+ const results = await search.search({
30
+ query: "manufacturing automation roadmap",
31
+ limit: 10,
32
+ });
33
+ ```
34
+
35
+ ## Requirements
36
+
37
+ - PostgreSQL
38
+ - `pgvector`
39
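+ - a table with `id`, `content` (text), `embedding` (vector), `metadata` (JSONB), `tags` (text[]), `source`, `created_at` (timestamp), and `external_id` (used by `upsert` for deduplication) columns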
40
+
41
+ Best for AI memory stores, document search, and retrieval-augmented applications.
package/package.json ADDED
@@ -0,0 +1,29 @@
1
+ {
2
+ "name": "hybrid-search-pgvector",
3
+ "version": "1.0.0",
4
+ "description": "Hybrid semantic + keyword search over a pgvector Postgres store using Reciprocal Rank Fusion",
5
+ "license": "MIT",
6
+ "type": "module",
7
+ "main": "./dist/index.js",
8
+ "types": "./dist/index.d.ts",
9
+ "exports": {
10
+ ".": {
11
+ "import": "./dist/index.js",
12
+ "types": "./dist/index.d.ts"
13
+ }
14
+ },
15
+ "scripts": {
16
+ "build": "tsc",
17
+ "dev": "tsc --watch",
18
+ "typecheck": "tsc --noEmit --pretty false"
19
+ },
20
+ "keywords": ["pgvector", "semantic-search", "hybrid-search", "rrf", "postgres", "embeddings"],
21
+ "peerDependencies": {
22
+ "pg": ">=8"
23
+ },
24
+ "devDependencies": {
25
+ "@types/pg": "^8",
26
+ "@types/node": "^22.0.0",
27
+ "typescript": "^5"
28
+ }
29
+ }
package/src/index.ts ADDED
@@ -0,0 +1,215 @@
1
+ /**
2
+ * hybrid-search-pgvector
3
+ *
4
+ * Hybrid semantic + keyword search over a Postgres + pgvector table.
5
+ * Uses Reciprocal Rank Fusion (RRF) to combine cosine similarity (pgvector)
6
+ * with full-text rank (tsvector GIN) plus a mild recency boost.
7
+ *
8
+ * Requires a Postgres table with this shape:
9
+ * CREATE TABLE thoughts (
10
+ * id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
11
+ * content TEXT NOT NULL,
12
+ * embedding vector(768), -- or your dimension
13
+ * metadata JSONB DEFAULT '{}',
14
+ * tags TEXT[] DEFAULT '{}',
15
+ * source TEXT, external_id TEXT UNIQUE, -- external_id is used by upsert() for deduplication
16
+ * created_at TIMESTAMPTZ DEFAULT now()
17
+ * );
18
+ * CREATE INDEX ON thoughts USING ivfflat (embedding vector_cosine_ops);
19
+ * CREATE INDEX ON thoughts USING GIN (to_tsvector('english', content));
20
+ *
21
+ * Usage:
22
+ * import { createHybridSearch } from "hybrid-search-pgvector";
23
+ *
24
+ * const search = createHybridSearch({
25
+ * pool, // node-postgres Pool
26
+ * embedFn: text => myEmbed(text), // any (text) => Promise<number[]>
27
+ * table: "thoughts", // optional, default "thoughts"
28
+ * });
29
+ *
30
+ * const results = await search.search({ query: "project roadmap Q3", limit: 10 });
31
+ */
32
+
33
+ import type { Pool } from "pg";
34
+
35
/** Options accepted by `createHybridSearch`. */
export interface HybridSearchConfig {
  /** node-postgres connection pool for a Postgres database with pgvector. */
  pool: Pool;

  /** Turns a piece of text into its embedding vector. */
  embedFn: (text: string) => Promise<number[]>;

  /** Name of the table to read from and write to; "thoughts" when omitted. */
  table?: string;
}
43
+
44
/** Arguments for a single hybrid search call. */
export interface SearchInput {
  /** Free text; it is embedded for the vector arm and also used as the full-text query. */
  query: string;

  /** Maximum number of rows to return; 10 when omitted. */
  limit?: number;

  /**
   * Optional row filters. Every filter that is set (non-empty) is ANDed into
   * both the semantic and the keyword candidate queries.
   */
  filters?: {
    /** Matches rows where `metadata->>'type'` equals this value. */
    type?: string;
    /** Matches rows whose `source` column equals this value. */
    source?: string;
    /** Matches rows where this value is one of the entries in `tags`. */
    tag?: string;
    /** Matches rows where `metadata->'people'` contains this value (`?` operator). */
    person?: string;
    /** Matches rows where `metadata->'topics'` contains this value (`?` operator). */
    topic?: string;
  };
}
56
+
57
/** One row returned by a hybrid search, as mapped from the query result. */
export interface SearchResult {
  /** Value of the row's `id` column. */
  id: string;

  /** Value of the row's `content` column. */
  content: string;

  /** Value of the row's `metadata` column. */
  metadata: Record<string, unknown>;

  /** Value of the row's `tags` column. */
  tags: string[];

  /** Value of the row's `source` column. */
  source: string;

  /** Value of the row's `created_at` column. */
  created_at: Date;

  /** Fused rank score after the recency adjustment; results are ordered by it, highest first. */
  score: number;
}
66
+
67
/** A document to store through `upsert`. */
export interface UpsertInput {
  /**
   * Stable identifier used for deduplication: when a row with this
   * `external_id` is already stored, the insert is skipped.
   */
  externalId: string;

  /** Text to store; it is also what gets passed to the embedding function. */
  content: string;

  /** Written to the `source` column. */
  source: string;

  /** Written to the `tags` column; an empty list when omitted. */
  tags?: string[];

  /** Written to the `metadata` column as JSON; an empty object when omitted. */
  metadata?: Record<string, unknown>;
}
75
+
76
/** Client object returned by `createHybridSearch`. */
export interface HybridSearch {
  /** Runs a hybrid (vector + full-text) search and resolves to the ranked rows. */
  search(input: SearchInput): Promise<SearchResult[]>;

  /**
   * Stores an item unless its external id is already present.
   * Resolves to `true` when a row was inserted and `false` when it was skipped.
   */
  upsert(item: UpsertInput): Promise<boolean>;
}
80
+
81
/**
 * Create a search + upsert client backed by the given Postgres pool.
 *
 * The table name is spliced directly into SQL text (identifiers cannot be sent
 * as bind parameters), so it is validated here, once, before any query is
 * built. Accepted forms are a plain identifier (`documents`), a double-quoted
 * identifier (`"My Docs"`), or dot-separated combinations of those
 * (`public.documents`). The name is used exactly as given — it is not
 * re-quoted — so Postgres case folding behaves as it would in hand-written SQL.
 *
 * @param config - Pool, embedding function and optional table name
 *   (defaults to "thoughts").
 * @returns An object whose `search` and `upsert` methods are bound to `config`.
 * @throws {Error} If `config.table` is not a valid identifier as described above.
 */
export function createHybridSearch(config: HybridSearchConfig): HybridSearch {
  const table = config.table ?? "thoughts";

  // One identifier part: unquoted (letters, digits, _, $, non-ASCII; not starting
  // with a digit or $) or double-quoted with "" as the escaped quote.
  const tableNamePattern =
    /^(?:[A-Za-z_\u0080-\uFFFF][A-Za-z0-9_$\u0080-\uFFFF]*|"(?:[^"]|"")+")(?:\.(?:[A-Za-z_\u0080-\uFFFF][A-Za-z0-9_$\u0080-\uFFFF]*|"(?:[^"]|"")+"))*$/;

  if (typeof table !== "string" || !tableNamePattern.test(table)) {
    throw new Error(
      `Invalid table name ${JSON.stringify(table)}: expected a plain or double-quoted SQL identifier, optionally schema-qualified`,
    );
  }

  return {
    search: (input) => hybridSearch(config, table, input),
    upsert: (item) => upsertItem(config, table, item),
  };
}
92
+
93
+ // ── Internal implementation ────────────────────────────────────────────────────
94
+
95
/**
 * Run one hybrid search against `table`.
 *
 * Two candidate lists are built — nearest neighbours by `embedding <=> query
 * vector`, and full-text matches ordered by `ts_rank` — and merged with
 * Reciprocal Rank Fusion (each list contributes `1 / (60 + rank)`). The fused
 * score is then scaled by `0.8 + 0.2 * exp(-age / 90 days)`, where age is
 * measured from `NOW()` to the row's `created_at`.
 *
 * @param config - Supplies the pool and the embedding function.
 * @param table - Table name; it is interpolated into the SQL text as-is, so it
 *   must be a trusted identifier.
 * @param input - Query text, optional `limit` (default 10) and optional filters.
 * @returns Rows ordered by descending score; an empty array when the embedding
 *   function yields no vector.
 * @throws {RangeError} If `limit` is not a non-negative integer.
 */
async function hybridSearch(
  config: HybridSearchConfig,
  table: string,
  input: SearchInput,
): Promise<SearchResult[]> {
  const { query, limit = 10, filters = {} } = input;

  // Reject bad limits before paying for an embedding; the database would
  // otherwise refuse them with a far less helpful message.
  if (!Number.isInteger(limit) || limit < 0) {
    throw new RangeError(`limit must be a non-negative integer, got ${String(limit)}`);
  }

  const embedding = await config.embedFn(query);
  if (!embedding?.length) return [];

  // $1 = query vector, $2 = full-text query; every later value is appended in
  // order and addressed by the placeholder that bind() hands back.
  const params: unknown[] = [`[${embedding.join(",")}]`, query];
  const bind = (value: unknown): string => {
    params.push(value);
    return `$${params.length}`;
  };

  const conditions: string[] = [];
  if (filters.type) conditions.push(`metadata->>'type' = ${bind(filters.type)}`);
  if (filters.person) conditions.push(`metadata->'people' ? ${bind(filters.person)}`);
  if (filters.topic) conditions.push(`metadata->'topics' ? ${bind(filters.topic)}`);
  if (filters.source) conditions.push(`source = ${bind(filters.source)}`);
  if (filters.tag) conditions.push(`${bind(filters.tag)} = ANY(tags)`);

  // The same filter text (and therefore the same placeholders) is reused in
  // both candidate queries.
  const filterSQL = conditions.length > 0 ? `AND ${conditions.join(" AND ")}` : "";

  const limitParam = bind(limit);
  // Each arm keeps at least 40 candidates, and more when the caller asks for
  // more rows than that, so a large limit is not silently capped.
  const candidateParam = bind(Math.max(40, limit));

  // ORDER BY rank in each CTE makes LIMIT keep the best-ranked candidates;
  // without it the retained subset is unspecified.
  const sql = `
    WITH semantic AS (
      SELECT id,
             ROW_NUMBER() OVER (ORDER BY embedding <=> $1::vector) AS rank
      FROM ${table}
      WHERE embedding IS NOT NULL
        ${filterSQL}
      ORDER BY rank
      LIMIT ${candidateParam}
    ),
    keyword AS (
      SELECT id,
             ROW_NUMBER() OVER (
               ORDER BY ts_rank(to_tsvector('english', content),
                                plainto_tsquery('english', $2)) DESC
             ) AS rank
      FROM ${table}
      WHERE to_tsvector('english', content) @@ plainto_tsquery('english', $2)
        ${filterSQL}
      ORDER BY rank
      LIMIT ${candidateParam}
    ),
    rrf AS (
      SELECT
        COALESCE(s.id, k.id) AS id,
        COALESCE(1.0 / (60.0 + s.rank), 0) +
        COALESCE(1.0 / (60.0 + k.rank), 0) AS rrf_score
      FROM semantic s
      FULL OUTER JOIN keyword k ON s.id = k.id
    )
    SELECT
      t.id, t.content, t.metadata, t.tags, t.source, t.created_at,
      r.rrf_score * (0.8 + 0.2 * EXP(
        -EXTRACT(EPOCH FROM (NOW() - t.created_at)) / (90.0 * 86400)
      )) AS score
    FROM rrf r
    JOIN ${table} t ON t.id = r.id
    ORDER BY score DESC
    LIMIT ${limitParam}
  `;

  const result = await config.pool.query(sql, params);

  return result.rows.map((row) => ({
    id: row.id,
    content: row.content,
    metadata: row.metadata,
    tags: row.tags,
    source: row.source,
    created_at: row.created_at,
    // The driver may hand the computed score back as a string.
    score: parseFloat(row.score),
  }));
}
188
+
189
/**
 * Insert `item` into `table` unless a row with the same external id exists.
 *
 * A lookup on `external_id` runs first so that known duplicates never trigger
 * an embedding call. The insert itself uses `ON CONFLICT DO NOTHING`, so when
 * the table has a unique constraint on `external_id` and another writer stores
 * the same item between the lookup and the insert, this resolves to `false`
 * instead of failing with a unique-violation error.
 *
 * @param config - Supplies the pool and the embedding function.
 * @param table - Table name; it is interpolated into the SQL text as-is, so it
 *   must be a trusted identifier.
 * @param item - Content, source, external id and optional tags/metadata.
 * @returns `true` when a row was inserted, `false` when the item was skipped.
 * @throws {Error} If the embedding function returns no vector for the content.
 */
async function upsertItem(
  config: HybridSearchConfig,
  table: string,
  item: UpsertInput,
): Promise<boolean> {
  const { pool, embedFn } = config;

  const existing = await pool.query(
    `SELECT 1 FROM ${table} WHERE external_id = $1`,
    [item.externalId],
  );
  if (existing.rows.length > 0) return false;

  const embedding = await embedFn(item.content);
  // An empty vector would be sent as "[]" and be rejected by the database with
  // an opaque error; fail here with a message that names the real cause.
  if (!embedding?.length) {
    throw new Error(
      `embedFn returned an empty embedding for item ${JSON.stringify(item.externalId)}; nothing was inserted`,
    );
  }

  const inserted = await pool.query(
    `INSERT INTO ${table} (content, embedding, metadata, tags, source, external_id)
     VALUES ($1, $2::vector, $3, $4, $5, $6)
     ON CONFLICT DO NOTHING`,
    [
      item.content,
      `[${embedding.join(",")}]`,
      JSON.stringify(item.metadata ?? {}),
      item.tags ?? [],
      item.source,
      item.externalId,
    ],
  );

  // rowCount is 0 when the conflict clause suppressed the insert.
  return (inserted.rowCount ?? 0) > 0;
}
package/tsconfig.json ADDED
@@ -0,0 +1,15 @@
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ES2022",
4
+ "module": "NodeNext",
5
+ "moduleResolution": "NodeNext",
6
+ "outDir": "./dist",
7
+ "declaration": true,
8
+ "declarationMap": true,
9
+ "sourceMap": true,
10
+ "strict": true,
11
+ "esModuleInterop": true,
12
+ "skipLibCheck": true
13
+ },
14
+ "include": ["src"]
15
+ }