hybrid-search-pgvector 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +41 -0
- package/package.json +29 -0
- package/src/index.ts +215 -0
- package/tsconfig.json +15 -0
package/README.md
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# hybrid-search-pgvector
|
|
2
|
+
|
|
3
|
+
Hybrid semantic and keyword search for Postgres with `pgvector`.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- Combines vector similarity and full-text search
|
|
8
|
+
- Uses Reciprocal Rank Fusion
|
|
9
|
+
- Supports metadata and tag filters
|
|
10
|
+
- Includes a simple upsert helper
|
|
11
|
+
|
|
12
|
+
## Install
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
npm install hybrid-search-pgvector pg
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Usage
|
|
19
|
+
|
|
20
|
+
```ts
|
|
21
|
+
import { createHybridSearch } from "hybrid-search-pgvector";
|
|
22
|
+
|
|
23
|
+
const search = createHybridSearch({
|
|
24
|
+
pool,
|
|
25
|
+
embedFn: (text) => embed(text),
|
|
26
|
+
table: "documents",
|
|
27
|
+
});
|
|
28
|
+
|
|
29
|
+
const results = await search.search({
|
|
30
|
+
query: "manufacturing automation roadmap",
|
|
31
|
+
limit: 10,
|
|
32
|
+
});
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Requirements
|
|
36
|
+
|
|
37
|
+
- PostgreSQL
|
|
38
|
+
- `pgvector`
|
|
39
|
+
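- a table with `content` (text), `embedding` (vector), `metadata` (JSONB), `tags` (text array), `source`, and `created_at` (timestamp) columns, plus an `external_id` column if you use the upsert helper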
|
|
40
|
+
|
|
41
|
+
Best for AI memory stores, document search, and retrieval-augmented applications.
|
package/package.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "hybrid-search-pgvector",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Hybrid semantic + keyword search over a pgvector Postgres store using Reciprocal Rank Fusion",
|
|
5
|
+
"license": "MIT",
|
|
6
|
+
"type": "module",
|
|
7
|
+
"main": "./dist/index.js",
|
|
8
|
+
"types": "./dist/index.d.ts",
|
|
9
|
+
"exports": {
|
|
10
|
+
".": {
|
|
11
|
+
"import": "./dist/index.js",
|
|
12
|
+
"types": "./dist/index.d.ts"
|
|
13
|
+
}
|
|
14
|
+
},
|
|
15
|
+
"scripts": {
|
|
16
|
+
"build": "tsc",
|
|
17
|
+
"dev": "tsc --watch",
|
|
18
|
+
"typecheck": "tsc --noEmit --pretty false"
|
|
19
|
+
},
|
|
20
|
+
"keywords": ["pgvector", "semantic-search", "hybrid-search", "rrf", "postgres", "embeddings"],
|
|
21
|
+
"peerDependencies": {
|
|
22
|
+
"pg": ">=8"
|
|
23
|
+
},
|
|
24
|
+
"devDependencies": {
|
|
25
|
+
"@types/pg": "^8",
|
|
26
|
+
"@types/node": "^22.0.0",
|
|
27
|
+
"typescript": "^5"
|
|
28
|
+
}
|
|
29
|
+
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* hybrid-search-pgvector
|
|
3
|
+
*
|
|
4
|
+
* Hybrid semantic + keyword search over a Postgres + pgvector table.
|
|
5
|
+
* Uses Reciprocal Rank Fusion (RRF) to combine cosine similarity (pgvector)
|
|
6
|
+
* with full-text rank (tsvector GIN) plus a mild recency boost.
|
|
7
|
+
*
|
|
8
|
+
* Requires a Postgres table with this shape:
|
|
9
|
+
* CREATE TABLE thoughts (
|
|
10
|
+
* id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
|
11
|
+
* content TEXT NOT NULL,
|
|
12
|
+
* embedding vector(768), -- or your dimension
|
|
13
|
+
* metadata JSONB DEFAULT '{}',
|
|
14
|
+
* tags TEXT[] DEFAULT '{}',
|
|
15
|
+
* source TEXT,
|
|
16
|
+
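* created_at TIMESTAMPTZ DEFAULT now(),
* external_id TEXT UNIQUE -- read and written by upsert(); UNIQUE is recommended, not enforced by this library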
|
|
17
|
+
* );
|
|
18
|
+
* CREATE INDEX ON thoughts USING ivfflat (embedding vector_cosine_ops);
|
|
19
|
+
* CREATE INDEX ON thoughts USING GIN (to_tsvector('english', content));
|
|
20
|
+
*
|
|
21
|
+
* Usage:
|
|
22
|
+
* import { createHybridSearch } from "hybrid-search-pgvector";
|
|
23
|
+
*
|
|
24
|
+
* const search = createHybridSearch({
|
|
25
|
+
* pool, // node-postgres Pool
|
|
26
|
+
* embedFn: text => myEmbed(text), // any (text) => Promise<number[]>
|
|
27
|
+
* table: "thoughts", // optional, default "thoughts"
|
|
28
|
+
* });
|
|
29
|
+
*
|
|
30
|
+
* const results = await search.search({ query: "project roadmap Q3", limit: 10 });
|
|
31
|
+
*/
|
|
32
|
+
|
|
33
|
+
import type { Pool } from "pg";
|
|
34
|
+
|
|
35
|
+
/**
 * Options accepted by `createHybridSearch`.
 */
export interface HybridSearchConfig {
  /**
   * Name of the table that is searched and inserted into.
   * When omitted, "thoughts" is used.
   */
  table?: string;

  /** Asynchronous function that turns a piece of text into its embedding vector. */
  embedFn: (text: string) => Promise<number[]>;

  /** node-postgres `Pool` connected to a Postgres database with pgvector available. */
  pool: Pool;
}
|
|
43
|
+
|
|
44
|
+
/**
 * Arguments for a single hybrid search call.
 */
export interface SearchInput {
  /** Text that is both embedded for the vector search and used as the full-text query. */
  query: string;

  /** Maximum number of rows to return; the search uses 10 when this is omitted. */
  limit?: number;

  /**
   * Optional row filters. Every filter that is set to a non-empty string is
   * applied, and all applied filters are ANDed together.
   */
  filters?: {
    /** Matches rows where `metadata->>'type'` equals this value. */
    type?: string;
    /** Matches rows whose `source` column equals this value. */
    source?: string;
    /** Matches rows whose `tags` array contains this value. */
    tag?: string;
    /** Matches rows where `metadata->'people'` contains this value (jsonb `?` operator). */
    person?: string;
    /** Matches rows where `metadata->'topics'` contains this value (jsonb `?` operator). */
    topic?: string;
  };
}
|
|
56
|
+
|
|
57
|
+
/**
 * One row returned by a hybrid search, mirroring the selected table columns
 * plus the computed relevance score.
 */
export interface SearchResult {
  /** Value of the row's `id` column. */
  id: string;
  /** Value of the row's `content` column. */
  content: string;
  /** Value of the row's `metadata` column. */
  metadata: Record<string, unknown>;
  /** Value of the row's `tags` column. */
  tags: string[];
  /** Value of the row's `source` column. */
  source: string;
  /** Value of the row's `created_at` column. */
  created_at: Date;
  /** Fused rank score after the recency weighting; higher means more relevant. */
  score: number;
}
|
|
66
|
+
|
|
67
|
+
/**
 * A document to store through the upsert helper.
 */
export interface UpsertInput {
  /**
   * Stable identifier used for deduplication: when a row with the same
   * `external_id` already exists, nothing is inserted.
   */
  externalId: string;

  /** Text to store; it is also the text passed to the embedding function. */
  content: string;

  /** Value written to the `source` column. */
  source: string;

  /** Values for the `tags` column; an empty array is stored when omitted. */
  tags?: string[];

  /** Object stored in the `metadata` column; `{}` is stored when omitted. */
  metadata?: Record<string, unknown>;
}
|
|
75
|
+
|
|
76
|
+
/**
 * Client object returned by `createHybridSearch`.
 */
export interface HybridSearch {
  /**
   * Run a hybrid (vector + full-text) search.
   *
   * @returns Matching rows ordered by descending score.
   */
  search(input: SearchInput): Promise<SearchResult[]>;

  /**
   * Store an item unless one with the same external id already exists.
   *
   * @returns `true` when a row was inserted, `false` when it was skipped
   *   because the external id was already present.
   */
  upsert(item: UpsertInput): Promise<boolean>;
}
|
|
80
|
+
|
|
81
|
+
// One SQL identifier segment: either an unquoted name (letter or underscore,
// then letters, digits, underscores or dollar signs) or a double-quoted name
// in which embedded quotes are doubled.
const TABLE_NAME_SEGMENT = String.raw`(?:[\p{L}_][\p{L}\p{N}_$]*|"(?:[^"]|"")+")`;

// A table name is one or more segments joined by dots (e.g. schema.table).
const TABLE_NAME_PATTERN = new RegExp(
  `^${TABLE_NAME_SEGMENT}(?:\\.${TABLE_NAME_SEGMENT})*$`,
  "u",
);

/** Throws unless `name` is a plain or double-quoted, optionally dotted, SQL identifier. */
function assertValidTableName(name: string): void {
  if (typeof name !== "string" || !TABLE_NAME_PATTERN.test(name)) {
    throw new Error(
      `hybrid-search-pgvector: invalid table name ${JSON.stringify(name)}; ` +
        `expected an SQL identifier such as "thoughts" or "public.thoughts"`,
    );
  }
}

/**
 * Create a search + upsert client backed by the given Postgres pool.
 *
 * The table name (default "thoughts") is interpolated directly into the SQL
 * text of every statement, because identifiers cannot be sent as bind
 * parameters. It is therefore validated once here, so a malformed or hostile
 * name fails immediately with a clear message instead of reaching the database.
 *
 * @param config - Pool, embedding function and optional table name.
 * @returns An object whose `search` and `upsert` methods operate on that table.
 * @throws Error when `config.table` is not a valid SQL identifier.
 */
export function createHybridSearch(config: HybridSearchConfig): HybridSearch {
  const table = config.table ?? "thoughts";
  assertValidTableName(table);

  return {
    search: (input) => hybridSearch(config, table, input),
    upsert: (item) => upsertItem(config, table, item),
  };
}
|
|
92
|
+
|
|
93
|
+
// ── Internal implementation ────────────────────────────────────────────────────
|
|
94
|
+
|
|
95
|
+
/**
 * Run one hybrid search against `table`.
 *
 * The query text is embedded with `config.embedFn`; two candidate lists are
 * then built in SQL — one ranked by cosine distance to the query vector, one
 * ranked by `ts_rank` over the English full-text match — and merged with
 * Reciprocal Rank Fusion (`1 / (60 + rank)` per list). The fused score is
 * multiplied by `0.8 + 0.2 * exp(-age / 90 days)` so newer rows get a mild
 * boost.
 *
 * Each candidate list is explicitly ordered by its rank before it is cut off,
 * so the best-ranked rows are the ones kept. The cut-off is 40 rows per list,
 * or `limit` when that is larger, so a large `limit` is not starved.
 *
 * @param config - Supplies the pool and the embedding function.
 * @param table - Table name, interpolated verbatim into the SQL text.
 * @param input - Query text, optional result limit (default 10) and filters.
 * @returns Rows ordered by descending score; an empty array when the
 *   embedding function yields no vector.
 */
async function hybridSearch(
  config: HybridSearchConfig,
  table: string,
  input: SearchInput,
): Promise<SearchResult[]> {
  const { query, limit = 10, filters = {} } = input;

  const embedding = await config.embedFn(query);
  if (!embedding?.length) return [];

  // Smallest number of candidates each ranker contributes to the fusion.
  const minCandidatesPerRanker = 40;

  // $1 = query vector literal, $2 = raw query text. Further values are
  // appended through bind(), which returns the matching placeholder.
  const params: unknown[] = [`[${embedding.join(",")}]`, query];
  const bind = (value: unknown): string => {
    params.push(value);
    return `$${params.length}`;
  };

  // Only filters with a non-empty value are applied.
  const conditions: string[] = [];
  if (filters.type) conditions.push(`metadata->>'type' = ${bind(filters.type)}`);
  if (filters.person) conditions.push(`metadata->'people' ? ${bind(filters.person)}`);
  if (filters.topic) conditions.push(`metadata->'topics' ? ${bind(filters.topic)}`);
  if (filters.source) conditions.push(`source = ${bind(filters.source)}`);
  if (filters.tag) conditions.push(`${bind(filters.tag)} = ANY(tags)`);

  const filterSQL = conditions.length > 0 ? `AND ${conditions.join(" AND ")}` : "";
  const candidateParam = bind(Math.max(minCandidatesPerRanker, limit));
  const limitParam = bind(limit);

  const sql = `
    WITH semantic AS (
      SELECT id,
             ROW_NUMBER() OVER (ORDER BY embedding <=> $1::vector) AS rank
      FROM ${table}
      WHERE embedding IS NOT NULL
      ${filterSQL}
      ORDER BY rank
      LIMIT ${candidateParam}
    ),
    keyword AS (
      SELECT id,
             ROW_NUMBER() OVER (
               ORDER BY ts_rank(to_tsvector('english', content),
                                plainto_tsquery('english', $2)) DESC
             ) AS rank
      FROM ${table}
      WHERE to_tsvector('english', content) @@ plainto_tsquery('english', $2)
      ${filterSQL}
      ORDER BY rank
      LIMIT ${candidateParam}
    ),
    rrf AS (
      SELECT
        COALESCE(s.id, k.id) AS id,
        COALESCE(1.0 / (60.0 + s.rank), 0) +
        COALESCE(1.0 / (60.0 + k.rank), 0) AS rrf_score
      FROM semantic s
      FULL OUTER JOIN keyword k ON s.id = k.id
    )
    SELECT
      t.id, t.content, t.metadata, t.tags, t.source, t.created_at,
      r.rrf_score * (0.8 + 0.2 * EXP(
        -EXTRACT(EPOCH FROM (NOW() - t.created_at)) / (90.0 * 86400)
      )) AS score
    FROM rrf r
    JOIN ${table} t ON t.id = r.id
    ORDER BY score DESC
    LIMIT ${limitParam}
  `;

  const result = await config.pool.query(sql, params);

  return result.rows.map((row) => ({
    id: row.id,
    content: row.content,
    metadata: row.metadata,
    tags: row.tags,
    source: row.source,
    created_at: row.created_at,
    // The driver may hand the computed score back as a string.
    score: parseFloat(row.score),
  }));
}
|
|
188
|
+
|
|
189
|
+
/**
 * Insert `item` into `table` unless a row with the same `external_id` exists.
 *
 * The existence check runs first so the embedding function is not called for
 * items that are already stored.
 *
 * NOTE(review): the check and the insert are two separate statements, so two
 * concurrent calls with the same external id can both insert. A UNIQUE
 * constraint on `external_id` is the way to rule that out — confirm the table
 * has one if callers run concurrently.
 *
 * @param config - Supplies the pool and the embedding function.
 * @param table - Table name, interpolated verbatim into the SQL text.
 * @param item - Content, source and optional tags/metadata to store.
 * @returns `true` when a row was inserted, `false` when the external id was
 *   already present.
 * @throws Error when the embedding function returns no vector, so an unusable
 *   row is never written.
 */
async function upsertItem(
  config: HybridSearchConfig,
  table: string,
  item: UpsertInput,
): Promise<boolean> {
  const { pool, embedFn } = config;

  // One matching row is enough to know the item is already stored.
  const existing = await pool.query(
    `SELECT 1 FROM ${table} WHERE external_id = $1 LIMIT 1`,
    [item.externalId],
  );
  if (existing.rows.length > 0) return false;

  const embedding = await embedFn(item.content);
  if (!embedding?.length) {
    throw new Error(
      `hybrid-search-pgvector: embedFn returned an empty embedding for external id ` +
        `${JSON.stringify(item.externalId)}; nothing was inserted`,
    );
  }

  await pool.query(
    `INSERT INTO ${table} (content, embedding, metadata, tags, source, external_id)
     VALUES ($1, $2::vector, $3, $4, $5, $6)`,
    [
      item.content,
      `[${embedding.join(",")}]`, // pgvector text literal, cast in the statement
      JSON.stringify(item.metadata ?? {}),
      item.tags ?? [],
      item.source,
      item.externalId,
    ],
  );
  return true;
}
|
package/tsconfig.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
{
|
|
2
|
+
"compilerOptions": {
|
|
3
|
+
"target": "ES2022",
|
|
4
|
+
"module": "NodeNext",
|
|
5
|
+
"moduleResolution": "NodeNext",
|
|
6
|
+
"outDir": "./dist",
|
|
7
|
+
"declaration": true,
|
|
8
|
+
"declarationMap": true,
|
|
9
|
+
"sourceMap": true,
|
|
10
|
+
"strict": true,
|
|
11
|
+
"esModuleInterop": true,
|
|
12
|
+
"skipLibCheck": true
|
|
13
|
+
},
|
|
14
|
+
"include": ["src"]
|
|
15
|
+
}
|