graphile-search 1.12.0 → 1.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/adapters/bm25.d.ts +5 -0
- package/adapters/bm25.js +54 -3
- package/adapters/chunks.d.ts +39 -0
- package/adapters/chunks.js +81 -0
- package/adapters/index.d.ts +2 -0
- package/adapters/index.js +3 -1
- package/adapters/pgvector.js +2 -52
- package/adapters/trgm.d.ts +5 -0
- package/adapters/trgm.js +44 -2
- package/adapters/tsvector.d.ts +5 -0
- package/adapters/tsvector.js +55 -2
- package/esm/adapters/bm25.d.ts +5 -0
- package/esm/adapters/bm25.js +54 -3
- package/esm/adapters/chunks.d.ts +39 -0
- package/esm/adapters/chunks.js +78 -0
- package/esm/adapters/index.d.ts +2 -0
- package/esm/adapters/index.js +1 -0
- package/esm/adapters/pgvector.js +1 -51
- package/esm/adapters/trgm.d.ts +5 -0
- package/esm/adapters/trgm.js +44 -2
- package/esm/adapters/tsvector.d.ts +5 -0
- package/esm/adapters/tsvector.js +55 -2
- package/package.json +5 -5
package/adapters/bm25.d.ts
CHANGED
|
@@ -6,6 +6,11 @@
|
|
|
6
6
|
*
|
|
7
7
|
* Requires the Bm25CodecPlugin to be loaded first (for index discovery).
|
|
8
8
|
* The adapter reads from the bm25IndexStore populated during the gather phase.
|
|
9
|
+
*
|
|
10
|
+
* Supports chunk-aware querying via @hasChunks smart tag: when the parent
|
|
11
|
+
* table has chunks with a BM25 index, the adapter includes a lateral
|
|
12
|
+
* subquery to find the best-matching chunk and returns
|
|
13
|
+
* LEAST(parent_score, chunk_score) (lower = better for BM25).
|
|
9
14
|
*/
|
|
10
15
|
import type { SearchAdapter } from '../types';
|
|
11
16
|
/**
|
package/adapters/bm25.js
CHANGED
|
@@ -7,10 +7,16 @@
|
|
|
7
7
|
*
|
|
8
8
|
* Requires the Bm25CodecPlugin to be loaded first (for index discovery).
|
|
9
9
|
* The adapter reads from the bm25IndexStore populated during the gather phase.
|
|
10
|
+
*
|
|
11
|
+
* Supports chunk-aware querying via @hasChunks smart tag: when the parent
|
|
12
|
+
* table has chunks with a BM25 index, the adapter includes a lateral
|
|
13
|
+
* subquery to find the best-matching chunk and returns
|
|
14
|
+
* LEAST(parent_score, chunk_score) (lower = better for BM25).
|
|
10
15
|
*/
|
|
11
16
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
17
|
exports.createBm25Adapter = createBm25Adapter;
|
|
13
18
|
const bm25_codec_1 = require("../codecs/bm25-codec");
|
|
19
|
+
const chunks_1 = require("./chunks");
|
|
14
20
|
function isTextCodec(codec) {
|
|
15
21
|
const name = codec?.name;
|
|
16
22
|
return name === 'text' || name === 'varchar' || name === 'bpchar';
|
|
@@ -62,7 +68,14 @@ function createBm25Adapter(options = {}) {
|
|
|
62
68
|
const bm25Index = getBm25IndexForAttribute(codec, attributeName, build);
|
|
63
69
|
if (!bm25Index)
|
|
64
70
|
continue;
|
|
65
|
-
|
|
71
|
+
// Check for chunk-aware BM25
|
|
72
|
+
const chunksInfo = (0, chunks_1.getChunksInfo)(codec);
|
|
73
|
+
const hasChunkBm25 = chunksInfo?.searchIndexes.includes('bm25');
|
|
74
|
+
const columnData = {
|
|
75
|
+
bm25Index,
|
|
76
|
+
chunksInfo: hasChunkBm25 ? chunksInfo : undefined,
|
|
77
|
+
};
|
|
78
|
+
columns.push({ attributeName, adapterData: columnData });
|
|
66
79
|
}
|
|
67
80
|
return columns;
|
|
68
81
|
},
|
|
@@ -97,15 +110,53 @@ function createBm25Adapter(options = {}) {
|
|
|
97
110
|
buildFilterApply(sql, alias, column, filterValue, _build) {
|
|
98
111
|
if (filterValue == null)
|
|
99
112
|
return null;
|
|
100
|
-
const { query, threshold } = filterValue;
|
|
113
|
+
const { query, threshold, includeChunks } = filterValue;
|
|
101
114
|
if (!query || typeof query !== 'string' || query.trim().length === 0)
|
|
102
115
|
return null;
|
|
103
|
-
const bm25Index = column.adapterData;
|
|
116
|
+
const columnData = column.adapterData;
|
|
117
|
+
const bm25Index = columnData.bm25Index;
|
|
104
118
|
const columnExpr = sql `${alias}.${sql.identifier(column.attributeName)}`;
|
|
105
119
|
// Use quoteQualifiedIdentifier to produce the qualified index name
|
|
106
120
|
const qualifiedIndexName = `"${bm25Index.schemaName}"."${bm25Index.indexName}"`;
|
|
107
121
|
const bm25queryExpr = sql `to_bm25query(${sql.value(query)}, ${sql.value(qualifiedIndexName)})`;
|
|
108
122
|
const scoreExpr = sql `(${columnExpr} <@> ${bm25queryExpr})`;
|
|
123
|
+
// Check for chunk-aware querying
|
|
124
|
+
const chunksInfo = columnData.chunksInfo;
|
|
125
|
+
if (chunksInfo && chunksInfo.searchIndexes.includes('bm25') && (includeChunks !== false)) {
|
|
126
|
+
const chunksTableRef = chunksInfo.chunksSchema
|
|
127
|
+
? sql `${sql.identifier(chunksInfo.chunksSchema)}.${sql.identifier(chunksInfo.chunksTableName)}`
|
|
128
|
+
: sql `${sql.identifier(chunksInfo.chunksTableName)}`;
|
|
129
|
+
const parentFk = sql.identifier(chunksInfo.parentFkField);
|
|
130
|
+
const chunkContentField = sql.identifier(chunksInfo.contentField);
|
|
131
|
+
const parentId = sql `${alias}.${sql.identifier(chunksInfo.parentPkField)}`;
|
|
132
|
+
const chunksAlias = sql.identifier('__bm25_chunks');
|
|
133
|
+
// BM25 on chunks requires an index name on the chunks table.
|
|
134
|
+
// We construct it from the chunks table schema + a conventional index name.
|
|
135
|
+
// The BM25 index on chunks is named: {chunks_table}_{content_field}_bm25_idx
|
|
136
|
+
const chunksIndexName = `"${chunksInfo.chunksSchema || bm25Index.schemaName}"."${chunksInfo.chunksTableName}_${chunksInfo.contentField}_bm25_idx"`;
|
|
137
|
+
const chunkBm25queryExpr = sql `to_bm25query(${sql.value(query)}, ${sql.value(chunksIndexName)})`;
|
|
138
|
+
const chunkScoreExpr = sql `(${chunksAlias}.${chunkContentField} <@> ${chunkBm25queryExpr})`;
|
|
139
|
+
// Subquery: MIN(bm25_score) across chunks (lower = better for BM25)
|
|
140
|
+
const chunkScoreSubquery = sql `(
|
|
141
|
+
SELECT MIN(${chunkScoreExpr})
|
|
142
|
+
FROM ${chunksTableRef} AS ${chunksAlias}
|
|
143
|
+
WHERE ${chunksAlias}.${parentFk} = ${parentId}
|
|
144
|
+
)`;
|
|
145
|
+
// Combined: LEAST of parent score and best chunk score (lower = better)
|
|
146
|
+
const combinedScoreExpr = sql `LEAST(
|
|
147
|
+
COALESCE(${scoreExpr}, 0::real),
|
|
148
|
+
COALESCE(${chunkScoreSubquery}, 0::real)
|
|
149
|
+
)`;
|
|
150
|
+
let whereClause = null;
|
|
151
|
+
if (threshold !== undefined && threshold !== null) {
|
|
152
|
+
whereClause = sql `${combinedScoreExpr} < ${sql.value(threshold)}`;
|
|
153
|
+
}
|
|
154
|
+
return {
|
|
155
|
+
whereClause,
|
|
156
|
+
scoreExpression: combinedScoreExpr,
|
|
157
|
+
};
|
|
158
|
+
}
|
|
159
|
+
// Standard (non-chunk) query
|
|
109
160
|
let whereClause = null;
|
|
110
161
|
if (threshold !== undefined && threshold !== null) {
|
|
111
162
|
whereClause = sql `${scoreExpr} < ${sql.value(threshold)}`;
|
|
package/adapters/chunks.d.ts
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared @hasChunks smart tag utilities.
|
|
3
|
+
*
|
|
4
|
+
* Extracts chunk table metadata from the @hasChunks smart tag on a codec.
|
|
5
|
+
* Used by pgvector, tsvector, BM25, and trgm adapters to build lateral
|
|
6
|
+
* subqueries against the chunks table for chunk-aware search.
|
|
7
|
+
*/
|
|
8
|
+
/**
|
|
9
|
+
* Chunks table info detected from @hasChunks smart tag.
|
|
10
|
+
*/
|
|
11
|
+
export interface ChunksInfo {
|
|
12
|
+
chunksSchema: string | null;
|
|
13
|
+
chunksTableName: string;
|
|
14
|
+
parentFkField: string;
|
|
15
|
+
parentPkField: string;
|
|
16
|
+
embeddingField: string;
|
|
17
|
+
/** Text content field on chunks table (e.g. "content") */
|
|
18
|
+
contentField: string;
|
|
19
|
+
/** tsvector field on chunks table, if fulltext search is enabled (e.g. "search") */
|
|
20
|
+
searchField: string | null;
|
|
21
|
+
/** Which search indexes are created on the chunks table (e.g. ["fulltext", "bm25"]) */
|
|
22
|
+
searchIndexes: string[];
|
|
23
|
+
}
|
|
24
|
+
/**
|
|
25
|
+
* Read @hasChunks smart tag from codec extensions.
|
|
26
|
+
*
|
|
27
|
+
* The tag value is a JSON object like:
|
|
28
|
+
* {
|
|
29
|
+
* "chunksTable": "documents_chunks",
|
|
30
|
+
* "chunksSchema": "app_private", // optional, defaults to parent table's schema
|
|
31
|
+
* "parentFk": "document_id", // optional, defaults to "parent_id"
|
|
32
|
+
* "parentPk": "id", // optional, defaults to "id"
|
|
33
|
+
* "embeddingField": "embedding", // optional, defaults to "embedding"
|
|
34
|
+
* "contentField": "content", // optional, defaults to "content"
|
|
35
|
+
* "searchField": "search", // optional, null if no fulltext
|
|
36
|
+
* "searchIndexes": ["fulltext","bm25"] // optional, defaults to []
|
|
37
|
+
* }
|
|
38
|
+
*/
|
|
39
|
+
export declare function getChunksInfo(codec: any): ChunksInfo | undefined;
|
|
package/adapters/chunks.js
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Shared @hasChunks smart tag utilities.
|
|
4
|
+
*
|
|
5
|
+
* Extracts chunk table metadata from the @hasChunks smart tag on a codec.
|
|
6
|
+
* Used by pgvector, tsvector, BM25, and trgm adapters to build lateral
|
|
7
|
+
* subqueries against the chunks table for chunk-aware search.
|
|
8
|
+
*/
|
|
9
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
10
|
+
exports.getChunksInfo = getChunksInfo;
|
|
11
|
+
/**
|
|
12
|
+
* Read @hasChunks smart tag from codec extensions.
|
|
13
|
+
*
|
|
14
|
+
* The tag value is a JSON object like:
|
|
15
|
+
* {
|
|
16
|
+
* "chunksTable": "documents_chunks",
|
|
17
|
+
* "chunksSchema": "app_private", // optional, defaults to parent table's schema
|
|
18
|
+
* "parentFk": "document_id", // optional, defaults to "parent_id"
|
|
19
|
+
* "parentPk": "id", // optional, defaults to "id"
|
|
20
|
+
* "embeddingField": "embedding", // optional, defaults to "embedding"
|
|
21
|
+
* "contentField": "content", // optional, defaults to "content"
|
|
22
|
+
* "searchField": "search", // optional, null if no fulltext
|
|
23
|
+
* "searchIndexes": ["fulltext","bm25"] // optional, defaults to []
|
|
24
|
+
* }
|
|
25
|
+
*/
|
|
26
|
+
function getChunksInfo(codec) {
|
|
27
|
+
const tags = codec?.extensions?.tags;
|
|
28
|
+
if (!tags)
|
|
29
|
+
return undefined;
|
|
30
|
+
const raw = tags.hasChunks;
|
|
31
|
+
if (!raw)
|
|
32
|
+
return undefined;
|
|
33
|
+
let parsed;
|
|
34
|
+
if (typeof raw === 'string') {
|
|
35
|
+
try {
|
|
36
|
+
parsed = JSON.parse(raw);
|
|
37
|
+
}
|
|
38
|
+
catch {
|
|
39
|
+
return undefined;
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
else if (typeof raw === 'object') {
|
|
43
|
+
parsed = raw;
|
|
44
|
+
}
|
|
45
|
+
else if (raw === true) {
|
|
46
|
+
return undefined;
|
|
47
|
+
}
|
|
48
|
+
else {
|
|
49
|
+
return undefined;
|
|
50
|
+
}
|
|
51
|
+
if (!parsed.chunksTable)
|
|
52
|
+
return undefined;
|
|
53
|
+
const chunksSchema = parsed.chunksSchema
|
|
54
|
+
|| codec?.extensions?.pg?.schemaName
|
|
55
|
+
|| null;
|
|
56
|
+
// Parse searchIndexes from tag (may be array or JSON string)
|
|
57
|
+
let searchIndexes = [];
|
|
58
|
+
if (Array.isArray(parsed.searchIndexes)) {
|
|
59
|
+
searchIndexes = parsed.searchIndexes;
|
|
60
|
+
}
|
|
61
|
+
else if (typeof parsed.searchIndexes === 'string') {
|
|
62
|
+
try {
|
|
63
|
+
const arr = JSON.parse(parsed.searchIndexes);
|
|
64
|
+
if (Array.isArray(arr))
|
|
65
|
+
searchIndexes = arr;
|
|
66
|
+
}
|
|
67
|
+
catch {
|
|
68
|
+
// ignore
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
return {
|
|
72
|
+
chunksSchema,
|
|
73
|
+
chunksTableName: parsed.chunksTable,
|
|
74
|
+
parentFkField: parsed.parentFk || 'parent_id',
|
|
75
|
+
parentPkField: parsed.parentPk || 'id',
|
|
76
|
+
embeddingField: parsed.embeddingField || 'embedding',
|
|
77
|
+
contentField: parsed.contentField || 'content',
|
|
78
|
+
searchField: parsed.searchField || null,
|
|
79
|
+
searchIndexes,
|
|
80
|
+
};
|
|
81
|
+
}
|
package/adapters/index.d.ts
CHANGED
|
@@ -12,3 +12,5 @@ export { createTrgmAdapter } from './trgm';
|
|
|
12
12
|
export type { TrgmAdapterOptions } from './trgm';
|
|
13
13
|
export { createPgvectorAdapter } from './pgvector';
|
|
14
14
|
export type { PgvectorAdapterOptions } from './pgvector';
|
|
15
|
+
export { getChunksInfo } from './chunks';
|
|
16
|
+
export type { ChunksInfo } from './chunks';
|
package/adapters/index.js
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
* search algorithm. They are plain objects — not Graphile plugins.
|
|
7
7
|
*/
|
|
8
8
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
9
|
-
exports.createPgvectorAdapter = exports.createTrgmAdapter = exports.createBm25Adapter = exports.createTsvectorAdapter = void 0;
|
|
9
|
+
exports.getChunksInfo = exports.createPgvectorAdapter = exports.createTrgmAdapter = exports.createBm25Adapter = exports.createTsvectorAdapter = void 0;
|
|
10
10
|
var tsvector_1 = require("./tsvector");
|
|
11
11
|
Object.defineProperty(exports, "createTsvectorAdapter", { enumerable: true, get: function () { return tsvector_1.createTsvectorAdapter; } });
|
|
12
12
|
var bm25_1 = require("./bm25");
|
|
@@ -15,3 +15,5 @@ var trgm_1 = require("./trgm");
|
|
|
15
15
|
Object.defineProperty(exports, "createTrgmAdapter", { enumerable: true, get: function () { return trgm_1.createTrgmAdapter; } });
|
|
16
16
|
var pgvector_1 = require("./pgvector");
|
|
17
17
|
Object.defineProperty(exports, "createPgvectorAdapter", { enumerable: true, get: function () { return pgvector_1.createPgvectorAdapter; } });
|
|
18
|
+
var chunks_1 = require("./chunks");
|
|
19
|
+
Object.defineProperty(exports, "getChunksInfo", { enumerable: true, get: function () { return chunks_1.getChunksInfo; } });
|
package/adapters/pgvector.js
CHANGED
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
*/
|
|
9
9
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
10
10
|
exports.createPgvectorAdapter = createPgvectorAdapter;
|
|
11
|
+
const chunks_1 = require("./chunks");
|
|
11
12
|
/**
|
|
12
13
|
* Build a distance expression for the given metric.
|
|
13
14
|
* Uses explicit SQL template literals for each operator to avoid sql.raw.
|
|
@@ -26,57 +27,6 @@ function buildDistanceExpr(sql, columnExpr, vectorExpr, metric) {
|
|
|
26
27
|
function isVectorCodec(codec) {
|
|
27
28
|
return codec?.name === 'vector';
|
|
28
29
|
}
|
|
29
|
-
/**
|
|
30
|
-
* Read @hasChunks smart tag from codec extensions.
|
|
31
|
-
* The tag value is a JSON object like:
|
|
32
|
-
* {
|
|
33
|
-
* "chunksTable": "documents_chunks",
|
|
34
|
-
* "chunksSchema": "app_private", // optional, defaults to parent table's schema
|
|
35
|
-
* "parentFk": "document_id", // optional, defaults to "parent_id"
|
|
36
|
-
* "parentPk": "id", // optional, defaults to "id"
|
|
37
|
-
* "embeddingField": "embedding" // optional, defaults to "embedding"
|
|
38
|
-
* }
|
|
39
|
-
*/
|
|
40
|
-
function getChunksInfo(codec) {
|
|
41
|
-
const tags = codec?.extensions?.tags;
|
|
42
|
-
if (!tags)
|
|
43
|
-
return undefined;
|
|
44
|
-
const raw = tags.hasChunks;
|
|
45
|
-
if (!raw)
|
|
46
|
-
return undefined;
|
|
47
|
-
let parsed;
|
|
48
|
-
if (typeof raw === 'string') {
|
|
49
|
-
try {
|
|
50
|
-
parsed = JSON.parse(raw);
|
|
51
|
-
}
|
|
52
|
-
catch {
|
|
53
|
-
// If it's just "true" or a plain string, use convention-based defaults
|
|
54
|
-
return undefined;
|
|
55
|
-
}
|
|
56
|
-
}
|
|
57
|
-
else if (typeof raw === 'object') {
|
|
58
|
-
parsed = raw;
|
|
59
|
-
}
|
|
60
|
-
else if (raw === true) {
|
|
61
|
-
return undefined; // boolean true = no metadata, can't resolve
|
|
62
|
-
}
|
|
63
|
-
else {
|
|
64
|
-
return undefined;
|
|
65
|
-
}
|
|
66
|
-
if (!parsed.chunksTable)
|
|
67
|
-
return undefined;
|
|
68
|
-
// Resolve schema: explicit chunksSchema > parent codec schema > null
|
|
69
|
-
const chunksSchema = parsed.chunksSchema
|
|
70
|
-
|| codec?.extensions?.pg?.schemaName
|
|
71
|
-
|| null;
|
|
72
|
-
return {
|
|
73
|
-
chunksSchema,
|
|
74
|
-
chunksTableName: parsed.chunksTable,
|
|
75
|
-
parentFkField: parsed.parentFk || 'parent_id',
|
|
76
|
-
parentPkField: parsed.parentPk || 'id',
|
|
77
|
-
embeddingField: parsed.embeddingField || 'embedding',
|
|
78
|
-
};
|
|
79
|
-
}
|
|
80
30
|
function createPgvectorAdapter(options = {}) {
|
|
81
31
|
const { filterPrefix = 'vector', defaultMetric = 'COSINE', enableChunkQuerying = true } = options;
|
|
82
32
|
return {
|
|
@@ -96,7 +46,7 @@ function createPgvectorAdapter(options = {}) {
|
|
|
96
46
|
if (!codec?.attributes)
|
|
97
47
|
return [];
|
|
98
48
|
const columns = [];
|
|
99
|
-
const chunksInfo = enableChunkQuerying ? getChunksInfo(codec) : undefined;
|
|
49
|
+
const chunksInfo = enableChunkQuerying ? (0, chunks_1.getChunksInfo)(codec) : undefined;
|
|
100
50
|
for (const [attributeName, attribute] of Object.entries(codec.attributes)) {
|
|
101
51
|
if (isVectorCodec(attribute.codec)) {
|
|
102
52
|
columns.push({
|
package/adapters/trgm.d.ts
CHANGED
|
@@ -3,6 +3,11 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Detects text/varchar columns and generates trigram similarity scoring.
|
|
5
5
|
* Wraps the same SQL logic as graphile-trgm but as a SearchAdapter.
|
|
6
|
+
*
|
|
7
|
+
* Supports chunk-aware querying via @hasChunks smart tag: when the parent
|
|
8
|
+
* table has chunks with a trigram index, the adapter includes a lateral
|
|
9
|
+
* subquery to find the best-matching chunk and returns
|
|
10
|
+
* GREATEST(parent_similarity, chunk_similarity).
|
|
6
11
|
*/
|
|
7
12
|
import type { SearchAdapter } from '../types';
|
|
8
13
|
export interface TrgmAdapterOptions {
|
package/adapters/trgm.js
CHANGED
|
@@ -4,9 +4,15 @@
|
|
|
4
4
|
*
|
|
5
5
|
* Detects text/varchar columns and generates trigram similarity scoring.
|
|
6
6
|
* Wraps the same SQL logic as graphile-trgm but as a SearchAdapter.
|
|
7
|
+
*
|
|
8
|
+
* Supports chunk-aware querying via @hasChunks smart tag: when the parent
|
|
9
|
+
* table has chunks with a trigram index, the adapter includes a lateral
|
|
10
|
+
* subquery to find the best-matching chunk and returns
|
|
11
|
+
* GREATEST(parent_similarity, chunk_similarity).
|
|
7
12
|
*/
|
|
8
13
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
9
14
|
exports.createTrgmAdapter = createTrgmAdapter;
|
|
15
|
+
const chunks_1 = require("./chunks");
|
|
10
16
|
function isTextCodec(codec) {
|
|
11
17
|
const name = codec?.name;
|
|
12
18
|
return name === 'text' || name === 'varchar' || name === 'bpchar';
|
|
@@ -37,7 +43,13 @@ function createTrgmAdapter(options = {}) {
|
|
|
37
43
|
const columns = [];
|
|
38
44
|
for (const [attributeName, attribute] of Object.entries(codec.attributes)) {
|
|
39
45
|
if (isTextCodec(attribute.codec)) {
|
|
40
|
-
|
|
46
|
+
// Store chunks info if available and chunks have trigram search
|
|
47
|
+
const chunksInfo = (0, chunks_1.getChunksInfo)(codec);
|
|
48
|
+
const hasChunkTrgm = chunksInfo?.searchIndexes.includes('trigram');
|
|
49
|
+
columns.push({
|
|
50
|
+
attributeName,
|
|
51
|
+
adapterData: hasChunkTrgm ? chunksInfo : undefined,
|
|
52
|
+
});
|
|
41
53
|
}
|
|
42
54
|
}
|
|
43
55
|
return columns;
|
|
@@ -73,12 +85,42 @@ function createTrgmAdapter(options = {}) {
|
|
|
73
85
|
buildFilterApply(sql, alias, column, filterValue, _build) {
|
|
74
86
|
if (filterValue == null)
|
|
75
87
|
return null;
|
|
76
|
-
const { value, threshold } = filterValue;
|
|
88
|
+
const { value, threshold, includeChunks } = filterValue;
|
|
77
89
|
if (!value || typeof value !== 'string' || value.trim().length === 0)
|
|
78
90
|
return null;
|
|
79
91
|
const th = threshold != null ? threshold : defaultThreshold;
|
|
80
92
|
const columnExpr = sql `${alias}.${sql.identifier(column.attributeName)}`;
|
|
81
93
|
const similarityExpr = sql `similarity(${columnExpr}, ${sql.value(value)})`;
|
|
94
|
+
// Check for chunk-aware querying
|
|
95
|
+
const chunksInfo = column.adapterData;
|
|
96
|
+
if (chunksInfo && chunksInfo.searchIndexes.includes('trigram') && (includeChunks !== false)) {
|
|
97
|
+
const chunksTableRef = chunksInfo.chunksSchema
|
|
98
|
+
? sql `${sql.identifier(chunksInfo.chunksSchema)}.${sql.identifier(chunksInfo.chunksTableName)}`
|
|
99
|
+
: sql `${sql.identifier(chunksInfo.chunksTableName)}`;
|
|
100
|
+
const parentFk = sql.identifier(chunksInfo.parentFkField);
|
|
101
|
+
const chunkContentField = sql.identifier(chunksInfo.contentField);
|
|
102
|
+
const parentId = sql `${alias}.${sql.identifier(chunksInfo.parentPkField)}`;
|
|
103
|
+
const chunksAlias = sql.identifier('__trgm_chunks');
|
|
104
|
+
// Subquery: MAX(similarity) across chunks (higher = better for trgm)
|
|
105
|
+
const chunkSimilaritySubquery = sql `(
|
|
106
|
+
SELECT MAX(similarity(${chunksAlias}.${chunkContentField}, ${sql.value(value)}))
|
|
107
|
+
FROM ${chunksTableRef} AS ${chunksAlias}
|
|
108
|
+
WHERE ${chunksAlias}.${parentFk} = ${parentId}
|
|
109
|
+
AND similarity(${chunksAlias}.${chunkContentField}, ${sql.value(value)}) > ${sql.value(th)}
|
|
110
|
+
)`;
|
|
111
|
+
// Combined: GREATEST of parent similarity and best chunk similarity
|
|
112
|
+
const combinedSimilarityExpr = sql `GREATEST(
|
|
113
|
+
COALESCE(${similarityExpr}, 0::real),
|
|
114
|
+
COALESCE(${chunkSimilaritySubquery}, 0::real)
|
|
115
|
+
)`;
|
|
116
|
+
// WHERE: parent matches OR any chunk matches
|
|
117
|
+
const whereClause = sql `(${similarityExpr} > ${sql.value(th)} OR ${chunkSimilaritySubquery} IS NOT NULL)`;
|
|
118
|
+
return {
|
|
119
|
+
whereClause,
|
|
120
|
+
scoreExpression: combinedSimilarityExpr,
|
|
121
|
+
};
|
|
122
|
+
}
|
|
123
|
+
// Standard (non-chunk) query
|
|
82
124
|
return {
|
|
83
125
|
whereClause: sql `${similarityExpr} > ${sql.value(th)}`,
|
|
84
126
|
scoreExpression: similarityExpr,
|
package/adapters/tsvector.d.ts
CHANGED
|
@@ -3,6 +3,11 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Detects tsvector columns and generates ts_rank-based scoring.
|
|
5
5
|
* Wraps the same SQL logic as graphile-tsvector but as a SearchAdapter.
|
|
6
|
+
*
|
|
7
|
+
* Supports chunk-aware querying via @hasChunks smart tag: when the parent
|
|
8
|
+
* table has chunks with a tsvector search field, the adapter includes a
|
|
9
|
+
* lateral subquery to find the best-matching chunk and returns
|
|
10
|
+
* GREATEST(parent_rank, chunk_rank).
|
|
6
11
|
*/
|
|
7
12
|
import type { SearchAdapter } from '../types';
|
|
8
13
|
export interface TsvectorAdapterOptions {
|
package/adapters/tsvector.js
CHANGED
|
@@ -4,9 +4,15 @@
|
|
|
4
4
|
*
|
|
5
5
|
* Detects tsvector columns and generates ts_rank-based scoring.
|
|
6
6
|
* Wraps the same SQL logic as graphile-tsvector but as a SearchAdapter.
|
|
7
|
+
*
|
|
8
|
+
* Supports chunk-aware querying via @hasChunks smart tag: when the parent
|
|
9
|
+
* table has chunks with a tsvector search field, the adapter includes a
|
|
10
|
+
* lateral subquery to find the best-matching chunk and returns
|
|
11
|
+
* GREATEST(parent_rank, chunk_rank).
|
|
7
12
|
*/
|
|
8
13
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
9
14
|
exports.createTsvectorAdapter = createTsvectorAdapter;
|
|
15
|
+
const chunks_1 = require("./chunks");
|
|
10
16
|
function isTsvectorCodec(codec) {
|
|
11
17
|
// In graphile-build-pg >= 5.0.0-rc.8, the built-in TYPES.tsvector codec
|
|
12
18
|
// has name === 'tsvector' but does NOT have extensions.pg. We need to
|
|
@@ -36,7 +42,14 @@ function createTsvectorAdapter(options = {}) {
|
|
|
36
42
|
const columns = [];
|
|
37
43
|
for (const [attributeName, attribute] of Object.entries(codec.attributes)) {
|
|
38
44
|
if (isTsvectorCodec(attribute.codec)) {
|
|
39
|
-
|
|
45
|
+
// Store chunks info if available and chunks have fulltext search
|
|
46
|
+
const chunksInfo = (0, chunks_1.getChunksInfo)(codec);
|
|
47
|
+
const hasChunkFulltext = chunksInfo?.searchField &&
|
|
48
|
+
chunksInfo.searchIndexes.includes('fulltext');
|
|
49
|
+
columns.push({
|
|
50
|
+
attributeName,
|
|
51
|
+
adapterData: hasChunkFulltext ? chunksInfo : undefined,
|
|
52
|
+
});
|
|
40
53
|
}
|
|
41
54
|
}
|
|
42
55
|
return columns;
|
|
@@ -50,11 +63,51 @@ function createTsvectorAdapter(options = {}) {
|
|
|
50
63
|
buildFilterApply(sql, alias, column, filterValue, _build) {
|
|
51
64
|
if (filterValue == null)
|
|
52
65
|
return null;
|
|
53
|
-
|
|
66
|
+
// Handle includeChunks option when filter is an object
|
|
67
|
+
let val;
|
|
68
|
+
let includeChunks;
|
|
69
|
+
if (typeof filterValue === 'object' && filterValue !== null && 'query' in filterValue) {
|
|
70
|
+
val = typeof filterValue.query === 'string' ? filterValue.query : String(filterValue.query);
|
|
71
|
+
includeChunks = filterValue.includeChunks;
|
|
72
|
+
}
|
|
73
|
+
else {
|
|
74
|
+
val = typeof filterValue === 'string' ? filterValue : String(filterValue);
|
|
75
|
+
}
|
|
54
76
|
if (val.trim().length === 0)
|
|
55
77
|
return null;
|
|
56
78
|
const tsquery = sql `websearch_to_tsquery(${sql.literal(tsConfig)}, ${sql.value(val)})`;
|
|
57
79
|
const columnExpr = sql `${alias}.${sql.identifier(column.attributeName)}`;
|
|
80
|
+
// Check for chunk-aware querying
|
|
81
|
+
const chunksInfo = column.adapterData;
|
|
82
|
+
if (chunksInfo && chunksInfo.searchField && (includeChunks !== false)) {
|
|
83
|
+
const chunksTableRef = chunksInfo.chunksSchema
|
|
84
|
+
? sql `${sql.identifier(chunksInfo.chunksSchema)}.${sql.identifier(chunksInfo.chunksTableName)}`
|
|
85
|
+
: sql `${sql.identifier(chunksInfo.chunksTableName)}`;
|
|
86
|
+
const parentFk = sql.identifier(chunksInfo.parentFkField);
|
|
87
|
+
const chunkSearchField = sql.identifier(chunksInfo.searchField);
|
|
88
|
+
const parentId = sql `${alias}.${sql.identifier(chunksInfo.parentPkField)}`;
|
|
89
|
+
const chunksAlias = sql.identifier('__tsv_chunks');
|
|
90
|
+
// Subquery: MAX(ts_rank) across matching chunks
|
|
91
|
+
const chunkRankSubquery = sql `(
|
|
92
|
+
SELECT MAX(ts_rank(${chunksAlias}.${chunkSearchField}, ${tsquery}))
|
|
93
|
+
FROM ${chunksTableRef} AS ${chunksAlias}
|
|
94
|
+
WHERE ${chunksAlias}.${parentFk} = ${parentId}
|
|
95
|
+
AND ${chunksAlias}.${chunkSearchField} @@ ${tsquery}
|
|
96
|
+
)`;
|
|
97
|
+
const parentRankExpr = sql `ts_rank(${columnExpr}, ${tsquery})`;
|
|
98
|
+
// Combined: GREATEST of parent rank and best chunk rank
|
|
99
|
+
const combinedRankExpr = sql `GREATEST(
|
|
100
|
+
COALESCE(CASE WHEN ${columnExpr} @@ ${tsquery} THEN ${parentRankExpr} ELSE 0::real END, 0::real),
|
|
101
|
+
COALESCE(${chunkRankSubquery}, 0::real)
|
|
102
|
+
)`;
|
|
103
|
+
// WHERE: parent matches OR any chunk matches
|
|
104
|
+
const whereClause = sql `(${columnExpr} @@ ${tsquery} OR ${chunkRankSubquery} IS NOT NULL)`;
|
|
105
|
+
return {
|
|
106
|
+
whereClause,
|
|
107
|
+
scoreExpression: combinedRankExpr,
|
|
108
|
+
};
|
|
109
|
+
}
|
|
110
|
+
// Standard (non-chunk) query
|
|
58
111
|
return {
|
|
59
112
|
whereClause: sql `${columnExpr} @@ ${tsquery}`,
|
|
60
113
|
scoreExpression: sql `ts_rank(${columnExpr}, ${tsquery})`,
|
package/esm/adapters/bm25.d.ts
CHANGED
|
@@ -6,6 +6,11 @@
|
|
|
6
6
|
*
|
|
7
7
|
* Requires the Bm25CodecPlugin to be loaded first (for index discovery).
|
|
8
8
|
* The adapter reads from the bm25IndexStore populated during the gather phase.
|
|
9
|
+
*
|
|
10
|
+
* Supports chunk-aware querying via @hasChunks smart tag: when the parent
|
|
11
|
+
* table has chunks with a BM25 index, the adapter includes a lateral
|
|
12
|
+
* subquery to find the best-matching chunk and returns
|
|
13
|
+
* LEAST(parent_score, chunk_score) (lower = better for BM25).
|
|
9
14
|
*/
|
|
10
15
|
import type { SearchAdapter } from '../types';
|
|
11
16
|
/**
|
package/esm/adapters/bm25.js
CHANGED
|
@@ -6,8 +6,14 @@
|
|
|
6
6
|
*
|
|
7
7
|
* Requires the Bm25CodecPlugin to be loaded first (for index discovery).
|
|
8
8
|
* The adapter reads from the bm25IndexStore populated during the gather phase.
|
|
9
|
+
*
|
|
10
|
+
* Supports chunk-aware querying via @hasChunks smart tag: when the parent
|
|
11
|
+
* table has chunks with a BM25 index, the adapter includes a lateral
|
|
12
|
+
* subquery to find the best-matching chunk and returns
|
|
13
|
+
* LEAST(parent_score, chunk_score) (lower = better for BM25).
|
|
9
14
|
*/
|
|
10
15
|
import { bm25IndexStore as moduleBm25IndexStore } from '../codecs/bm25-codec';
|
|
16
|
+
import { getChunksInfo } from './chunks';
|
|
11
17
|
function isTextCodec(codec) {
|
|
12
18
|
const name = codec?.name;
|
|
13
19
|
return name === 'text' || name === 'varchar' || name === 'bpchar';
|
|
@@ -59,7 +65,14 @@ export function createBm25Adapter(options = {}) {
|
|
|
59
65
|
const bm25Index = getBm25IndexForAttribute(codec, attributeName, build);
|
|
60
66
|
if (!bm25Index)
|
|
61
67
|
continue;
|
|
62
|
-
|
|
68
|
+
// Check for chunk-aware BM25
|
|
69
|
+
const chunksInfo = getChunksInfo(codec);
|
|
70
|
+
const hasChunkBm25 = chunksInfo?.searchIndexes.includes('bm25');
|
|
71
|
+
const columnData = {
|
|
72
|
+
bm25Index,
|
|
73
|
+
chunksInfo: hasChunkBm25 ? chunksInfo : undefined,
|
|
74
|
+
};
|
|
75
|
+
columns.push({ attributeName, adapterData: columnData });
|
|
63
76
|
}
|
|
64
77
|
return columns;
|
|
65
78
|
},
|
|
@@ -94,15 +107,53 @@ export function createBm25Adapter(options = {}) {
|
|
|
94
107
|
buildFilterApply(sql, alias, column, filterValue, _build) {
|
|
95
108
|
if (filterValue == null)
|
|
96
109
|
return null;
|
|
97
|
-
const { query, threshold } = filterValue;
|
|
110
|
+
const { query, threshold, includeChunks } = filterValue;
|
|
98
111
|
if (!query || typeof query !== 'string' || query.trim().length === 0)
|
|
99
112
|
return null;
|
|
100
|
-
const bm25Index = column.adapterData;
|
|
113
|
+
const columnData = column.adapterData;
|
|
114
|
+
const bm25Index = columnData.bm25Index;
|
|
101
115
|
const columnExpr = sql `${alias}.${sql.identifier(column.attributeName)}`;
|
|
102
116
|
// Use quoteQualifiedIdentifier to produce the qualified index name
|
|
103
117
|
const qualifiedIndexName = `"${bm25Index.schemaName}"."${bm25Index.indexName}"`;
|
|
104
118
|
const bm25queryExpr = sql `to_bm25query(${sql.value(query)}, ${sql.value(qualifiedIndexName)})`;
|
|
105
119
|
const scoreExpr = sql `(${columnExpr} <@> ${bm25queryExpr})`;
|
|
120
|
+
// Check for chunk-aware querying
|
|
121
|
+
const chunksInfo = columnData.chunksInfo;
|
|
122
|
+
if (chunksInfo && chunksInfo.searchIndexes.includes('bm25') && (includeChunks !== false)) {
|
|
123
|
+
const chunksTableRef = chunksInfo.chunksSchema
|
|
124
|
+
? sql `${sql.identifier(chunksInfo.chunksSchema)}.${sql.identifier(chunksInfo.chunksTableName)}`
|
|
125
|
+
: sql `${sql.identifier(chunksInfo.chunksTableName)}`;
|
|
126
|
+
const parentFk = sql.identifier(chunksInfo.parentFkField);
|
|
127
|
+
const chunkContentField = sql.identifier(chunksInfo.contentField);
|
|
128
|
+
const parentId = sql `${alias}.${sql.identifier(chunksInfo.parentPkField)}`;
|
|
129
|
+
const chunksAlias = sql.identifier('__bm25_chunks');
|
|
130
|
+
// BM25 on chunks requires an index name on the chunks table.
|
|
131
|
+
// We construct it from the chunks table schema + a conventional index name.
|
|
132
|
+
// The BM25 index on chunks is named: {chunks_table}_{content_field}_bm25_idx
|
|
133
|
+
const chunksIndexName = `"${chunksInfo.chunksSchema || bm25Index.schemaName}"."${chunksInfo.chunksTableName}_${chunksInfo.contentField}_bm25_idx"`;
|
|
134
|
+
const chunkBm25queryExpr = sql `to_bm25query(${sql.value(query)}, ${sql.value(chunksIndexName)})`;
|
|
135
|
+
const chunkScoreExpr = sql `(${chunksAlias}.${chunkContentField} <@> ${chunkBm25queryExpr})`;
|
|
136
|
+
// Subquery: MIN(bm25_score) across chunks (lower = better for BM25)
|
|
137
|
+
const chunkScoreSubquery = sql `(
|
|
138
|
+
SELECT MIN(${chunkScoreExpr})
|
|
139
|
+
FROM ${chunksTableRef} AS ${chunksAlias}
|
|
140
|
+
WHERE ${chunksAlias}.${parentFk} = ${parentId}
|
|
141
|
+
)`;
|
|
142
|
+
// Combined: LEAST of parent score and best chunk score (lower = better)
|
|
143
|
+
const combinedScoreExpr = sql `LEAST(
|
|
144
|
+
COALESCE(${scoreExpr}, 0::real),
|
|
145
|
+
COALESCE(${chunkScoreSubquery}, 0::real)
|
|
146
|
+
)`;
|
|
147
|
+
let whereClause = null;
|
|
148
|
+
if (threshold !== undefined && threshold !== null) {
|
|
149
|
+
whereClause = sql `${combinedScoreExpr} < ${sql.value(threshold)}`;
|
|
150
|
+
}
|
|
151
|
+
return {
|
|
152
|
+
whereClause,
|
|
153
|
+
scoreExpression: combinedScoreExpr,
|
|
154
|
+
};
|
|
155
|
+
}
|
|
156
|
+
// Standard (non-chunk) query
|
|
106
157
|
let whereClause = null;
|
|
107
158
|
if (threshold !== undefined && threshold !== null) {
|
|
108
159
|
whereClause = sql `${scoreExpr} < ${sql.value(threshold)}`;
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared @hasChunks smart tag utilities.
|
|
3
|
+
*
|
|
4
|
+
* Extracts chunk table metadata from the @hasChunks smart tag on a codec.
|
|
5
|
+
* Used by pgvector, tsvector, BM25, and trgm adapters to build lateral
|
|
6
|
+
* subqueries against the chunks table for chunk-aware search.
|
|
7
|
+
*/
|
|
8
|
+
/**
|
|
9
|
+
* Chunks table info detected from @hasChunks smart tag.
|
|
10
|
+
*/
|
|
11
|
+
export interface ChunksInfo {
|
|
12
|
+
/** Schema of the chunks table; null when neither the tag nor the parent codec supplies one */
chunksSchema: string | null;
|
|
13
|
+
/** Name of the chunks table (tag key "chunksTable", e.g. "documents_chunks") */
chunksTableName: string;
|
|
14
|
+
/** Field on the chunks table that refers back to the parent row (tag key "parentFk", defaults to "parent_id") */
parentFkField: string;
|
|
15
|
+
/** Field on the parent table that parentFkField is matched against (tag key "parentPk", defaults to "id") */
parentPkField: string;
|
|
16
|
+
/** Embedding field name (tag key "embeddingField", defaults to "embedding"); presumably a vector column on the chunks table - confirm against the pgvector adapter */
embeddingField: string;
|
|
17
|
+
/** Text content field on chunks table (e.g. "content") */
|
|
18
|
+
contentField: string;
|
|
19
|
+
/** tsvector field on chunks table, if fulltext search is enabled (e.g. "search") */
|
|
20
|
+
searchField: string | null;
|
|
21
|
+
/** Which search indexes are created on the chunks table (e.g. ["fulltext", "bm25"]) */
|
|
22
|
+
searchIndexes: string[];
|
|
23
|
+
}
|
|
24
|
+
/**
|
|
25
|
+
* Read @hasChunks smart tag from codec extensions.
|
|
26
|
+
*
|
|
27
|
+
* The tag value is a JSON object like:
|
|
28
|
+
* {
|
|
29
|
+
* "chunksTable": "documents_chunks",
|
|
30
|
+
* "chunksSchema": "app_private", // optional, defaults to parent table's schema
|
|
31
|
+
* "parentFk": "document_id", // optional, defaults to "parent_id"
|
|
32
|
+
* "parentPk": "id", // optional, defaults to "id"
|
|
33
|
+
* "embeddingField": "embedding", // optional, defaults to "embedding"
|
|
34
|
+
* "contentField": "content", // optional, defaults to "content"
|
|
35
|
+
* "searchField": "search", // optional, null if no fulltext
|
|
36
|
+
* "searchIndexes": ["fulltext","bm25"] // optional, defaults to []
|
|
37
|
+
* }
|
|
38
|
+
*/
|
|
39
|
+
export declare function getChunksInfo(codec: any): ChunksInfo | undefined;
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared @hasChunks smart tag utilities.
|
|
3
|
+
*
|
|
4
|
+
* Extracts chunk table metadata from the @hasChunks smart tag on a codec.
|
|
5
|
+
* Used by pgvector, tsvector, BM25, and trgm adapters to build lateral
|
|
6
|
+
* subqueries against the chunks table for chunk-aware search.
|
|
7
|
+
*/
|
|
8
|
+
/**
 * Read the @hasChunks smart tag from codec extensions and normalize it.
 *
 * The tag value is a JSON object (or a string containing one) like:
 * {
 *   "chunksTable": "documents_chunks",
 *   "chunksSchema": "app_private",        // optional, defaults to parent table's schema
 *   "parentFk": "document_id",            // optional, defaults to "parent_id"
 *   "parentPk": "id",                     // optional, defaults to "id"
 *   "embeddingField": "embedding",        // optional, defaults to "embedding"
 *   "contentField": "content",            // optional, defaults to "content"
 *   "searchField": "search",              // optional, null if no fulltext
 *   "searchIndexes": ["fulltext","bm25"]  // optional, defaults to []
 * }
 *
 * @param {object} [codec] - Codec whose `extensions.tags.hasChunks` is read.
 * @returns {object|undefined} Normalized chunk metadata, or `undefined` when
 *   the tag is missing, is not (or does not parse to) an object, or has no
 *   `chunksTable`.
 */
export function getChunksInfo(codec) {
    const raw = codec?.extensions?.tags?.hasChunks;
    if (!raw)
        return undefined;
    let parsed;
    if (typeof raw === 'string') {
        try {
            parsed = JSON.parse(raw);
        }
        catch {
            // Tag value is not valid JSON, so no metadata can be resolved.
            return undefined;
        }
    }
    else if (typeof raw === 'object') {
        parsed = raw;
    }
    else {
        // Boolean `true` (a bare tag) or any other primitive carries no metadata.
        return undefined;
    }
    // JSON.parse may legitimately return null or a primitive ("null", "42");
    // reading a property off null would throw, so reject non-objects first.
    if (parsed === null || typeof parsed !== 'object' || !parsed.chunksTable)
        return undefined;
    // Resolve schema: explicit chunksSchema > parent codec schema > null
    const chunksSchema = parsed.chunksSchema
        || codec?.extensions?.pg?.schemaName
        || null;
    // Parse searchIndexes from tag (may be array or JSON string)
    let searchIndexes = [];
    if (Array.isArray(parsed.searchIndexes)) {
        searchIndexes = parsed.searchIndexes;
    }
    else if (typeof parsed.searchIndexes === 'string') {
        try {
            const decoded = JSON.parse(parsed.searchIndexes);
            if (Array.isArray(decoded))
                searchIndexes = decoded;
        }
        catch {
            // Malformed searchIndexes string: keep the empty default.
        }
    }
    return {
        chunksSchema,
        chunksTableName: parsed.chunksTable,
        parentFkField: parsed.parentFk || 'parent_id',
        parentPkField: parsed.parentPk || 'id',
        embeddingField: parsed.embeddingField || 'embedding',
        contentField: parsed.contentField || 'content',
        searchField: parsed.searchField || null,
        searchIndexes,
    };
}
|
package/esm/adapters/index.d.ts
CHANGED
|
@@ -12,3 +12,5 @@ export { createTrgmAdapter } from './trgm';
|
|
|
12
12
|
export type { TrgmAdapterOptions } from './trgm';
|
|
13
13
|
export { createPgvectorAdapter } from './pgvector';
|
|
14
14
|
export type { PgvectorAdapterOptions } from './pgvector';
|
|
15
|
+
export { getChunksInfo } from './chunks';
|
|
16
|
+
export type { ChunksInfo } from './chunks';
|
package/esm/adapters/index.js
CHANGED
package/esm/adapters/pgvector.js
CHANGED
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
* pgvector operators (<=> cosine, <-> L2, <#> inner product).
|
|
6
6
|
* Wraps the same SQL logic as graphile-pgvector but as a SearchAdapter.
|
|
7
7
|
*/
|
|
8
|
+
import { getChunksInfo } from './chunks';
|
|
8
9
|
/**
|
|
9
10
|
* Build a distance expression for the given metric.
|
|
10
11
|
* Uses explicit SQL template literals for each operator to avoid sql.raw.
|
|
@@ -23,57 +24,6 @@ function buildDistanceExpr(sql, columnExpr, vectorExpr, metric) {
|
|
|
23
24
|
/** Returns true when the codec's name is exactly 'vector'; a missing codec yields false. */
function isVectorCodec(codec) {
|
|
24
25
|
return codec?.name === 'vector';
|
|
25
26
|
}
|
|
26
|
-
/**
|
|
27
|
-
* Read @hasChunks smart tag from codec extensions.
|
|
28
|
-
* The tag value is a JSON object like:
|
|
29
|
-
* {
|
|
30
|
-
* "chunksTable": "documents_chunks",
|
|
31
|
-
* "chunksSchema": "app_private", // optional, defaults to parent table's schema
|
|
32
|
-
* "parentFk": "document_id", // optional, defaults to "parent_id"
|
|
33
|
-
* "parentPk": "id", // optional, defaults to "id"
|
|
34
|
-
* "embeddingField": "embedding" // optional, defaults to "embedding"
|
|
35
|
-
* }
|
|
36
|
-
*/
|
|
37
|
-
function getChunksInfo(codec) {
|
|
38
|
-
const tags = codec?.extensions?.tags;
|
|
39
|
-
if (!tags)
|
|
40
|
-
return undefined;
|
|
41
|
-
const raw = tags.hasChunks;
|
|
42
|
-
if (!raw)
|
|
43
|
-
return undefined;
|
|
44
|
-
let parsed;
|
|
45
|
-
if (typeof raw === 'string') {
|
|
46
|
-
try {
|
|
47
|
-
parsed = JSON.parse(raw);
|
|
48
|
-
}
|
|
49
|
-
catch {
|
|
50
|
-
// If it's just "true" or a plain string, use convention-based defaults
|
|
51
|
-
return undefined;
|
|
52
|
-
}
|
|
53
|
-
}
|
|
54
|
-
else if (typeof raw === 'object') {
|
|
55
|
-
parsed = raw;
|
|
56
|
-
}
|
|
57
|
-
else if (raw === true) {
|
|
58
|
-
return undefined; // boolean true = no metadata, can't resolve
|
|
59
|
-
}
|
|
60
|
-
else {
|
|
61
|
-
return undefined;
|
|
62
|
-
}
|
|
63
|
-
if (!parsed.chunksTable)
|
|
64
|
-
return undefined;
|
|
65
|
-
// Resolve schema: explicit chunksSchema > parent codec schema > null
|
|
66
|
-
const chunksSchema = parsed.chunksSchema
|
|
67
|
-
|| codec?.extensions?.pg?.schemaName
|
|
68
|
-
|| null;
|
|
69
|
-
return {
|
|
70
|
-
chunksSchema,
|
|
71
|
-
chunksTableName: parsed.chunksTable,
|
|
72
|
-
parentFkField: parsed.parentFk || 'parent_id',
|
|
73
|
-
parentPkField: parsed.parentPk || 'id',
|
|
74
|
-
embeddingField: parsed.embeddingField || 'embedding',
|
|
75
|
-
};
|
|
76
|
-
}
|
|
77
27
|
export function createPgvectorAdapter(options = {}) {
|
|
78
28
|
const { filterPrefix = 'vector', defaultMetric = 'COSINE', enableChunkQuerying = true } = options;
|
|
79
29
|
return {
|
package/esm/adapters/trgm.d.ts
CHANGED
|
@@ -3,6 +3,11 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Detects text/varchar columns and generates trigram similarity scoring.
|
|
5
5
|
* Wraps the same SQL logic as graphile-trgm but as a SearchAdapter.
|
|
6
|
+
*
|
|
7
|
+
* Supports chunk-aware querying via @hasChunks smart tag: when the parent
|
|
8
|
+
* table has chunks with a trigram index, the adapter includes a lateral
|
|
9
|
+
* subquery to find the best-matching chunk and returns
|
|
10
|
+
* GREATEST(parent_similarity, chunk_similarity).
|
|
6
11
|
*/
|
|
7
12
|
import type { SearchAdapter } from '../types';
|
|
8
13
|
export interface TrgmAdapterOptions {
|
package/esm/adapters/trgm.js
CHANGED
|
@@ -3,7 +3,13 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Detects text/varchar columns and generates trigram similarity scoring.
|
|
5
5
|
* Wraps the same SQL logic as graphile-trgm but as a SearchAdapter.
|
|
6
|
+
*
|
|
7
|
+
* Supports chunk-aware querying via @hasChunks smart tag: when the parent
|
|
8
|
+
* table has chunks with a trigram index, the adapter includes a lateral
|
|
9
|
+
* subquery to find the best-matching chunk and returns
|
|
10
|
+
* GREATEST(parent_similarity, chunk_similarity).
|
|
6
11
|
*/
|
|
12
|
+
import { getChunksInfo } from './chunks';
|
|
7
13
|
function isTextCodec(codec) {
|
|
8
14
|
const name = codec?.name;
|
|
9
15
|
return name === 'text' || name === 'varchar' || name === 'bpchar';
|
|
@@ -34,7 +40,13 @@ export function createTrgmAdapter(options = {}) {
|
|
|
34
40
|
const columns = [];
|
|
35
41
|
for (const [attributeName, attribute] of Object.entries(codec.attributes)) {
|
|
36
42
|
if (isTextCodec(attribute.codec)) {
|
|
37
|
-
|
|
43
|
+
// Store chunks info if available and chunks have trigram search
|
|
44
|
+
const chunksInfo = getChunksInfo(codec);
|
|
45
|
+
const hasChunkTrgm = chunksInfo?.searchIndexes.includes('trigram');
|
|
46
|
+
columns.push({
|
|
47
|
+
attributeName,
|
|
48
|
+
adapterData: hasChunkTrgm ? chunksInfo : undefined,
|
|
49
|
+
});
|
|
38
50
|
}
|
|
39
51
|
}
|
|
40
52
|
return columns;
|
|
@@ -70,12 +82,42 @@ export function createTrgmAdapter(options = {}) {
|
|
|
70
82
|
buildFilterApply(sql, alias, column, filterValue, _build) {
|
|
71
83
|
if (filterValue == null)
|
|
72
84
|
return null;
|
|
73
|
-
const { value, threshold } = filterValue;
|
|
85
|
+
const { value, threshold, includeChunks } = filterValue;
|
|
74
86
|
if (!value || typeof value !== 'string' || value.trim().length === 0)
|
|
75
87
|
return null;
|
|
76
88
|
const th = threshold != null ? threshold : defaultThreshold;
|
|
77
89
|
const columnExpr = sql `${alias}.${sql.identifier(column.attributeName)}`;
|
|
78
90
|
const similarityExpr = sql `similarity(${columnExpr}, ${sql.value(value)})`;
|
|
91
|
+
// Check for chunk-aware querying
|
|
92
|
+
const chunksInfo = column.adapterData;
|
|
93
|
+
if (chunksInfo && chunksInfo.searchIndexes.includes('trigram') && (includeChunks !== false)) {
|
|
94
|
+
const chunksTableRef = chunksInfo.chunksSchema
|
|
95
|
+
? sql `${sql.identifier(chunksInfo.chunksSchema)}.${sql.identifier(chunksInfo.chunksTableName)}`
|
|
96
|
+
: sql `${sql.identifier(chunksInfo.chunksTableName)}`;
|
|
97
|
+
const parentFk = sql.identifier(chunksInfo.parentFkField);
|
|
98
|
+
const chunkContentField = sql.identifier(chunksInfo.contentField);
|
|
99
|
+
const parentId = sql `${alias}.${sql.identifier(chunksInfo.parentPkField)}`;
|
|
100
|
+
const chunksAlias = sql.identifier('__trgm_chunks');
|
|
101
|
+
// Subquery: MAX(similarity) across chunks (higher = better for trgm)
|
|
102
|
+
const chunkSimilaritySubquery = sql `(
|
|
103
|
+
SELECT MAX(similarity(${chunksAlias}.${chunkContentField}, ${sql.value(value)}))
|
|
104
|
+
FROM ${chunksTableRef} AS ${chunksAlias}
|
|
105
|
+
WHERE ${chunksAlias}.${parentFk} = ${parentId}
|
|
106
|
+
AND similarity(${chunksAlias}.${chunkContentField}, ${sql.value(value)}) > ${sql.value(th)}
|
|
107
|
+
)`;
|
|
108
|
+
// Combined: GREATEST of parent similarity and best chunk similarity
|
|
109
|
+
const combinedSimilarityExpr = sql `GREATEST(
|
|
110
|
+
COALESCE(${similarityExpr}, 0::real),
|
|
111
|
+
COALESCE(${chunkSimilaritySubquery}, 0::real)
|
|
112
|
+
)`;
|
|
113
|
+
// WHERE: parent matches OR any chunk matches
|
|
114
|
+
const whereClause = sql `(${similarityExpr} > ${sql.value(th)} OR ${chunkSimilaritySubquery} IS NOT NULL)`;
|
|
115
|
+
return {
|
|
116
|
+
whereClause,
|
|
117
|
+
scoreExpression: combinedSimilarityExpr,
|
|
118
|
+
};
|
|
119
|
+
}
|
|
120
|
+
// Standard (non-chunk) query
|
|
79
121
|
return {
|
|
80
122
|
whereClause: sql `${similarityExpr} > ${sql.value(th)}`,
|
|
81
123
|
scoreExpression: similarityExpr,
|
|
@@ -3,6 +3,11 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Detects tsvector columns and generates ts_rank-based scoring.
|
|
5
5
|
* Wraps the same SQL logic as graphile-tsvector but as a SearchAdapter.
|
|
6
|
+
*
|
|
7
|
+
* Supports chunk-aware querying via @hasChunks smart tag: when the parent
|
|
8
|
+
* table has chunks with a tsvector search field, the adapter includes a
|
|
9
|
+
* lateral subquery to find the best-matching chunk and returns
|
|
10
|
+
* GREATEST(parent_rank, chunk_rank).
|
|
6
11
|
*/
|
|
7
12
|
import type { SearchAdapter } from '../types';
|
|
8
13
|
export interface TsvectorAdapterOptions {
|
package/esm/adapters/tsvector.js
CHANGED
|
@@ -3,7 +3,13 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Detects tsvector columns and generates ts_rank-based scoring.
|
|
5
5
|
* Wraps the same SQL logic as graphile-tsvector but as a SearchAdapter.
|
|
6
|
+
*
|
|
7
|
+
* Supports chunk-aware querying via @hasChunks smart tag: when the parent
|
|
8
|
+
* table has chunks with a tsvector search field, the adapter includes a
|
|
9
|
+
* lateral subquery to find the best-matching chunk and returns
|
|
10
|
+
* GREATEST(parent_rank, chunk_rank).
|
|
6
11
|
*/
|
|
12
|
+
import { getChunksInfo } from './chunks';
|
|
7
13
|
function isTsvectorCodec(codec) {
|
|
8
14
|
// In graphile-build-pg >= 5.0.0-rc.8, the built-in TYPES.tsvector codec
|
|
9
15
|
// has name === 'tsvector' but does NOT have extensions.pg. We need to
|
|
@@ -33,7 +39,14 @@ export function createTsvectorAdapter(options = {}) {
|
|
|
33
39
|
const columns = [];
|
|
34
40
|
for (const [attributeName, attribute] of Object.entries(codec.attributes)) {
|
|
35
41
|
if (isTsvectorCodec(attribute.codec)) {
|
|
36
|
-
|
|
42
|
+
// Store chunks info if available and chunks have fulltext search
|
|
43
|
+
const chunksInfo = getChunksInfo(codec);
|
|
44
|
+
const hasChunkFulltext = chunksInfo?.searchField &&
|
|
45
|
+
chunksInfo.searchIndexes.includes('fulltext');
|
|
46
|
+
columns.push({
|
|
47
|
+
attributeName,
|
|
48
|
+
adapterData: hasChunkFulltext ? chunksInfo : undefined,
|
|
49
|
+
});
|
|
37
50
|
}
|
|
38
51
|
}
|
|
39
52
|
return columns;
|
|
@@ -47,11 +60,51 @@ export function createTsvectorAdapter(options = {}) {
|
|
|
47
60
|
buildFilterApply(sql, alias, column, filterValue, _build) {
|
|
48
61
|
if (filterValue == null)
|
|
49
62
|
return null;
|
|
50
|
-
|
|
63
|
+
// Handle includeChunks option when filter is an object
|
|
64
|
+
let val;
|
|
65
|
+
let includeChunks;
|
|
66
|
+
if (typeof filterValue === 'object' && filterValue !== null && 'query' in filterValue) {
|
|
67
|
+
val = typeof filterValue.query === 'string' ? filterValue.query : String(filterValue.query);
|
|
68
|
+
includeChunks = filterValue.includeChunks;
|
|
69
|
+
}
|
|
70
|
+
else {
|
|
71
|
+
val = typeof filterValue === 'string' ? filterValue : String(filterValue);
|
|
72
|
+
}
|
|
51
73
|
if (val.trim().length === 0)
|
|
52
74
|
return null;
|
|
53
75
|
const tsquery = sql `websearch_to_tsquery(${sql.literal(tsConfig)}, ${sql.value(val)})`;
|
|
54
76
|
const columnExpr = sql `${alias}.${sql.identifier(column.attributeName)}`;
|
|
77
|
+
// Check for chunk-aware querying
|
|
78
|
+
const chunksInfo = column.adapterData;
|
|
79
|
+
if (chunksInfo && chunksInfo.searchField && (includeChunks !== false)) {
|
|
80
|
+
const chunksTableRef = chunksInfo.chunksSchema
|
|
81
|
+
? sql `${sql.identifier(chunksInfo.chunksSchema)}.${sql.identifier(chunksInfo.chunksTableName)}`
|
|
82
|
+
: sql `${sql.identifier(chunksInfo.chunksTableName)}`;
|
|
83
|
+
const parentFk = sql.identifier(chunksInfo.parentFkField);
|
|
84
|
+
const chunkSearchField = sql.identifier(chunksInfo.searchField);
|
|
85
|
+
const parentId = sql `${alias}.${sql.identifier(chunksInfo.parentPkField)}`;
|
|
86
|
+
const chunksAlias = sql.identifier('__tsv_chunks');
|
|
87
|
+
// Subquery: MAX(ts_rank) across matching chunks
|
|
88
|
+
const chunkRankSubquery = sql `(
|
|
89
|
+
SELECT MAX(ts_rank(${chunksAlias}.${chunkSearchField}, ${tsquery}))
|
|
90
|
+
FROM ${chunksTableRef} AS ${chunksAlias}
|
|
91
|
+
WHERE ${chunksAlias}.${parentFk} = ${parentId}
|
|
92
|
+
AND ${chunksAlias}.${chunkSearchField} @@ ${tsquery}
|
|
93
|
+
)`;
|
|
94
|
+
const parentRankExpr = sql `ts_rank(${columnExpr}, ${tsquery})`;
|
|
95
|
+
// Combined: GREATEST of parent rank and best chunk rank
|
|
96
|
+
const combinedRankExpr = sql `GREATEST(
|
|
97
|
+
COALESCE(CASE WHEN ${columnExpr} @@ ${tsquery} THEN ${parentRankExpr} ELSE 0::real END, 0::real),
|
|
98
|
+
COALESCE(${chunkRankSubquery}, 0::real)
|
|
99
|
+
)`;
|
|
100
|
+
// WHERE: parent matches OR any chunk matches
|
|
101
|
+
const whereClause = sql `(${columnExpr} @@ ${tsquery} OR ${chunkRankSubquery} IS NOT NULL)`;
|
|
102
|
+
return {
|
|
103
|
+
whereClause,
|
|
104
|
+
scoreExpression: combinedRankExpr,
|
|
105
|
+
};
|
|
106
|
+
}
|
|
107
|
+
// Standard (non-chunk) query
|
|
55
108
|
return {
|
|
56
109
|
whereClause: sql `${columnExpr} @@ ${tsquery}`,
|
|
57
110
|
scoreExpression: sql `ts_rank(${columnExpr}, ${tsquery})`,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "graphile-search",
|
|
3
|
-
"version": "1.12.0",
|
|
3
|
+
"version": "1.13.0",
|
|
4
4
|
"description": "Unified PostGraphile v5 search plugin — abstracts tsvector, BM25, pg_trgm, and pgvector behind a single adapter-based architecture with composite searchScore",
|
|
5
5
|
"author": "Constructive <developers@constructive.io>",
|
|
6
6
|
"homepage": "https://github.com/constructive-io/constructive",
|
|
@@ -31,11 +31,11 @@
|
|
|
31
31
|
"devDependencies": {
|
|
32
32
|
"@types/node": "^22.19.11",
|
|
33
33
|
"@types/pg": "^8.18.0",
|
|
34
|
-
"graphile-connection-filter": "^1.10.
|
|
35
|
-
"graphile-test": "^4.15.
|
|
34
|
+
"graphile-connection-filter": "^1.10.1",
|
|
35
|
+
"graphile-test": "^4.15.1",
|
|
36
36
|
"makage": "^0.3.0",
|
|
37
37
|
"pg": "^8.20.0",
|
|
38
|
-
"pgsql-test": "^4.14.
|
|
38
|
+
"pgsql-test": "^4.14.1"
|
|
39
39
|
},
|
|
40
40
|
"peerDependencies": {
|
|
41
41
|
"@dataplan/pg": "1.0.0",
|
|
@@ -62,5 +62,5 @@
|
|
|
62
62
|
"hybrid-search",
|
|
63
63
|
"searchScore"
|
|
64
64
|
],
|
|
65
|
-
"gitHead": "
|
|
65
|
+
"gitHead": "1aaafe14a8ba4eeeaab099f5fdc69865ce4e2a2e"
|
|
66
66
|
}
|