@lde/search-typesense 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,61 @@
1
+ # @lde/search-typesense
2
+
3
+ [Typesense](https://typesense.org/) engine adapter for RDF-backed search
4
+ pipelines. Engine-specific (Typesense) but domain-agnostic – the caller supplies
5
+ the collection schema and documents.
6
+
7
+ The engine-agnostic half of the pipeline – framing `CONSTRUCT` quads into a
8
+ JSON-LD IR and projecting that IR into flat documents from a declarative field
9
+ spec – lives in [`@lde/search`](../search). This package consumes those
10
+ documents and writes them to Typesense.
11
+
12
+ ## Indexing
13
+
14
+ `rebuild` blue/green-rebuilds a search index in one call: it creates a fresh
15
+ versioned collection (`${schema.name}_<timestamp>`), streams the documents into
16
+ it in batches, atomically repoints the `schema.name` alias to it, then drops the
17
+ collection it superseded. The caller passes only the logical index name (as
18
+ `schema.name`) and a stream of documents; the versioned collection and the alias
19
+ are managed for them.
20
+
21
+ ```ts
22
+ import { Client } from 'typesense';
23
+ import { rebuild } from '@lde/search-typesense';
24
+
25
+ const client = new Client({
26
+ nodes: [{ host, port, protocol: 'https' }],
27
+ apiKey,
28
+ });
29
+
30
+ // `documents` is an async iterable (e.g. a streaming projection); only one
31
+ // batch is held in memory at a time. `rebuild` returns the live collection name
32
+ // and the imported count (or `null` if another rebuild was already running).
33
+ const result = await rebuild(client, schema, documents);
34
+ ```
35
+
36
+ `rebuild` takes a `Client` the caller owns (and reuses for queries), so this
37
+ package adds no connection or document type of its own – any object with an `id`
38
+ is a valid document, including the `SearchDocument`s `@lde/search` produces.
39
+
40
+ ## Concurrency
41
+
42
+ `rebuild` is **single-flight per index**: it first takes a lock (a marker
43
+ document in a `rebuild_locks` collection, created on demand) via Typesense’s
44
+ atomic create, so concurrent callers across pods never rebuild the same index at
45
+ once. A call made while another rebuild for that index is in flight returns
46
+ `null` instead of a result. This keeps blue/green safe under replication: without
47
+ it, two same-millisecond rebuilds would collide on the versioned collection name
48
+ and one would delete the other’s in-flight build.
49
+
50
+ Limitations to design around:
51
+
52
+ - **Advisory, not a strict mutex.** The lock is built on Typesense, not a
53
+ consensus store. Under a TTL-reclaim race two rebuilds can briefly run at
54
+ once; this is safe because blue/green is idempotent (worst case: redundant
55
+ work and a transient orphaned collection).
56
+ - **Single-flight, not coalescing.** A call skipped with `null` is _not_ queued.
57
+ If you must capture state that changed mid-build, re-trigger after the running
58
+ rebuild finishes.
59
+ - **Lock TTL.** A rebuild running longer than `lockTtlMs` (default 10 minutes)
60
+ can be reclaimed by another caller and run concurrently; size the TTL above
61
+ your longest rebuild.
@@ -0,0 +1,48 @@
1
+ import type { Client, CollectionCreateSchema } from 'typesense';
2
/**
 * Blue/green-rebuild the search index named by `schema.name`.
 *
 * 1. create a fresh versioned collection (`${schema.name}_<timestamp>`) from
 *    `schema`
 * 2. stream `documents` into it in batches
 * 3. atomically repoint the `schema.name` alias to the new collection, then
 *    drop the collection it superseded.
 *
 * The caller passes only the logical index name (as `schema.name`); the
 * versioned collection name and the alias are managed here.
 *
 * The rebuild is **single-flight per index**: it first takes a lock (a marker
 * document in a `rebuild_locks` collection, created on demand) via Typesense’s
 * atomic create, so concurrent callers across pods do not rebuild the same
 * index at once (see the limitations below for the exceptions).
 *
 * `documents` is an async iterable (e.g. a streaming projection); only one
 * `batchSize`-sized chunk is held in memory at a time. On any failure before the
 * swap nothing is repointed, so the live alias never points at a partial build,
 * and the orphaned half-built collection is dropped.
 *
 * @remarks
 * Limitations:
 * - **Advisory, not a strict mutex.** The lock is built on Typesense, not a
 *   consensus store. Under a TTL-reclaim race two rebuilds can briefly run at
 *   once; this is safe because blue/green is idempotent (worst case: redundant
 *   work and a transient orphaned collection).
 * - **Single-flight, not coalescing.** A call made while a rebuild is in flight
 *   is skipped (returns `null`), not queued. If you must capture state that
 *   changed mid-build, re-trigger after the running rebuild finishes.
 * - **Lock TTL.** A rebuild that runs longer than `lockTtlMs` (default 10
 *   minutes) can be reclaimed by another caller and run concurrently; size the
 *   TTL above your longest rebuild.
 *
 * @param client - Typesense client owned (and reused for queries) by the caller.
 * @param schema - Collection schema; its `name` is the logical index name and
 * becomes the alias.
 * @param documents - Documents to import; each must carry an `id`.
 * @param options - Optional tuning, see the individual properties.
 * @returns the live collection name and the number of documents imported, or
 * `null` when the rebuild was skipped because another rebuild for the same index
 * was already running.
 */
export declare function rebuild<TDocument extends { id: string }>(
  client: Client,
  schema: CollectionCreateSchema,
  documents: AsyncIterable<TDocument>,
  options?: {
    /** Documents imported per Typesense request (default 1000). */
    batchSize?: number;
    /** A held lock older than this (ms) is reclaimed (default 10 minutes). */
    lockTtlMs?: number;
  },
): Promise<{
  collection: string;
  imported: number;
} | null>;
48
+ //# sourceMappingURL=adapter.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"adapter.d.ts","sourceRoot":"","sources":["../src/adapter.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,EAAE,sBAAsB,EAAkB,MAAM,WAAW,CAAC;AAKhF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkCG;AACH,wBAAsB,OAAO,CAAC,QAAQ,SAAS;IAAE,EAAE,EAAE,MAAM,CAAA;CAAE,EAC3D,MAAM,EAAE,MAAM,EACd,MAAM,EAAE,sBAAsB,EAC9B,SAAS,EAAE,aAAa,CAAC,QAAQ,CAAC,EAClC,OAAO,GAAE;IACP,+DAA+D;IAC/D,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,0EAA0E;IAC1E,SAAS,CAAC,EAAE,MAAM,CAAC;CACf,GACL,OAAO,CAAC;IAAE,UAAU,EAAE,MAAM,CAAC;IAAC,QAAQ,EAAE,MAAM,CAAA;CAAE,GAAG,IAAI,CAAC,CAmC1D"}
@@ -0,0 +1,209 @@
1
/** Typesense collection that stores one lock document per index being rebuilt. */
const LOCK_COLLECTION = 'rebuild_locks';
/** Default age (10 minutes, in milliseconds) after which a held lock is reclaimed. */
const DEFAULT_LOCK_TTL_MS = 10 * 60 * 1000;
/**
 * Blue/green-rebuild the search index named by `schema.name`.
 *
 * 1. create a fresh versioned collection (`${schema.name}_<timestamp>`) from
 *    `schema`
 * 2. stream `documents` into it in batches
 * 3. atomically repoint the `schema.name` alias to the new collection, then
 *    drop the collection it superseded.
 *
 * The caller passes only the logical index name (as `schema.name`); the
 * versioned collection name and the alias are managed here.
 *
 * The rebuild is **single-flight per index**: it first takes a lock (a marker
 * document in a `rebuild_locks` collection, created on demand) via Typesense’s
 * atomic create, so concurrent callers across pods do not rebuild the same
 * index at once (see the limitations below for the exceptions).
 *
 * `documents` is an async iterable (e.g. a streaming projection); only one
 * `batchSize`-sized chunk is held in memory at a time. On any failure before the
 * swap nothing is repointed, so the live alias never points at a partial build,
 * and the orphaned half-built collection is dropped.
 *
 * The lock is always released on the way out. If the build itself failed, that
 * build error is what the caller sees even when the release fails as well; a
 * lock left behind this way becomes reclaimable once it is older than
 * `lockTtlMs`. A release failure after a successful build is still thrown.
 *
 * @remarks
 * Limitations:
 * - **Advisory, not a strict mutex.** The lock is built on Typesense, not a
 *   consensus store. Under a TTL-reclaim race two rebuilds can briefly run at
 *   once; this is safe because blue/green is idempotent (worst case: redundant
 *   work and a transient orphaned collection).
 * - **Single-flight, not coalescing.** A call made while a rebuild is in flight
 *   is skipped (returns `null`), not queued. If you must capture state that
 *   changed mid-build, re-trigger after the running rebuild finishes.
 * - **Lock TTL.** A rebuild that runs longer than `lockTtlMs` (default 10
 *   minutes) can be reclaimed by another caller and run concurrently; size the
 *   TTL above your longest rebuild.
 *
 * @param client - Typesense client owned by the caller.
 * @param schema - Collection schema; its `name` is the logical index name and
 * becomes the alias.
 * @param documents - Async iterable of documents to import.
 * @param options - Optional `batchSize` (default 1000) and `lockTtlMs`
 * (default 10 minutes).
 * @returns the live collection name and the number of documents imported, or
 * `null` when the rebuild was skipped because another rebuild for the same index
 * was already running.
 */
export async function rebuild(client, schema, documents, options = {}) {
  const { batchSize = 1000, lockTtlMs = DEFAULT_LOCK_TTL_MS } = options;
  const name = schema.name;
  if (!(await acquireLock(client, name, lockTtlMs))) {
    return null;
  }
  const collection = `${name}_${Date.now()}`;
  // Tracks whether the build threw, so the lock release below never replaces
  // the error that actually explains the failure.
  let buildFailed = false;
  try {
    // Read the current alias target before swapping so it can be dropped after.
    const previous = await aliasTarget(client, name);
    await client.collections().create({ ...schema, name: collection });
    let imported;
    try {
      imported = await importStreamed(client, collection, documents, batchSize);
      await client.aliases().upsert(name, { collection_name: collection });
    } catch (error) {
      // The build failed before the swap: the live alias is untouched, so just
      // drop the orphaned half-built collection rather than let it accumulate.
      await client
        .collections(collection)
        .delete()
        .catch(() => undefined);
      throw error;
    }
    if (previous !== undefined && previous !== collection) {
      // Best-effort cleanup: the swap already succeeded, so a failed delete of
      // the superseded collection must not fail the rebuild.
      await client
        .collections(previous)
        .delete()
        .catch(() => undefined);
    }
    return { collection, imported };
  } catch (error) {
    buildFailed = true;
    throw error;
  } finally {
    try {
      await releaseLock(client, name);
    } catch (releaseError) {
      // Throwing from `finally` would discard the in-flight build error, so a
      // release failure is only surfaced when the build itself succeeded.
      if (!buildFailed) {
        throw releaseError;
      }
    }
  }
}
74
/**
 * Look up which collection `alias` currently resolves to.
 *
 * @returns the target collection name, or `undefined` when the lookup fails
 * with HTTP 404 (the alias is not set). Any other failure is rethrown.
 */
async function aliasTarget(client, alias) {
  let resolved;
  try {
    resolved = await client.aliases(alias).retrieve();
  } catch (cause) {
    if (httpStatus(cause) !== 404) {
      throw cause;
    }
    return undefined;
  }
  return resolved.collection_name;
}
87
/**
 * Drain `documents` into `collection`, sending one import request each time
 * `batchSize` documents have accumulated plus a final request for the
 * remainder.
 *
 * @returns the total number of documents handed to successful imports.
 */
async function importStreamed(client, collection, documents, batchSize) {
  let total = 0;
  let pending = [];
  // Sends the accumulated chunk, counts it, and starts a fresh chunk.
  const flush = async () => {
    await importBatch(client, collection, pending);
    total += pending.length;
    pending = [];
  };
  for await (const item of documents) {
    pending.push(item);
    if (pending.length >= batchSize) {
      await flush();
    }
  }
  if (pending.length > 0) {
    await flush();
  }
  return total;
}
105
+ /**
106
+ * Create-or-replace one batch of whole documents, keyed on `id`, throwing if any
107
+ * individual document fails (Typesense’s bulk import otherwise reports
108
+ * per-document failures without rejecting).
109
+ */
110
+ async function importBatch(client, collection, batch) {
111
+ const results = (await client
112
+ .collections(collection)
113
+ .documents()
114
+ .import(batch, {
115
+ action: 'upsert',
116
+ // Collect per-document outcomes instead of throwing the client’s opaque
117
+ // ImportError, so we can report which documents failed and why.
118
+ throwOnFail: false,
119
+ }));
120
+ const failures = results.filter((result) => !result.success);
121
+ if (failures.length > 0) {
122
+ throw new Error(`Typesense upsert into “${collection}” failed for ${failures.length}/${results.length} documents: ${failures
123
+ .map((failure) => failure.error)
124
+ .join('; ')}`);
125
+ }
126
+ }
127
/**
 * Try to take the rebuild lock for `alias` by creating its marker document in
 * the lock collection (which is created first if missing).
 *
 * @returns `true` when the marker was created, or when an existing marker was
 * old enough to be reclaimed; `false` when the create was answered with HTTP
 * 409 and the existing marker could not be reclaimed. Other errors are
 * rethrown.
 */
async function acquireLock(client, alias, ttlMs) {
  await ensureLockCollection(client);
  try {
    const marker = { id: alias, acquired_at: Date.now() };
    await client.collections(LOCK_COLLECTION).documents().create(marker);
  } catch (cause) {
    if (httpStatus(cause) !== 409) {
      throw cause;
    }
    // A marker already exists: only take over if it has outlived the TTL.
    return reclaimIfStale(client, alias, ttlMs);
  }
  return true;
}
148
/**
 * Take over the lock for `alias` if its marker was written more than `ttlMs`
 * milliseconds ago.
 *
 * @returns `true` after overwriting a stale marker with a fresh timestamp;
 * `false` when the marker is still within the TTL, or when it has disappeared
 * (HTTP 404) between the failed create and this read. Other errors are
 * rethrown.
 */
async function reclaimIfStale(client, alias, ttlMs) {
  let marker;
  try {
    marker = await client
      .collections(LOCK_COLLECTION)
      .documents(alias)
      .retrieve();
  } catch (cause) {
    if (httpStatus(cause) !== 404) {
      throw cause;
    }
    // Released between our create and this read — leave it for the next try.
    return false;
  }
  const age = Date.now() - marker.acquired_at;
  const stillFresh = age <= ttlMs;
  if (stillFresh) {
    return false;
  }
  const replacement = { id: alias, acquired_at: Date.now() };
  await client.collections(LOCK_COLLECTION).documents().upsert(replacement);
  return true;
}
173
/**
 * Delete the lock marker for `alias`. A marker that is already gone (HTTP 404)
 * is treated as released; every other failure is rethrown.
 */
async function releaseLock(client, alias) {
  try {
    await client.collections(LOCK_COLLECTION).documents(alias).delete();
  } catch (cause) {
    const alreadyGone = httpStatus(cause) === 404;
    if (!alreadyGone) {
      throw cause;
    }
  }
}
184
/**
 * Make sure the lock collection exists: probe for it and, only when the probe
 * answers HTTP 404, create it with a single `acquired_at` (`int64`) field. An
 * HTTP 409 on create (somebody else created it first) is tolerated; other
 * errors are rethrown.
 */
async function ensureLockCollection(client) {
  let missing = false;
  try {
    await client.collections(LOCK_COLLECTION).retrieve();
  } catch (cause) {
    if (httpStatus(cause) !== 404) {
      throw cause;
    }
    missing = true;
  }
  if (!missing) {
    return;
  }
  try {
    const definition = {
      name: LOCK_COLLECTION,
      fields: [{ name: 'acquired_at', type: 'int64' }],
    };
    await client.collections().create(definition);
  } catch (cause) {
    const createdConcurrently = httpStatus(cause) === 409;
    if (!createdConcurrently) {
      throw cause;
    }
  }
}
207
/**
 * Read the numeric `httpStatus` carried by a caught value.
 *
 * A `catch` clause can receive anything, including `null` or `undefined`, so
 * the value is guarded before the property is read; otherwise the lookup
 * itself would throw a `TypeError` and hide the original failure.
 *
 * @param error - The value caught by a `catch` clause.
 * @returns the status when `error.httpStatus` is a number, else `undefined`.
 */
function httpStatus(error) {
  if (error === null || error === undefined) {
    return undefined;
  }
  const status = error.httpStatus;
  return typeof status === 'number' ? status : undefined;
}
@@ -0,0 +1,28 @@
1
+ import type { Quad } from '@rdfjs/types';
2
/**
 * A flat Typesense document. The `id` key is mandatory (Typesense uses it as
 * the document key); every other key holds engine-typed scalar data or an
 * array of such values.
 */
export type TypesenseDocument = Record<string, unknown> & {
  id: string;
};
/** The target types a framed field value can be coerced to. */
export type FrameFieldType =
  | 'string'
  | 'string[]'
  | 'int'
  | 'float'
  | 'bool'
  | 'unixtime';
/**
 * Binds one document field to a single RDF predicate on the framed subject.
 * This covers the generic, engine-agnostic part of projection: direct
 * predicate-to-field mappings with datatype coercion. Domain-specific
 * derivations (folding, grouping, cross-graph joins) are left to the consumer.
 */
export interface FrameField {
  /** Key written on the resulting document. */
  readonly field: string;
  /** Predicate whose object value(s) supply the field. */
  readonly predicate: string;
  /** Type the value(s) are coerced to. */
  readonly type: FrameFieldType;
}
/**
 * Frame the quads describing one subject into a flat document: each configured
 * field takes its value(s) from its predicate, coerced to the field's type.
 * Single-valued fields use the first object; `string[]` collects all of them.
 * Predicates with no matching quad are omitted (left to Typesense optionality).
 */
export declare function frame(
  quads: Iterable<Quad>,
  subject: string,
  fields: readonly FrameField[],
): TypesenseDocument;
28
+ //# sourceMappingURL=frame.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"frame.d.ts","sourceRoot":"","sources":["../src/frame.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,cAAc,CAAC;AAEzC;;;GAGG;AACH,MAAM,MAAM,iBAAiB,GAAG;IAAE,EAAE,EAAE,MAAM,CAAA;CAAE,GAAG,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;AAEzE,MAAM,MAAM,cAAc,GACtB,QAAQ,GACR,UAAU,GACV,KAAK,GACL,OAAO,GACP,MAAM,GACN,UAAU,CAAC;AAEf;;;;;GAKG;AACH,MAAM,WAAW,UAAU;IACzB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,IAAI,EAAE,cAAc,CAAC;CAC/B;AAED;;;;;GAKG;AACH,wBAAgB,KAAK,CACnB,KAAK,EAAE,QAAQ,CAAC,IAAI,CAAC,EACrB,OAAO,EAAE,MAAM,EACf,MAAM,EAAE,SAAS,UAAU,EAAE,GAC5B,iBAAiB,CAoBnB"}
package/dist/frame.js ADDED
@@ -0,0 +1,42 @@
1
/**
 * Build a flat document for `subject` out of `quads`.
 *
 * Quads whose subject value differs from `subject` are ignored. The object
 * values of the remaining quads are grouped per predicate, and each entry of
 * `fields` whose predicate has at least one value is written to the document
 * after coercion to the entry's type. Fields without a matching quad are left
 * out entirely (left to Typesense optionality). The document's `id` is
 * `subject`.
 */
export function frame(quads, subject, fields) {
  const valuesByPredicate = new Map();
  for (const quad of quads) {
    if (quad.subject.value === subject) {
      const key = quad.predicate.value;
      let bucket = valuesByPredicate.get(key);
      if (bucket === undefined || bucket === null) {
        bucket = [];
        valuesByPredicate.set(key, bucket);
      }
      bucket.push(quad.object.value);
    }
  }
  const framed = { id: subject };
  for (const spec of fields) {
    const found = valuesByPredicate.get(spec.predicate);
    if (found !== undefined && found.length > 0) {
      framed[spec.field] = coerce(found, spec.type);
    }
  }
  return framed;
}
27
/**
 * Convert the collected object values of one predicate to the field `type`.
 *
 * Every type except `string[]` looks only at the first value. `int` truncates
 * toward zero, `bool` is true for the lexical forms `true` and `1`, and
 * `unixtime` is the parsed date in whole seconds. A `type` outside the known
 * set yields `undefined`.
 */
function coerce(values, type) {
  if (type === 'string[]') {
    return values;
  }
  const first = values[0];
  if (type === 'string') {
    return first;
  }
  if (type === 'int') {
    return Math.trunc(Number(first));
  }
  if (type === 'float') {
    return Number(first);
  }
  if (type === 'bool') {
    return first === 'true' || first === '1';
  }
  if (type === 'unixtime') {
    const milliseconds = new Date(first).getTime();
    return Math.trunc(milliseconds / 1000);
  }
  return undefined;
}
@@ -0,0 +1,2 @@
1
+ export { rebuild } from './adapter.js';
2
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC"}
package/dist/index.js ADDED
@@ -0,0 +1 @@
1
+ export { rebuild } from './adapter.js';
package/package.json ADDED
@@ -0,0 +1,34 @@
1
+ {
2
+ "name": "@lde/search-typesense",
3
+ "version": "0.0.0",
4
+ "description": "Generic Typesense engine adapter for RDF-backed search pipelines: collection lifecycle, bulk upsert and blue/green alias swap",
5
+ "repository": {
6
+ "url": "git+https://github.com/ldelements/lde.git",
7
+ "directory": "packages/search-typesense"
8
+ },
9
+ "license": "MIT",
10
+ "type": "module",
11
+ "exports": {
12
+ "./package.json": "./package.json",
13
+ ".": {
14
+ "types": "./dist/index.d.ts",
15
+ "import": "./dist/index.js",
16
+ "development": "./src/index.ts",
17
+ "default": "./dist/index.js"
18
+ }
19
+ },
20
+ "main": "./dist/index.js",
21
+ "module": "./dist/index.js",
22
+ "types": "./dist/index.d.ts",
23
+ "files": [
24
+ "dist",
25
+ "!**/*.tsbuildinfo"
26
+ ],
27
+ "dependencies": {
28
+ "tslib": "^2.3.0",
29
+ "typesense": "^3.0.6"
30
+ },
31
+ "devDependencies": {
32
+ "testcontainers": "^12.0.1"
33
+ }
34
+ }