@mixpeek/react-searchkit 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,364 @@
1
+ #!/usr/bin/env node
2
/**
 * mixpeek-index — Scrape a website and index it into a Mixpeek collection,
 * then scaffold a retriever ready for use with <SearchKit />.
 *
 * Usage:
 *   mixpeek-index --api-key <KEY> --url <URL> [options]
 *
 * Options:
 *   --api-key         Mixpeek API key (required)
 *   --url             Website URL to scrape and index (required)
 *   --base-url        Mixpeek API base URL (default: https://api.mixpeek.com)
 *   --namespace       Existing namespace ID to use (a new namespace is created when omitted)
 *   --namespace-name  Name for the new namespace (default: searchkit-ns-<timestamp>)
 *   --slug            Retriever slug for SearchKit (default: searchkit-<timestamp>)
 *   --wait            Wait for indexing to complete before exiting (default: true)
 *   --poll-interval   Seconds between batch status polls (default: 10)
 *   --timeout         Max seconds to wait for indexing (default: 300)
 *   --help            Show this help
 *
 * Example:
 *   mixpeek-index --api-key mxp_sk_abc123 --url https://docs.example.com
 */
24
+
25
+ import { parseArgs } from "node:util";
26
+
27
// Help text printed for --help and after an argument-parsing error.
const USAGE = `
mixpeek-index — Scrape a website and index it into Mixpeek, then scaffold a retriever.

Usage:
  mixpeek-index --api-key <KEY> --url <URL> [options]

Options:
  --api-key         Mixpeek API key (required)
  --url             Website URL to scrape and index (required)
  --base-url        API base URL (default: https://api.mixpeek.com)
  --namespace       Existing namespace ID to use
  --namespace-name  Name for a new namespace (auto-created if --namespace not set)
  --slug            Retriever name/slug for SearchKit (default: searchkit-<timestamp>)
  --wait            Wait for indexing to complete (default: true)
  --no-wait         Do not wait for indexing to complete
  --poll-interval   Poll interval in seconds (default: 10)
  --timeout         Max wait time in seconds (default: 300)
  --help            Show this help

Example:
  mixpeek-index --api-key mxp_sk_abc123 --url https://docs.example.com --slug my-docs-search
`;

/**
 * Print an error in the CLI's `Error: ...` format and terminate with exit code 1.
 *
 * @param {string} message - Human-readable description of the problem.
 */
function exitWithError(message) {
  console.error(`Error: ${message}\n`);
  process.exit(1);
}

/**
 * Convert a seconds flag value to milliseconds, rejecting unusable input.
 *
 * Without this check a non-numeric value becomes NaN, which makes the poll
 * loop either spin with a near-zero delay or skip waiting altogether.
 *
 * @param {string} flag - Flag name without the leading dashes (for messages).
 * @param {string} raw - Raw string value from the command line.
 * @param {number} minimum - Smallest accepted number of seconds.
 * @returns {number} The duration in milliseconds.
 */
function parseSeconds(flag, raw, minimum) {
  const seconds = Number.parseInt(raw, 10);
  if (Number.isNaN(seconds) || seconds < minimum) {
    exitWithError(`--${flag} must be an integer >= ${minimum} (got "${raw}").`);
  }
  return seconds * 1000;
}

let args;
try {
  args = parseArgs({
    options: {
      "api-key": { type: "string" },
      url: { type: "string" },
      "base-url": { type: "string", default: "https://api.mixpeek.com" },
      namespace: { type: "string" },
      "namespace-name": { type: "string" },
      slug: { type: "string" },
      wait: { type: "boolean", default: true },
      // A boolean flag that defaults to true cannot be switched off on the
      // command line, so an explicit negative flag is provided.
      "no-wait": { type: "boolean", default: false },
      "poll-interval": { type: "string", default: "10" },
      timeout: { type: "string", default: "300" },
      help: { type: "boolean", default: false },
    },
    strict: true,
  });
} catch (e) {
  console.error(`Error: ${e.message}\n`);
  console.error(USAGE);
  process.exit(1);
}

if (args.values.help) {
  console.log(USAGE);
  process.exit(0);
}

const apiKey = args.values["api-key"];
const targetUrl = args.values["url"];

if (!apiKey) {
  exitWithError("--api-key is required.");
}
if (!targetUrl) {
  exitWithError("--url is required.");
}

// Trailing slashes are stripped so paths can be appended as `${BASE_URL}/v1/...`.
const BASE_URL = (args.values["base-url"] || "https://api.mixpeek.com").replace(/\/+$/, "");
const slug = args.values["slug"] || `searchkit-${Date.now()}`;
const shouldWait = args.values["wait"] && !args.values["no-wait"];
// Both durations are kept in milliseconds from here on.
const pollInterval = parseSeconds("poll-interval", args.values["poll-interval"] || "10", 1);
const timeout = parseSeconds("timeout", args.values["timeout"] || "300", 0);
94
+
95
+ // ---------------------------------------------------------------------------
96
+ // Helpers
97
+ // ---------------------------------------------------------------------------
98
+
99
/**
 * Build the HTTP headers sent with a Mixpeek API request.
 *
 * @param {string} [nsId] - Namespace ID; when truthy it is sent as `X-Namespace`.
 * @returns {Record<string, string>} JSON content type, bearer authorization
 *   built from the module-level `apiKey`, and optionally the namespace header.
 */
function headers(nsId) {
  const h = {
    "Content-Type": "application/json",
    Authorization: `Bearer ${apiKey}`,
  };
  if (nsId) h["X-Namespace"] = nsId;
  return h;
}
107
+
108
/**
 * Perform a JSON request against the Mixpeek API and return the decoded body.
 *
 * @param {string} method - HTTP method, e.g. "GET" or "POST".
 * @param {string} path - Path appended to `BASE_URL`, starting with "/".
 * @param {object|null} [body] - Request payload; serialized as JSON when truthy.
 * @param {string} [nsId] - Namespace ID forwarded to `headers()`.
 * @returns {Promise<any>} The parsed JSON response, or `{}` when a successful
 *   response has an empty body.
 * @throws {Error} With message `<method> <path> → <status>: <detail>` when the
 *   response status is not OK or a successful response is not valid JSON.
 */
async function api(method, path, body, nsId) {
  const res = await fetch(`${BASE_URL}${path}`, {
    method,
    headers: headers(nsId),
    body: body ? JSON.stringify(body) : undefined,
  });

  // Read as text first: error pages and empty responses are not JSON, and
  // letting the JSON parser throw would hide the HTTP status from the user.
  const text = await res.text();
  let data;
  let isJson = false;
  if (text) {
    try {
      data = JSON.parse(text);
      isJson = true;
    } catch {
      // Not JSON; handled below using the raw text.
    }
  }

  if (!res.ok) {
    let detail;
    if (isJson) {
      detail = data?.error?.message || data?.detail || data;
      // Structured details would otherwise render as "[object Object]".
      if (typeof detail !== "string") detail = JSON.stringify(detail);
    } else {
      detail = text.slice(0, 200) || res.statusText || "(empty response body)";
    }
    throw new Error(`${method} ${path} → ${res.status}: ${detail}`);
  }

  if (text && !isJson) {
    throw new Error(
      `${method} ${path} → ${res.status}: expected a JSON response but received: ${text.slice(0, 200)}`
    );
  }
  return isJson ? data : {};
}
121
+
122
/**
 * Pause for the given duration.
 *
 * @param {number} ms - Delay in milliseconds, passed straight to `setTimeout`.
 * @returns {Promise<void>} Resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
125
+
126
+ // ---------------------------------------------------------------------------
127
+ // Main flow
128
+ // ---------------------------------------------------------------------------
129
+
130
/**
 * Execute the full indexing flow against the Mixpeek API:
 * resolve or create a namespace, create a bucket, register the target URL as
 * a bucket object, create a `web_scraper` collection, trigger processing,
 * optionally poll the batch until it finishes, discover the text feature URI,
 * create a retriever, and finally print a summary plus a machine-readable
 * `__RESULT__=<json>` line on stdout.
 *
 * Reads the module-level configuration (`args`, `targetUrl`, `slug`,
 * `shouldWait`, `pollInterval`, `timeout`, `apiKey`, `BASE_URL`).
 *
 * @returns {Promise<void>}
 * @throws {Error} When `targetUrl` is not a valid absolute URL or any API
 *   call made through `api()` fails. Exits the process with code 1 when the
 *   batch reports FAILED or ERROR.
 */
async function run() {
  // Parse the URL before any remote resource is created, so a malformed
  // --url does not leave an orphaned namespace and bucket behind.
  let hostname;
  try {
    hostname = new URL(targetUrl).hostname;
  } catch (err) {
    throw new Error(`--url is not a valid absolute URL: ${targetUrl}`, { cause: err });
  }

  console.log(`\n🌐 mixpeek-index — Indexing ${targetUrl}\n`);

  // ── Step 1: Resolve namespace ──────────────────────────────────────────────
  let namespaceId = args.values["namespace"];

  if (!namespaceId) {
    const nsName = args.values["namespace-name"] || `searchkit-ns-${Date.now()}`;
    console.log(`📦 Creating namespace "${nsName}"...`);
    const nsData = await api("POST", "/v1/namespaces", {
      namespace_name: nsName,
      feature_extractors: [
        { feature_extractor_name: "web_scraper", version: "v1" },
        { feature_extractor_name: "text_extractor", version: "v1" },
      ],
    });
    namespaceId = nsData.namespace_id;
    console.log(`   ✓ Namespace: ${namespaceId}`);
  } else {
    console.log(`   ✓ Using existing namespace: ${namespaceId}`);
  }

  // ── Step 2: Create bucket ──────────────────────────────────────────────────
  const bucketName = `${slug}-bucket`;
  console.log(`\n🗄️  Creating bucket "${bucketName}"...`);
  const bucketData = await api(
    "POST",
    "/v1/buckets",
    {
      bucket_name: bucketName,
      description: `Web content bucket for ${targetUrl}`,
      bucket_schema: {
        properties: {
          url: { type: "string" },
          title: { type: "string" },
        },
      },
    },
    namespaceId
  );
  const bucketId = bucketData.bucket_id;
  console.log(`   ✓ Bucket: ${bucketId}`);

  // ── Step 3: Add URL as bucket object ──────────────────────────────────────
  console.log(`\n📎 Adding ${targetUrl} as bucket object...`);
  const objData = await api(
    "POST",
    `/v1/buckets/${bucketId}/objects`,
    {
      url: targetUrl,
      metadata: { title: hostname, source: "web-scrape" },
    },
    namespaceId
  );
  const objectId = objData.object_id;
  console.log(`   ✓ Object: ${objectId}`);

  // ── Step 4: Create collection with web_scraper ─────────────────────────────
  const collectionName = `${slug}-collection`;
  console.log(`\n🗂️  Creating collection "${collectionName}"...`);
  const colData = await api(
    "POST",
    "/v1/collections",
    {
      collection_name: collectionName,
      description: `Web-scraped content from ${targetUrl}`,
      source: { type: "bucket", bucket_ids: [bucketId] },
      feature_extractor: {
        feature_extractor_name: "web_scraper",
        version: "v1",
        input_mappings: { url: "url" },
      },
    },
    namespaceId
  );
  const collectionId = colData.collection_id;
  console.log(`   ✓ Collection: ${collectionId}`);

  // ── Step 5: Trigger processing ────────────────────────────────────────────
  console.log(`\n⚙️  Triggering collection processing...`);
  const triggerData = await api(
    "POST",
    `/v1/collections/${collectionId}/trigger`,
    {},
    namespaceId
  );
  const batchId = triggerData.batch_id;
  console.log(`   ✓ Batch: ${batchId} (${triggerData.object_count} objects)`);

  // ── Step 6: Wait for completion ───────────────────────────────────────────
  // Stays false when waiting is disabled or the deadline passes first, so the
  // final summary does not claim success that was never observed.
  let indexingCompleted = false;

  if (shouldWait) {
    console.log(`\n⏳ Waiting for indexing to complete (timeout: ${timeout / 1000}s)...`);
    const deadline = Date.now() + timeout;
    let dotCount = 0;

    while (Date.now() < deadline) {
      // Never sleep past the deadline; the last poll happens right at it.
      await sleep(Math.min(pollInterval, Math.max(deadline - Date.now(), 0)));
      const batchInfo = await api(
        "GET",
        `/v1/buckets/${bucketId}/batches/${batchId}`,
        null,
        namespaceId
      );
      const status = batchInfo.status || "UNKNOWN";
      process.stdout.write(`\r   ${".".repeat((dotCount++ % 3) + 1).padEnd(3)} ${status}   `);

      if (status === "COMPLETED") {
        // `??` keeps a legitimate count of 0 instead of showing "?".
        const docCount = batchInfo.document_count ?? "?";
        console.log(`\n   ✓ Indexing complete! Documents: ${docCount}`);
        indexingCompleted = true;
        break;
      }
      if (status === "FAILED" || status === "ERROR") {
        console.error(`\n   ✗ Batch failed with status: ${status}`);
        process.exit(1);
      }
    }

    if (!indexingCompleted) {
      console.error(
        `\n   ⚠ Timed out after ${timeout / 1000}s; batch ${batchId} has not completed yet. Continuing with retriever setup.`
      );
    }
  }

  // ── Step 7: Get feature URI ───────────────────────────────────────────────
  console.log(`\n🔍 Discovering collection features...`);
  let featureUri = `mixpeek://web_scraper@v1/intfloat__multilingual_e5_large_instruct`;
  try {
    const featData = await api(
      "GET",
      `/v1/collections/${collectionId}/features`,
      null,
      namespaceId
    );
    const features = featData.features || [];
    const textFeature = features.find(
      (f) => f.vector_index?.purpose === "text" || f.feature_address?.includes("multilingual")
    );
    if (textFeature?.feature_address) {
      featureUri = textFeature.feature_address;
    }
  } catch (err) {
    // Best effort: fall back to the default URI, but say why discovery failed.
    console.log(`   ℹ️  Using default feature URI (${err.message})`);
  }
  console.log(`   ✓ Feature URI: ${featureUri}`);

  // ── Step 8: Create retriever ──────────────────────────────────────────────
  console.log(`\n🔎 Creating retriever "${slug}"...`);
  const retData = await api(
    "POST",
    "/v1/retrievers",
    {
      retriever_name: slug,
      description: `SearchKit retriever for ${targetUrl}`,
      collection_identifiers: [collectionId],
      input_schema: {
        query: { type: "text", required: true, description: "Search query" },
        doc_type: { type: "text", required: false, description: "Filter by page type (page/code/image)" },
        keyword: { type: "text", required: false, description: "Keyword filter on content" },
      },
      stages: [
        {
          stage_name: "semantic_search",
          stage_type: "filter",
          config: {
            stage_id: "feature_search",
            parameters: {
              searches: [
                {
                  feature_uri: featureUri,
                  query: { input_mode: "text", value: "{{INPUT.query}}" },
                  top_k: 20,
                },
              ],
              final_top_k: 20,
            },
          },
        },
        {
          stage_name: "sort_by_relevance",
          stage_type: "sort",
          config: {
            stage_id: "sort_relevance",
            parameters: { score_field: "score", direction: "desc" },
          },
        },
      ],
    },
    namespaceId
  );
  // The retriever ID is read from either response shape.
  const retrieverId = retData.retriever?.retriever_id || retData.retriever_id;
  console.log(`   ✓ Retriever: ${retrieverId}`);

  // ── Done ──────────────────────────────────────────────────────────────────
  const bannerIcon = indexingCompleted ? "✅" : "⏳";
  const bannerText = indexingCompleted
    ? "Indexing complete!"
    : "Resources created; indexing still in progress";

  // The curl example uses the same placeholder as the React snippet so the
  // secret key is not echoed into terminal scrollback or CI logs.
  console.log(`
╔════════════════════════════════════════════════════════╗
║  ${bannerIcon}  ${bannerText.padEnd(50)}║
╚════════════════════════════════════════════════════════╝

Resources created:
  namespace_id:  ${namespaceId}
  bucket_id:     ${bucketId}
  collection_id: ${collectionId}
  retriever_id:  ${retrieverId}

Use with SearchKit (React component):

  import { SearchKit } from "@mixpeek/react-searchkit";

  <SearchKit
    projectKey="${retrieverId}"
    bearerToken="<YOUR_API_KEY>"
    apiBaseUrl="${BASE_URL}"
    placeholder="Search ${hostname}..."
  />

Or test via API:

  curl -X POST ${BASE_URL}/v1/retrievers/${retrieverId}/execute \\
    -H "Authorization: Bearer <YOUR_API_KEY>" \\
    -H "X-Namespace: ${namespaceId}" \\
    -H "Content-Type: application/json" \\
    -d '{"inputs": {"query": "your search query"}, "settings": {"limit": 5}}'
`);

  // Output machine-readable JSON for piping/scripting.
  // NOTE(review): api_key is kept here for backward compatibility with
  // consumers of the __RESULT__ line; consider dropping it, since it writes
  // the secret to stdout.
  const output = {
    namespace_id: namespaceId,
    bucket_id: bucketId,
    collection_id: collectionId,
    retriever_id: retrieverId,
    api_key: apiKey,
    base_url: BASE_URL,
  };
  process.stdout.write(`\n__RESULT__=${JSON.stringify(output)}\n`);
}
360
+
361
// Entry point: start the flow and turn any rejection into a readable message
// and a non-zero exit code.
run().catch((err) => {
  // Fall back to the raw value in case something other than an Error was thrown.
  console.error(`\n❌ Error: ${err?.message ?? err}`);
  process.exit(1);
});