@gscdump/analysis 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,391 @@
1
/**
 * Error raised when content-gap analysis is handed a source that has no
 * `executeSql` method.
 */
class ContentGapSourceUnsupportedError extends Error {
  /**
   * @param {string} kind - Label of the offending source; interpolated into the message.
   */
  constructor(kind) {
    super(`content-gap requires a source with executeSql (got '${kind}'); use createBrowserQuerySource or createSqliteQuerySource`);
    this.name = "ContentGapSourceUnsupportedError";
  }
}
7
// Leading "http://" or "https://" plus host; removed by deriveUrlText's fallback path.
const ORIGIN_RE = /^https?:\/\/[^/]+/;
// Trailing .htm/.html/.php/.asp/.aspx extension, case-insensitive.
const HTML_EXT_RE = /\.(html?|php|aspx?)$/i;
// Runs of hyphens and/or underscores (replaced with a single space by callers).
const SEP_RE = /[-_]+/g;
// A string made up solely of digits.
const DIGITS_ONLY_RE = /^\d+$/;
// Leading "www." host label.
const WWW_RE = /^www\./;
// Every literal dot.
const DOT_RE = /\./g;
// From the first "#" to the end of the string.
const HASH_RE = /#.*$/;
// From the first "?" to the end of the string.
const QUERY_RE = /\?.*$/;
// A final "/" that has at least one character before it, so a lone "/" is left untouched.
const TRAIL_SLASH_RE = /(?<=.)\/$/;
// Model identifier handed to the feature-extraction pipeline and embedded in cache keys.
const MODEL_ID = "Xenova/bge-base-en-v1.5";
// Text prepended to each search query before it is embedded.
const QUERY_PREFIX = "Represent this sentence for searching relevant passages: ";
// IndexedDB database name used by openDb.
const DB_NAME = "content-gap-embeddings";
// IndexedDB object store that holds the cached vectors.
const STORE = "vectors";
// Memoized promise produced by openDb; null until the first open attempt.
let dbPromise = null;
21
/**
 * Canonicalize a URL for comparison: drop the fragment and query string and
 * remove a trailing slash (except on the root path).
 *
 * @param {string} u - Absolute URL, or any string when parsing fails.
 * @returns {string} `origin + path` for parseable URLs; otherwise the input
 *   with hash, query and trailing slash stripped by regex.
 */
function normalizeUrl(u) {
  let parsed;
  try {
    parsed = new URL(u);
  } catch {
    // Not an absolute URL: fall back to plain string surgery.
    const withoutHash = u.replace(HASH_RE, "");
    const withoutQuery = withoutHash.replace(QUERY_RE, "");
    return withoutQuery.replace(TRAIL_SLASH_RE, "");
  }
  parsed.hash = "";
  parsed.search = "";
  const { pathname } = parsed;
  const hasTrailingSlash = pathname.length > 1 && pathname.endsWith("/");
  const cleanPath = hasTrailingSlash ? pathname.slice(0, -1) : pathname;
  return `${parsed.origin}${cleanPath}`;
}
33
/**
 * Turn a URL into a short lowercase phrase built from its path segments, for
 * use as embedding input.
 *
 * Each non-empty path segment is percent-decoded, lowercased, stripped of a
 * trailing HTML-ish extension, and has `-`/`_` runs replaced by spaces.
 * Segments that end up empty or purely numeric are dropped. When no segment
 * survives, the hostname (without a leading "www.", dots as spaces) is used.
 *
 * @param {string} url - Absolute URL; unparseable input goes through a regex fallback.
 * @returns {string} Space-separated text derived from the URL.
 */
function deriveUrlText(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return url.replace(ORIGIN_RE, "").replace(SEP_RE, " ");
  }
  const words = [];
  for (const rawSegment of parsed.pathname.split("/")) {
    if (rawSegment === "") continue;
    let segment;
    try {
      segment = decodeURIComponent(rawSegment);
    } catch {
      // A stray "%" makes decodeURIComponent throw URIError; keep the raw
      // segment so one bad escape does not discard the whole path.
      segment = rawSegment;
    }
    segment = segment.toLowerCase().replace(HTML_EXT_RE, "").replace(SEP_RE, " ");
    if (segment.length > 0 && !DIGITS_ONLY_RE.test(segment)) words.push(segment);
  }
  const text = words.join(" ");
  if (text.length > 0) return text;
  return parsed.hostname.replace(WWW_RE, "").replace(DOT_RE, " ");
}
43
/**
 * Dot product of two equal-length numeric vectors, iterating over `a.length`.
 * The name suggests callers pass unit-length vectors, in which case this is
 * their cosine similarity; the function itself does no normalization.
 *
 * @param {ArrayLike<number>} a
 * @param {ArrayLike<number>} b
 * @returns {number} Sum of `a[i] * b[i]`.
 */
function cosineNormalized(a, b) {
  const length = a.length;
  let sum = 0;
  let idx = 0;
  while (idx < length) {
    sum += a[idx] * b[idx];
    idx += 1;
  }
  return sum;
}
49
/**
 * Invoke the progress callback when one was supplied.
 *
 * @param {((progress: object) => void) | null | undefined} onProgress
 * @param {object} progress - Payload forwarded unchanged to the callback.
 * @returns {void}
 */
function notify(onProgress, progress) {
  if (onProgress == null) return;
  onProgress(progress);
}
52
/**
 * Open (and on first use create) the IndexedDB database that backs the
 * embedding cache. The in-flight or successful open is memoized in
 * `dbPromise` so concurrent and later callers share one connection.
 *
 * A failed open is NOT memoized: the memo is cleared on rejection so that a
 * transient failure does not disable the cache until the page is reloaded.
 *
 * @returns {Promise<IDBDatabase>} Resolves with the open database; rejects
 *   with the request error (or a generic Error) when opening fails.
 */
function openDb() {
  if (dbPromise != null) return dbPromise;
  const attempt = new Promise((resolve, reject) => {
    const req = indexedDB.open(DB_NAME, 1);
    req.onupgradeneeded = () => {
      const db = req.result;
      if (!db.objectStoreNames.contains(STORE)) db.createObjectStore(STORE);
    };
    req.onsuccess = () => resolve(req.result);
    req.onerror = () => reject(req.error ?? new Error("indexedDB open failed"));
  });
  dbPromise = attempt;
  // Forget a failed attempt (only if no newer attempt replaced it) so the next call retries.
  attempt.catch(() => {
    if (dbPromise === attempt) dbPromise = null;
  });
  return attempt;
}
65
/**
 * Build the cache key for one text: model id, role and text joined by "|".
 *
 * @param {string} role - Namespace for the entry (callers use "query" and "passage").
 * @param {string} text - The text whose embedding is cached.
 * @returns {string}
 */
function cacheKey(role, text) {
  const separator = "|";
  return "".concat(MODEL_ID, separator, role, separator, text);
}
68
/**
 * Look up cached embeddings for a list of texts. The cache is best-effort:
 * any failure (database cannot be opened, transaction cannot be started, an
 * individual read fails) is treated as a miss and never rejects.
 *
 * @param {string} role - Cache namespace passed to cacheKey.
 * @param {string[]} texts - Texts to look up.
 * @returns {Promise<Map<string, Float32Array>>} Map from text to cached vector,
 *   containing only texts whose stored value is a Float32Array.
 */
async function cacheGetMany(role, texts) {
  const found = new Map();
  // Nothing to look up: skip opening the database entirely.
  if (texts.length === 0) return found;
  const db = await openDb().catch(() => null);
  if (db == null) return found;
  return new Promise((resolve) => {
    let store;
    try {
      store = db.transaction(STORE, "readonly").objectStore(STORE);
    } catch {
      // e.g. the connection is closing or the store is missing: behave as an empty cache.
      resolve(found);
      return;
    }
    let pending = texts.length;
    const settle = () => {
      pending -= 1;
      if (pending === 0) resolve(found);
    };
    for (const text of texts) {
      let req;
      try {
        req = store.get(cacheKey(role, text));
      } catch {
        // A synchronous failure for one key counts as a miss for that key.
        settle();
        continue;
      }
      req.onsuccess = () => {
        const value = req.result;
        if (value instanceof Float32Array) found.set(text, value);
        settle();
      };
      req.onerror = () => settle();
    }
  });
}
94
/**
 * Persist freshly computed embeddings. Writing is best-effort: the returned
 * promise always resolves, whether the transaction completes, errors, aborts,
 * or cannot be started at all.
 *
 * @param {string} role - Cache namespace passed to cacheKey.
 * @param {Iterable<[string, Float32Array]>} entries - `[text, vector]` pairs to store.
 * @returns {Promise<void>}
 */
async function cachePutMany(role, entries) {
  const db = await openDb().catch(() => null);
  if (db == null) return;
  await new Promise((resolve) => {
    try {
      const tx = db.transaction(STORE, "readwrite");
      tx.oncomplete = () => resolve();
      tx.onerror = () => resolve();
      tx.onabort = () => resolve();
      const store = tx.objectStore(STORE);
      for (const [text, vec] of entries) store.put(vec, cacheKey(role, text));
    } catch {
      // transaction()/put() can throw synchronously; a cache write failure
      // must not fail the caller, whose embeddings are already computed.
      resolve();
    }
  });
}
105
/**
 * Embed texts in fixed-size batches and return one Float32Array per text.
 *
 * The extractor is called with `{ pooling: "mean", normalize: true }`. Its
 * output is read as a flat buffer of 4-byte floats whose row width is the last
 * entry of `dims`; each row is copied out into its own array.
 *
 * @param {(texts: string[], options: object) => Promise<{dims: number[], data: Float32Array}>} extractor
 * @param {string[]} texts
 * @param {(done: number) => void} onProgress - Called after each batch with the running count.
 * @param {number} [batchSize=32]
 * @returns {Promise<Float32Array[]>}
 */
async function embedRawBatch(extractor, texts, onProgress, batchSize = 32) {
  const vectors = [];
  let offset = 0;
  while (offset < texts.length) {
    const chunk = texts.slice(offset, offset + batchSize);
    const output = await extractor(chunk, {
      pooling: "mean",
      normalize: true
    });
    const width = output.dims[output.dims.length - 1];
    for (let row = 0; row < chunk.length; row += 1) {
      const firstFloat = row * width;
      // View the row inside the shared output buffer, then copy it so the
      // result does not keep the whole batch buffer alive.
      const rowView = new Float32Array(output.data.buffer, output.data.byteOffset + firstFloat * 4, width);
      vectors.push(rowView.slice());
    }
    onProgress(vectors.length);
    offset += batchSize;
  }
  return vectors;
}
122
/**
 * Embed texts, serving whatever is already in the cache and computing (then
 * persisting) the rest.
 *
 * Hits and misses are counted per input position, so `hits + misses` always
 * equals `texts.length` and the progress callback ends at `total` even when
 * `texts` contains duplicates.
 *
 * @param {Function} extractor - Embedding function forwarded to embedRawBatch.
 * @param {string} role - Cache namespace.
 * @param {string[]} texts - Raw texts; these (not the transformed form) are the cache keys.
 * @param {(text: string) => string} transform - Applied to a text just before it is embedded.
 * @param {(done: number, total: number) => void} onProgress
 * @returns {Promise<{vectors: Float32Array[], hits: number, misses: number}>}
 *   `vectors[i]` corresponds to `texts[i]`.
 */
async function embedCached(extractor, role, texts, transform, onProgress) {
  const cached = await cacheGetMany(role, texts);
  const total = texts.length;
  const vectors = Array.from({ length: total });
  const missIdx = [];
  const missTexts = [];
  for (let i = 0; i < total; i++) {
    const hit = cached.get(texts[i]);
    if (hit != null) {
      vectors[i] = hit;
      continue;
    }
    missIdx.push(i);
    missTexts.push(transform(texts[i]));
  }
  const misses = missIdx.length;
  // Count positions, not unique cached keys: `cached.size` under-counts when
  // the same text appears more than once in `texts`.
  const hits = total - misses;
  onProgress(hits, total);
  if (misses > 0) {
    const embedded = await embedRawBatch(extractor, missTexts, (done) => {
      onProgress(hits + done, total);
    });
    const toPersist = [];
    for (let m = 0; m < embedded.length; m++) {
      const i = missIdx[m];
      vectors[i] = embedded[m];
      toPersist.push([texts[i], embedded[m]]);
    }
    await cachePutMany(role, toPersist);
  }
  return {
    vectors,
    hits,
    misses
  };
}
156
/**
 * Pick the execution device. "webgpu" is chosen only when it was requested (or
 * nothing was requested) and `navigator.gpu.requestAdapter()` yields an
 * adapter; every other case, including a rejected adapter request, gives "wasm".
 *
 * @param {string | null | undefined} requested
 * @returns {Promise<"webgpu" | "wasm">}
 */
async function selectDevice(requested) {
  const fallback = "wasm";
  const gpuAllowed = requested == null || requested === "webgpu";
  if (!gpuAllowed) return fallback;
  const gpu = globalThis.navigator?.gpu;
  if (gpu == null) return fallback;
  const adapter = await gpu.requestAdapter().catch(() => null);
  return adapter != null ? "webgpu" : fallback;
}
166
/**
 * Lazily import `@huggingface/transformers`, turn on its browser cache, and
 * build a feature-extraction pipeline for MODEL_ID using fp32 weights.
 *
 * @param {string} device - Device identifier forwarded to the pipeline factory.
 * @returns {Promise<Function>} The pipeline returned by the library.
 */
async function loadExtractor(device) {
  const { pipeline: createPipeline, env: runtimeEnv } = await import("@huggingface/transformers");
  runtimeEnv.useBrowserCache = true;
  const pipelineOptions = {
    device,
    dtype: "fp32"
  };
  return await createPipeline("feature-extraction", MODEL_ID, pipelineOptions);
}
174
/**
 * Run the two SQL queries that feed the analysis and shape their rows.
 *
 * The first query selects the highest-impression search queries (at least
 * `minImpressions`, at most `maxQueries`) together with the URL that received
 * the most impressions for each. The second selects the highest-impression
 * URLs. URLs are normalized; URL rows that normalize to the same value have
 * their impressions summed before the final ordering and cut to `maxUrls`.
 *
 * @param {(sql: string, params: number[]) => Promise<object[]>} executeSql
 * @param {{minImpressions: number, maxQueries: number, maxUrls: number}} options
 * @returns {Promise<{queries: object[], urls: string[], sqlMs: number}>}
 *   `sqlMs` is the wall-clock time spent in both queries.
 */
async function fetchContentGapInputs(executeSql, options) {
  const topQueriesSql = `
WITH query_totals AS (
SELECT query,
SUM(impressions)::BIGINT AS total_impressions,
SUM(clicks)::BIGINT AS total_clicks,
SUM(sum_position) / NULLIF(SUM(impressions), 0) + 1 AS avg_position
FROM main.page_keywords
WHERE query IS NOT NULL AND query <> ''
GROUP BY query
HAVING SUM(impressions) >= ?
ORDER BY total_impressions DESC
LIMIT ?
),
per_query_url AS (
SELECT pk.query, pk.url,
SUM(pk.impressions)::BIGINT AS url_impressions,
SUM(pk.sum_position) / NULLIF(SUM(pk.impressions), 0) + 1 AS url_position,
ROW_NUMBER() OVER (PARTITION BY pk.query ORDER BY SUM(pk.impressions) DESC) AS rnk
FROM main.page_keywords pk
JOIN query_totals qt USING (query)
WHERE pk.url IS NOT NULL AND pk.url <> ''
GROUP BY pk.query, pk.url
)
SELECT q.query, q.total_impressions AS impressions, q.total_clicks AS clicks, q.avg_position,
pu.url AS current_url, pu.url_position AS current_position
FROM query_totals q
JOIN per_query_url pu USING (query)
WHERE pu.rnk = 1
`;
  const topUrlsSql = `
SELECT url, SUM(impressions)::BIGINT AS impressions
FROM main.page_keywords
WHERE url IS NOT NULL AND url <> ''
GROUP BY url
ORDER BY impressions DESC
LIMIT ?
`;

  const startedAt = performance.now();
  const queryRows = await executeSql(topQueriesSql, [Number(options.minImpressions), Number(options.maxQueries)]);
  const urlRows = await executeSql(topUrlsSql, [Number(options.maxUrls)]);
  const sqlMs = performance.now() - startedAt;

  // Numeric columns are passed through Number(), which also converts BigInt values.
  const toQueryRecord = (row) => ({
    query: String(row.query),
    impressions: Number(row.impressions),
    clicks: Number(row.clicks),
    avgPosition: Number(row.avg_position),
    currentUrl: normalizeUrl(String(row.current_url))
  });
  const queries = queryRows.map(toQueryRecord);

  // Merge rows whose URLs normalize to the same value.
  const impressionsByUrl = new Map();
  for (const row of urlRows) {
    const key = normalizeUrl(String(row.url));
    const previous = impressionsByUrl.get(key) ?? 0;
    impressionsByUrl.set(key, previous + Number(row.impressions));
  }
  const ranked = Array.from(impressionsByUrl.entries());
  ranked.sort((left, right) => right[1] - left[1]);
  const urls = ranked.slice(0, Number(options.maxUrls)).map((entry) => entry[0]);

  return {
    queries,
    urls,
    sqlMs
  };
}
232
/**
 * For each query, find the candidate URL most similar to it and report a gap
 * when that URL differs from the query's current URL by at least
 * `minDivergence` similarity.
 *
 * @param {Array<{query: string, impressions: number, clicks: number, avgPosition: number, currentUrl: string}>} queries
 * @param {string[]} urls - Candidate URLs; `urlEmbeddings[j]` belongs to `urls[j]`.
 * @param {ArrayLike<number>[]} queryEmbeddings - `queryEmbeddings[i]` belongs to `queries[i]`.
 * @param {ArrayLike<number>[]} urlEmbeddings
 * @param {number} minDivergence - Minimum `suggestedSimilarity - currentSimilarity` to report.
 * @returns {object[]} Gaps sorted by descending `impact` (`impressions * divergence`).
 *   Empty when there are no candidate URLs.
 */
function rankContentGaps(queries, urls, queryEmbeddings, urlEmbeddings, minDivergence) {
  // Without candidates there is no "best" URL to suggest; avoid reading scored[0] of an empty list.
  if (urls.length === 0) return [];
  const urlIndex = new Map();
  for (let i = 0; i < urls.length; i++) urlIndex.set(urls[i], i);
  const gaps = [];
  for (let i = 0; i < queries.length; i++) {
    const qr = queries[i];
    const qEmb = queryEmbeddings[i];
    const scored = urls.map((url, j) => ({
      url,
      similarity: cosineNormalized(qEmb, urlEmbeddings[j])
    }));
    // Read the current URL's score before sorting, while positions still match
    // `urls`. A current URL outside the candidate list scores 0.
    const currentIdx = urlIndex.get(qr.currentUrl);
    const currentSimilarity = currentIdx != null ? scored[currentIdx].similarity : 0;
    scored.sort((a, b) => b.similarity - a.similarity);
    const { url: suggestedUrl, similarity: suggestedSimilarity } = scored[0];
    if (suggestedUrl === qr.currentUrl) continue;
    const divergence = suggestedSimilarity - currentSimilarity;
    if (divergence < minDivergence) continue;
    gaps.push({
      query: qr.query,
      impressions: qr.impressions,
      clicks: qr.clicks,
      avgPosition: qr.avgPosition,
      currentUrl: qr.currentUrl,
      currentSimilarity,
      suggestedUrl,
      suggestedSimilarity,
      // Next three best candidates after the suggestion.
      alternatives: scored.slice(1, 4),
      divergence,
      impact: qr.impressions * divergence
    });
  }
  gaps.sort((a, b) => b.impact - a.impact);
  return gaps;
}
269
/**
 * End-to-end content-gap analysis: pick a device, load the embedding model,
 * pull queries and URLs via SQL, embed both, and rank the gaps.
 *
 * Progress is reported through `opts.onProgress` with a `phase` of
 * "loading-model", "fetching-data", "embedding-queries", "embedding-urls",
 * "computing-gaps" and finally "done".
 *
 * @param {{executeSql?: Function, name?: string}} source - Must expose `executeSql`.
 * @param {object} [opts]
 * @param {number} [opts.maxQueries=1500]
 * @param {number} [opts.maxUrls=400]
 * @param {number} [opts.minImpressions=50]
 * @param {number} [opts.minDivergence=0.12]
 * @param {string} [opts.device] - Requested device, forwarded to selectDevice.
 * @param {(progress: object) => void} [opts.onProgress]
 * @returns {Promise<{results: object[], meta: object}>} At most 150 gaps plus timing/cache metadata.
 * @throws {ContentGapSourceUnsupportedError} When `source.executeSql` is missing.
 */
async function analyzeContentGap(source, opts = {}) {
  if (!source.executeSql) {
    throw new ContentGapSourceUnsupportedError(source.name ?? "unknown");
  }
  const runSql = source.executeSql.bind(source);
  const {
    maxQueries = 1500,
    maxUrls = 400,
    minImpressions = 50,
    minDivergence = .12,
    device,
    onProgress
  } = opts;
  const report = (progress) => notify(onProgress, progress);

  // Model loading (device probing is included in modelMs).
  report({ phase: "loading-model", message: "Checking device..." });
  const modelStart = performance.now();
  const chosenDevice = await selectDevice(device);
  report({
    phase: "loading-model",
    message: `Loading ${MODEL_ID} on ${chosenDevice} (~110MB, cached after first run)...`
  });
  const extractor = await loadExtractor(chosenDevice);
  const modelMs = performance.now() - modelStart;

  // Data fetch.
  report({
    phase: "fetching-data",
    message: `Running SQL (device: ${chosenDevice})...`,
    modelMs
  });
  const fetched = await fetchContentGapInputs(runSql, {
    maxQueries,
    maxUrls,
    minImpressions
  });
  const { queries, urls, sqlMs } = fetched;

  const hasData = queries.length !== 0 && urls.length !== 0;
  if (!hasData) {
    report({
      phase: "done",
      message: "Not enough data to analyze.",
      modelMs,
      sqlMs
    });
    const emptyMeta = {
      modelMs,
      sqlMs,
      embedMs: 0,
      computeMs: 0,
      cacheHits: 0,
      totalInputs: 0,
      device: chosenDevice,
      modelId: MODEL_ID
    };
    return { results: [], meta: emptyMeta };
  }

  // Embedding: queries get the retrieval prefix, URL texts are embedded as-is.
  const queryTexts = queries.map((entry) => entry.query);
  const urlTexts = urls.map(deriveUrlText);
  const queryMessage = `Embedding ${queryTexts.length} queries on ${chosenDevice}...`;
  const urlMessage = `Embedding ${urls.length} URLs...`;

  report({
    phase: "embedding-queries",
    message: queryMessage,
    total: queryTexts.length,
    done: 0,
    modelMs,
    sqlMs
  });
  const embedStart = performance.now();
  const onQueryProgress = (done, total) => {
    report({
      phase: "embedding-queries",
      message: queryMessage,
      done,
      total,
      modelMs,
      sqlMs
    });
  };
  const queryEmbed = await embedCached(extractor, "query", queryTexts, (t) => QUERY_PREFIX + t, onQueryProgress);

  report({
    phase: "embedding-urls",
    message: urlMessage,
    total: urls.length,
    done: 0,
    modelMs,
    sqlMs
  });
  const onUrlProgress = (done, total) => {
    report({
      phase: "embedding-urls",
      message: urlMessage,
      done,
      total,
      modelMs,
      sqlMs
    });
  };
  const urlEmbed = await embedCached(extractor, "passage", urlTexts, (t) => t, onUrlProgress);
  const embedMs = performance.now() - embedStart;

  // Ranking.
  report({
    phase: "computing-gaps",
    message: "Computing semantic similarities...",
    modelMs,
    sqlMs,
    embedMs
  });
  const computeStart = performance.now();
  const gaps = rankContentGaps(queries, urls, queryEmbed.vectors, urlEmbed.vectors, minDivergence);
  const computeMs = performance.now() - computeStart;

  const totalHits = queryEmbed.hits + urlEmbed.hits;
  const totalInputs = queryTexts.length + urls.length;
  let cacheNote = "";
  if (totalHits > 0) cacheNote = ` · ${totalHits}/${totalInputs} cache hits`;
  report({
    phase: "done",
    message: `Found ${gaps.length} content gaps across ${queries.length} queries${cacheNote}`,
    modelMs,
    sqlMs,
    embedMs,
    computeMs
  });

  // The message reports every gap found; only the top 150 are returned.
  const meta = {
    modelMs,
    sqlMs,
    embedMs,
    computeMs,
    cacheHits: totalHits,
    totalInputs,
    device: chosenDevice,
    modelId: MODEL_ID
  };
  return { results: gaps.slice(0, 150), meta };
}
391
+ export { ContentGapSourceUnsupportedError, analyzeContentGap, cosineNormalized, deriveUrlText, normalizeUrl, rankContentGaps };