@gscdump/engine 0.11.0 → 0.11.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/engine.mjs +1 -0
- package/dist/_chunks/storage.d.mts +9 -0
- package/dist/rollups.mjs +24 -16
- package/package.json +2 -2
package/dist/_chunks/engine.mjs
CHANGED
|
@@ -318,6 +318,7 @@ function createStorageEngine(opts) {
|
|
|
318
318
|
opts.signal?.throwIfAborted();
|
|
319
319
|
const entries = Object.entries(opts.fileSets);
|
|
320
320
|
const perSet = await Promise.all(entries.map(async ([name, ref]) => {
|
|
321
|
+
if (ref.keys !== void 0) return [name, ref.keys];
|
|
321
322
|
return [name, (await manifestStore.listLive({
|
|
322
323
|
userId: opts.ctx.userId,
|
|
323
324
|
siteId: opts.ctx.siteId,
|
|
@@ -375,6 +375,15 @@ interface QueryExecutor {
|
|
|
375
375
|
interface FileSetRef {
|
|
376
376
|
table: TableName;
|
|
377
377
|
partitions?: string[];
|
|
378
|
+
/**
|
|
379
|
+
* Pre-resolved object keys, bypassing the manifest lookup. When provided,
|
|
380
|
+
* runSQL skips `manifestStore.listLive` for this entry and uses these keys
|
|
381
|
+
* directly. Use for entity-store sidecars (`entities/inspections/index.parquet`,
|
|
382
|
+
* `entities/sitemaps/urls/index.parquet`) which aren't registered in the
|
|
383
|
+
* analytics manifest. `table` is still required as the schema sentinel for
|
|
384
|
+
* the empty-fallback rewrite, but isn't consulted when `keys` is non-empty.
|
|
385
|
+
*/
|
|
386
|
+
keys?: string[];
|
|
378
387
|
}
|
|
379
388
|
interface RunSQLOptions {
|
|
380
389
|
ctx: TenantCtx;
|
package/dist/rollups.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { encodeRowsToParquetFlex } from "./adapters/hyparquet.mjs";
|
|
2
|
-
import { createIndexingMetadataStore,
|
|
2
|
+
import { createIndexingMetadataStore, createSitemapStore, inspectionParquetKey, sitemapUrlsIndexKey } from "./entities.mjs";
|
|
3
3
|
import { MS_PER_DAY } from "gscdump";
|
|
4
4
|
function rollupPrefix(ctx) {
|
|
5
5
|
return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/rollups` : `u_${ctx.userId}/rollups`;
|
|
@@ -335,16 +335,12 @@ const indexingMetadataRollup = {
|
|
|
335
335
|
};
|
|
336
336
|
}
|
|
337
337
|
};
|
|
338
|
-
function sqlString(s) {
|
|
339
|
-
return `'${s.replace(/'/g, "''")}'`;
|
|
340
|
-
}
|
|
341
338
|
const indexingHealthRollup = {
|
|
342
339
|
id: "indexing_health",
|
|
343
340
|
windowDays: 90,
|
|
344
341
|
async build({ engine, ctx, dataSource, builtAt }) {
|
|
345
|
-
const
|
|
346
|
-
if (!
|
|
347
|
-
const cutoff = utcDateMinusDays(builtAt, 90);
|
|
342
|
+
const key = inspectionParquetKey(ctx);
|
|
343
|
+
if (!await dataSource.head?.(key)) return { days: [] };
|
|
348
344
|
const sql = `
|
|
349
345
|
SELECT
|
|
350
346
|
substr(inspectedAt, 1, 10) AS date,
|
|
@@ -356,15 +352,18 @@ const indexingHealthRollup = {
|
|
|
356
352
|
SUM(CASE WHEN mobileUsabilityVerdict = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS mobile_passes,
|
|
357
353
|
SUM(CASE WHEN richResultsVerdict = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS rich_results_passes,
|
|
358
354
|
SUM(CASE WHEN userCanonical IS NOT NULL AND googleCanonical IS NOT NULL AND userCanonical <> googleCanonical THEN 1 ELSE 0 END)::BIGINT AS canonical_mismatches
|
|
359
|
-
FROM read_parquet(
|
|
360
|
-
WHERE substr(inspectedAt, 1, 10) >= '${cutoff}'
|
|
355
|
+
FROM read_parquet({{INSPECTIONS}}, union_by_name = true)
|
|
356
|
+
WHERE substr(inspectedAt, 1, 10) >= '${utcDateMinusDays(builtAt, 90)}'
|
|
361
357
|
GROUP BY 1
|
|
362
358
|
ORDER BY 1
|
|
363
359
|
`;
|
|
364
360
|
return { days: (await engine.runSQL({
|
|
365
361
|
ctx,
|
|
366
362
|
table: "pages",
|
|
367
|
-
fileSets: {
|
|
363
|
+
fileSets: { INSPECTIONS: {
|
|
364
|
+
table: "pages",
|
|
365
|
+
keys: [key]
|
|
366
|
+
} },
|
|
368
367
|
sql
|
|
369
368
|
})).rows.map((r) => ({
|
|
370
369
|
date: String(r.date),
|
|
@@ -383,8 +382,8 @@ const indexPercentRollup = {
|
|
|
383
382
|
id: "index_percent",
|
|
384
383
|
windowDays: 90,
|
|
385
384
|
async build({ engine, ctx, dataSource, builtAt }) {
|
|
386
|
-
const
|
|
387
|
-
if (!
|
|
385
|
+
const urlsKey = sitemapUrlsIndexKey(ctx);
|
|
386
|
+
if (!await dataSource.head?.(urlsKey)) return {
|
|
388
387
|
totalSitemapUrls: 0,
|
|
389
388
|
days: []
|
|
390
389
|
};
|
|
@@ -392,13 +391,19 @@ const indexPercentRollup = {
|
|
|
392
391
|
const numerator = await engine.runSQL({
|
|
393
392
|
ctx,
|
|
394
393
|
table: "pages",
|
|
395
|
-
fileSets: {
|
|
394
|
+
fileSets: {
|
|
395
|
+
PAGES: { table: "pages" },
|
|
396
|
+
URLS: {
|
|
397
|
+
table: "pages",
|
|
398
|
+
keys: [urlsKey]
|
|
399
|
+
}
|
|
400
|
+
},
|
|
396
401
|
sql: `
|
|
397
402
|
SELECT
|
|
398
403
|
p.date AS date,
|
|
399
404
|
COUNT(DISTINCT p.url)::BIGINT AS clicked_urls
|
|
400
405
|
FROM read_parquet({{PAGES}}, union_by_name = true) p
|
|
401
|
-
INNER JOIN read_parquet(
|
|
406
|
+
INNER JOIN read_parquet({{URLS}}, union_by_name = true) s
|
|
402
407
|
ON s.loc = p.url AND s.removed_at IS NULL
|
|
403
408
|
WHERE p.clicks > 0 AND p.date >= '${cutoff}'
|
|
404
409
|
GROUP BY p.date
|
|
@@ -408,10 +413,13 @@ const indexPercentRollup = {
|
|
|
408
413
|
const denom = await engine.runSQL({
|
|
409
414
|
ctx,
|
|
410
415
|
table: "pages",
|
|
411
|
-
fileSets: {
|
|
416
|
+
fileSets: { URLS: {
|
|
417
|
+
table: "pages",
|
|
418
|
+
keys: [urlsKey]
|
|
419
|
+
} },
|
|
412
420
|
sql: `
|
|
413
421
|
SELECT COUNT(*)::BIGINT AS total
|
|
414
|
-
FROM read_parquet(
|
|
422
|
+
FROM read_parquet({{URLS}}, union_by_name = true)
|
|
415
423
|
WHERE removed_at IS NULL
|
|
416
424
|
`
|
|
417
425
|
});
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@gscdump/engine",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.11.0",
|
|
4
|
+
"version": "0.11.1",
|
|
5
5
|
"description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -169,7 +169,7 @@
|
|
|
169
169
|
"dependencies": {
|
|
170
170
|
"drizzle-orm": "^0.45.2",
|
|
171
171
|
"proper-lockfile": "^4.1.2",
|
|
172
|
-
"gscdump": "0.11.0"
|
|
172
|
+
"gscdump": "0.11.1"
|
|
173
173
|
},
|
|
174
174
|
"devDependencies": {
|
|
175
175
|
"@duckdb/duckdb-wasm": "^1.32.0",
|