@gscdump/engine 0.19.6 → 0.19.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/entities.d.mts +7 -6
- package/dist/entities.mjs +63 -65
- package/dist/rollups.mjs +5 -5
- package/package.json +3 -3
package/dist/entities.d.mts
CHANGED
|
@@ -200,7 +200,8 @@ interface SitemapHistoryDoc {
|
|
|
200
200
|
}
|
|
201
201
|
declare function sitemapIndexKey(ctx: TenantCtx): string;
|
|
202
202
|
declare function sitemapHistoryKey(ctx: TenantCtx, feedpathHash: string, capturedAtMs: number): string;
|
|
203
|
-
declare function sitemapUrlsIndexKey(ctx: TenantCtx): string;
|
|
203
|
+
declare function sitemapUrlsIndexPrefix(ctx: TenantCtx): string;
|
|
204
|
+
declare function sitemapUrlsIndexKey(ctx: TenantCtx, feedpathHash: string): string;
|
|
204
205
|
declare function sitemapUrlsDeltaKey(ctx: TenantCtx, feedpathHash: string, date: string): string;
|
|
205
206
|
/** Parsed URL entry from a sitemap XML. */
|
|
206
207
|
interface ParsedUrl {
|
|
@@ -273,12 +274,12 @@ interface SitemapStore {
|
|
|
273
274
|
/** Stream all delta entries within `[from, to]` (YYYY-MM-DD inclusive). */
|
|
274
275
|
loadDeltas: (ctx: TenantCtx, dateRange?: DateRange) => AsyncIterable<DeltaEntry>;
|
|
275
276
|
/**
|
|
276
|
-
* Fold
|
|
277
|
-
* `
|
|
277
|
+
* Fold accumulated deltas into the prior index, one feedpath at a time:
|
|
278
|
+
* rewrites each touched feedpath's `by-feed/<hash>/index.parquet` and deletes
|
|
279
|
+
* the consumed delta files. Bounded per feedpath, so it stays within memory
|
|
280
|
+
* regardless of total site URL count.
|
|
278
281
|
*/
|
|
279
282
|
compactUrls: (ctx: TenantCtx) => Promise<void>;
|
|
280
|
-
/** DuckDB-resolvable URI for the URLs index; `undefined` if backend lacks one. */
|
|
281
|
-
urlsParquetUri: (ctx: TenantCtx) => string | undefined;
|
|
282
283
|
}
|
|
283
284
|
interface CreateSitemapStoreOptions {
|
|
284
285
|
dataSource: DataSource;
|
|
@@ -330,4 +331,4 @@ interface CreateEmptyTypesStoreOptions {
|
|
|
330
331
|
now?: () => number;
|
|
331
332
|
}
|
|
332
333
|
declare function createEmptyTypesStore(opts: CreateEmptyTypesStoreOptions): EmptyTypesStore;
|
|
333
|
-
export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, INSPECTION_HISTORY_MAX_BYTES, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionHistoryShard, InspectionIndex, InspectionParquetRow, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey };
|
|
334
|
+
export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, INSPECTION_HISTORY_MAX_BYTES, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionHistoryShard, InspectionIndex, InspectionParquetRow, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
|
package/dist/entities.mjs
CHANGED
|
@@ -190,8 +190,11 @@ function sitemapHistoryKey(ctx, feedpathHash, capturedAtMs) {
|
|
|
190
190
|
function sitemapUrlsPrefix(ctx) {
|
|
191
191
|
return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/sitemaps/urls` : `u_${ctx.userId}/entities/sitemaps/urls`;
|
|
192
192
|
}
|
|
193
|
-
function sitemapUrlsIndexKey(ctx) {
|
|
194
|
-
return `${sitemapUrlsPrefix(ctx)}/
|
|
193
|
+
function sitemapUrlsIndexPrefix(ctx) {
|
|
194
|
+
return `${sitemapUrlsPrefix(ctx)}/by-feed`;
|
|
195
|
+
}
|
|
196
|
+
function sitemapUrlsIndexKey(ctx, feedpathHash) {
|
|
197
|
+
return `${sitemapUrlsIndexPrefix(ctx)}/${feedpathHash}/index.parquet`;
|
|
195
198
|
}
|
|
196
199
|
function sitemapUrlsDeltaKey(ctx, feedpathHash, date) {
|
|
197
200
|
return `${sitemapUrlsPrefix(ctx)}/deltas/${date}__${feedpathHash}.parquet`;
|
|
@@ -417,13 +420,12 @@ function createSitemapStore(opts) {
|
|
|
417
420
|
async *loadUrls(ctx, feedpath, opts) {
|
|
418
421
|
const fpHash = hash(feedpath);
|
|
419
422
|
const includeRemoved = opts?.includeRemoved ?? false;
|
|
420
|
-
const indexBytes = await ds.read(sitemapUrlsIndexKey(ctx)).catch(() => void 0);
|
|
421
|
-
const indexRows = indexBytes ? await decodeParquetToRows(indexBytes
|
|
423
|
+
const indexBytes = await ds.read(sitemapUrlsIndexKey(ctx, fpHash)).catch(() => void 0);
|
|
424
|
+
const indexRows = indexBytes ? await decodeParquetToRows(indexBytes) : [];
|
|
422
425
|
const deltaKeys = (await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`)).sort();
|
|
423
426
|
const live = /* @__PURE__ */ new Map();
|
|
424
427
|
const removedMap = /* @__PURE__ */ new Map();
|
|
425
428
|
for (const row of indexRows) {
|
|
426
|
-
if (row.feedpath_hash !== fpHash) continue;
|
|
427
429
|
const rec = rowToUrlRecord(row);
|
|
428
430
|
if (rec.removedAt != null) removedMap.set(rec.urlHash, rec);
|
|
429
431
|
else live.set(rec.urlHash, rec);
|
|
@@ -492,71 +494,67 @@ function createSitemapStore(opts) {
|
|
|
492
494
|
}
|
|
493
495
|
},
|
|
494
496
|
async compactUrls(ctx) {
|
|
495
|
-
const
|
|
496
|
-
const
|
|
497
|
-
const indexRows = indexBytes ? await decodeParquetToRows(indexBytes) : [];
|
|
498
|
-
const stateKey = (fp, u) => `${fp}::${u}`;
|
|
499
|
-
const live = /* @__PURE__ */ new Map();
|
|
500
|
-
const removed = /* @__PURE__ */ new Map();
|
|
501
|
-
for (const row of indexRows) {
|
|
502
|
-
const rec = rowToUrlRecord(row);
|
|
503
|
-
const k = stateKey(rec.feedpathHash, rec.urlHash);
|
|
504
|
-
if (rec.removedAt != null) removed.set(k, rec);
|
|
505
|
-
else live.set(k, rec);
|
|
506
|
-
}
|
|
507
|
-
const deltaKeys = (await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`)).sort();
|
|
508
|
-
const consumed = [];
|
|
497
|
+
const deltaKeys = await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`);
|
|
498
|
+
const deltasByFeed = /* @__PURE__ */ new Map();
|
|
509
499
|
for (const key of deltaKeys) {
|
|
510
500
|
const m = SITEMAP_URLS_DELTA_PREFIX_RE.exec(key);
|
|
511
501
|
if (!m) continue;
|
|
512
|
-
const
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
const
|
|
536
|
-
|
|
537
|
-
if (
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
502
|
+
const list = deltasByFeed.get(m[2]) ?? [];
|
|
503
|
+
list.push(key);
|
|
504
|
+
deltasByFeed.set(m[2], list);
|
|
505
|
+
}
|
|
506
|
+
for (const [fpHash, feedDeltaKeys] of deltasByFeed) {
|
|
507
|
+
const indexKey = sitemapUrlsIndexKey(ctx, fpHash);
|
|
508
|
+
const indexBytes = await ds.read(indexKey).catch(() => void 0);
|
|
509
|
+
const indexRows = indexBytes ? await decodeParquetToRows(indexBytes) : [];
|
|
510
|
+
const live = /* @__PURE__ */ new Map();
|
|
511
|
+
const removed = /* @__PURE__ */ new Map();
|
|
512
|
+
for (const row of indexRows) {
|
|
513
|
+
const rec = rowToUrlRecord(row);
|
|
514
|
+
if (rec.removedAt != null) removed.set(rec.urlHash, rec);
|
|
515
|
+
else live.set(rec.urlHash, rec);
|
|
516
|
+
}
|
|
517
|
+
const consumed = [];
|
|
518
|
+
for (const key of feedDeltaKeys.sort()) {
|
|
519
|
+
const bytes = await ds.read(key).catch(() => void 0);
|
|
520
|
+
if (!bytes) continue;
|
|
521
|
+
consumed.push(key);
|
|
522
|
+
const rows = await decodeParquetToRows(bytes);
|
|
523
|
+
for (const r of rows) {
|
|
524
|
+
const urlHash = String(r.url_hash);
|
|
525
|
+
const at = Number(r.at);
|
|
526
|
+
const op = String(r.op);
|
|
527
|
+
if (op === "added") {
|
|
528
|
+
const prev = live.get(urlHash) ?? removed.get(urlHash);
|
|
529
|
+
removed.delete(urlHash);
|
|
530
|
+
live.set(urlHash, {
|
|
531
|
+
feedpath: String(r.feedpath),
|
|
532
|
+
feedpathHash: fpHash,
|
|
533
|
+
urlHash,
|
|
534
|
+
loc: String(r.loc),
|
|
535
|
+
lastmod: r.lastmod == null ? void 0 : String(r.lastmod),
|
|
536
|
+
firstSeenAt: prev?.firstSeenAt ?? at,
|
|
537
|
+
lastSeenAt: at
|
|
538
|
+
});
|
|
539
|
+
} else if (op === "removed") {
|
|
540
|
+
const prev = live.get(urlHash);
|
|
541
|
+
live.delete(urlHash);
|
|
542
|
+
if (prev) removed.set(urlHash, {
|
|
543
|
+
...prev,
|
|
544
|
+
removedAt: at
|
|
545
|
+
});
|
|
546
|
+
}
|
|
541
547
|
}
|
|
542
548
|
}
|
|
549
|
+
const merged = [...live.values(), ...removed.values()];
|
|
550
|
+
merged.sort((a, b) => a.urlHash < b.urlHash ? -1 : a.urlHash > b.urlHash ? 1 : 0);
|
|
551
|
+
const bytes = encodeRowsToParquetFlex(merged.map(urlRecordToRow), {
|
|
552
|
+
columns: URLS_INDEX_COLUMNS,
|
|
553
|
+
sortKey: ["feedpath_hash", "url_hash"]
|
|
554
|
+
});
|
|
555
|
+
await ds.write(indexKey, bytes);
|
|
556
|
+
if (consumed.length > 0) await ds.delete(consumed);
|
|
543
557
|
}
|
|
544
|
-
const merged = [...live.values(), ...removed.values()];
|
|
545
|
-
merged.sort((a, b) => {
|
|
546
|
-
if (a.feedpathHash !== b.feedpathHash) return a.feedpathHash < b.feedpathHash ? -1 : 1;
|
|
547
|
-
if (a.urlHash !== b.urlHash) return a.urlHash < b.urlHash ? -1 : 1;
|
|
548
|
-
return 0;
|
|
549
|
-
});
|
|
550
|
-
const bytes = encodeRowsToParquetFlex(merged.map(urlRecordToRow), {
|
|
551
|
-
columns: URLS_INDEX_COLUMNS,
|
|
552
|
-
sortKey: ["feedpath_hash", "url_hash"]
|
|
553
|
-
});
|
|
554
|
-
await ds.write(indexKey, bytes);
|
|
555
|
-
if (consumed.length > 0) await ds.delete(consumed);
|
|
556
|
-
},
|
|
557
|
-
urlsParquetUri(ctx) {
|
|
558
|
-
const key = sitemapUrlsIndexKey(ctx);
|
|
559
|
-
return ds.uri ? ds.uri(key) : void 0;
|
|
560
558
|
}
|
|
561
559
|
};
|
|
562
560
|
}
|
|
@@ -640,4 +638,4 @@ function createEmptyTypesStore(opts) {
|
|
|
640
638
|
}
|
|
641
639
|
};
|
|
642
640
|
}
|
|
643
|
-
export { INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey };
|
|
641
|
+
export { INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
|
package/dist/rollups.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { encodeRowsToParquetFlex } from "./adapters/hyparquet.mjs";
|
|
2
|
-
import { createIndexingMetadataStore, createSitemapStore, inspectionParquetKey,
|
|
2
|
+
import { createIndexingMetadataStore, createSitemapStore, inspectionParquetKey, sitemapUrlsIndexPrefix } from "./entities.mjs";
|
|
3
3
|
import { MS_PER_DAY } from "gscdump";
|
|
4
4
|
function rollupPrefix(ctx, searchType) {
|
|
5
5
|
const base = ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/rollups` : `u_${ctx.userId}/rollups`;
|
|
@@ -612,8 +612,8 @@ const indexPercentRollup = {
|
|
|
612
612
|
windowDays: 90,
|
|
613
613
|
sliceOrthogonal: true,
|
|
614
614
|
async build({ engine, ctx, dataSource, builtAt, searchType }) {
|
|
615
|
-
const
|
|
616
|
-
if (
|
|
615
|
+
const urlsKeys = await dataSource.list(sitemapUrlsIndexPrefix(ctx));
|
|
616
|
+
if (urlsKeys.length === 0) return {
|
|
617
617
|
totalSitemapUrls: 0,
|
|
618
618
|
days: []
|
|
619
619
|
};
|
|
@@ -633,7 +633,7 @@ const indexPercentRollup = {
|
|
|
633
633
|
},
|
|
634
634
|
URLS: {
|
|
635
635
|
table: "pages",
|
|
636
|
-
keys:
|
|
636
|
+
keys: urlsKeys
|
|
637
637
|
}
|
|
638
638
|
},
|
|
639
639
|
...searchType !== void 0 ? { searchType } : {},
|
|
@@ -654,7 +654,7 @@ const indexPercentRollup = {
|
|
|
654
654
|
table: "pages",
|
|
655
655
|
fileSets: { URLS: {
|
|
656
656
|
table: "pages",
|
|
657
|
-
keys:
|
|
657
|
+
keys: urlsKeys
|
|
658
658
|
} },
|
|
659
659
|
sql: `
|
|
660
660
|
SELECT COUNT(*)::BIGINT AS total
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@gscdump/engine",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.19.6",
|
|
4
|
+
"version": "0.19.7",
|
|
5
5
|
"description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -169,8 +169,8 @@
|
|
|
169
169
|
"dependencies": {
|
|
170
170
|
"drizzle-orm": "^0.45.2",
|
|
171
171
|
"proper-lockfile": "^4.1.2",
|
|
172
|
-
"gscdump": "0.19.6",
|
|
173
|
-
"@gscdump/contracts": "0.19.6"
|
|
172
|
+
"gscdump": "0.19.7",
|
|
173
|
+
"@gscdump/contracts": "0.19.7"
|
|
174
174
|
},
|
|
175
175
|
"devDependencies": {
|
|
176
176
|
"@duckdb/duckdb-wasm": "^1.32.0",
|