@gscdump/engine 0.10.0 → 0.11.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/engine.mjs +1 -0
- package/dist/_chunks/storage.d.mts +9 -0
- package/dist/entities.d.mts +116 -2
- package/dist/entities.mjs +453 -1
- package/dist/index.d.mts +2 -1
- package/dist/index.mjs +2 -1
- package/dist/rollups.d.mts +34 -1
- package/dist/rollups.mjs +211 -3
- package/dist/schedule.d.mts +19 -0
- package/dist/schedule.mjs +100 -0
- package/package.json +7 -2
package/dist/_chunks/engine.mjs
CHANGED
|
@@ -318,6 +318,7 @@ function createStorageEngine(opts) {
|
|
|
318
318
|
opts.signal?.throwIfAborted();
|
|
319
319
|
const entries = Object.entries(opts.fileSets);
|
|
320
320
|
const perSet = await Promise.all(entries.map(async ([name, ref]) => {
|
|
321
|
+
if (ref.keys !== void 0) return [name, ref.keys];
|
|
321
322
|
return [name, (await manifestStore.listLive({
|
|
322
323
|
userId: opts.ctx.userId,
|
|
323
324
|
siteId: opts.ctx.siteId,
|
|
@@ -375,6 +375,15 @@ interface QueryExecutor {
|
|
|
375
375
|
interface FileSetRef {
|
|
376
376
|
table: TableName;
|
|
377
377
|
partitions?: string[];
|
|
378
|
+
/**
|
|
379
|
+
* Pre-resolved object keys, bypassing the manifest lookup. When provided,
|
|
380
|
+
* runSQL skips `manifestStore.listLive` for this entry and uses these keys
|
|
381
|
+
* directly. Use for entity-store sidecars (`entities/inspections/index.parquet`,
|
|
382
|
+
* `entities/sitemaps/urls/index.parquet`) which aren't registered in the
|
|
383
|
+
* analytics manifest. `table` is still required as the schema sentinel for
|
|
384
|
+
* the empty-fallback rewrite, but isn't consulted when `keys` is non-empty.
|
|
385
|
+
*/
|
|
386
|
+
keys?: string[];
|
|
378
387
|
}
|
|
379
388
|
interface RunSQLOptions {
|
|
380
389
|
ctx: TenantCtx;
|
package/dist/entities.d.mts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { a as DataSource } from "./_chunks/storage.mjs";
|
|
2
|
+
import { ScheduleState } from "./schedule.mjs";
|
|
2
3
|
import { TenantCtx } from "gscdump/contracts";
|
|
3
4
|
/**
|
|
4
5
|
* GSC URL inspection result fields we persist. Mirrors the
|
|
@@ -28,8 +29,17 @@ interface InspectionRecord {
|
|
|
28
29
|
* Free-form payload for fields we don't promote to first-class columns
|
|
29
30
|
* (e.g. `referringUrls`, `crawledAs`). Keeps the wire format forward-compat
|
|
30
31
|
* without bumping the schema for every API addition.
|
|
32
|
+
*
|
|
33
|
+
* Recognised keys:
|
|
34
|
+
* - `schedule`: optional `ScheduleState` from {@link inspectionPolicy}
|
|
35
|
+
* governing when this URL is next due for re-inspection. Undefined on
|
|
36
|
+
* pre-§0 records — readers must tolerate the missing field and fall
|
|
37
|
+
* back to default policy on first observe.
|
|
31
38
|
*/
|
|
32
|
-
raw?:
|
|
39
|
+
raw?: {
|
|
40
|
+
schedule?: ScheduleState;
|
|
41
|
+
[key: string]: unknown;
|
|
42
|
+
};
|
|
33
43
|
}
|
|
34
44
|
/** Wire shape persisted to disk/R2. */
|
|
35
45
|
interface InspectionIndex {
|
|
@@ -44,6 +54,7 @@ interface InspectionHistoryShard {
|
|
|
44
54
|
}
|
|
45
55
|
declare function inspectionIndexKey(ctx: TenantCtx): string;
|
|
46
56
|
declare function emptyTypesKey(ctx: TenantCtx): string;
|
|
57
|
+
declare function inspectionParquetKey(ctx: TenantCtx): string;
|
|
47
58
|
declare function inspectionHistoryKey(ctx: TenantCtx, yearMonth: string): string;
|
|
48
59
|
/**
|
|
49
60
|
* Stable URL hash used as the index key. Short, URL-safe, deterministic.
|
|
@@ -66,6 +77,34 @@ interface InspectionStore {
|
|
|
66
77
|
loadIndex: (ctx: TenantCtx) => Promise<InspectionIndex>;
|
|
67
78
|
/** Read the per-month history shard if it exists. */
|
|
68
79
|
loadHistory: (ctx: TenantCtx, yearMonth: string) => Promise<InspectionHistoryShard | undefined>;
|
|
80
|
+
/**
|
|
81
|
+
* Snapshot the current JSON index to a parquet sidecar at
|
|
82
|
+
* `entities/inspections/index.parquet`. One PUT. Sorted by `urlHash` so
|
|
83
|
+
* DuckDB row-group stats can prune URL-keyed JOINs efficiently.
|
|
84
|
+
*
|
|
85
|
+
* Internal seam: callers don't choose JSON-vs-parquet — the store materialises
|
|
86
|
+
* the parquet at end-of-batch (e.g. after `indexing/complete`) and readers
|
|
87
|
+
* pick the format that matches their access pattern (parquet for JOINs,
|
|
88
|
+
* JSON for full-index scans / point lookups).
|
|
89
|
+
*
|
|
90
|
+
* Returns the parquet object key (matches {@link parquetUri} after write).
|
|
91
|
+
*/
|
|
92
|
+
materialize: (ctx: TenantCtx) => Promise<{
|
|
93
|
+
key: string;
|
|
94
|
+
rowCount: number;
|
|
95
|
+
bytes: number;
|
|
96
|
+
}>;
|
|
97
|
+
/**
|
|
98
|
+
* DuckDB-resolvable URI for the materialised parquet sidecar, or
|
|
99
|
+
* `undefined` if the underlying `DataSource` has no native URI shape
|
|
100
|
+
* (in-memory tests). When defined, read paths can `read_parquet(<uri>)`
|
|
101
|
+
* directly without staging bytes through JS.
|
|
102
|
+
*
|
|
103
|
+
* Does not check existence — caller is responsible for ensuring
|
|
104
|
+
* `materialize` has run at least once. Returning a URI for a missing key
|
|
105
|
+
* is safe; DuckDB will surface a 404 / not-found at query time.
|
|
106
|
+
*/
|
|
107
|
+
parquetUri: (ctx: TenantCtx) => string | undefined;
|
|
69
108
|
}
|
|
70
109
|
interface CreateInspectionStoreOptions {
|
|
71
110
|
dataSource: DataSource;
|
|
@@ -100,6 +139,12 @@ interface SitemapRecord {
|
|
|
100
139
|
}>;
|
|
101
140
|
/** Raw payload for fields we don't promote to first-class columns. */
|
|
102
141
|
raw?: unknown;
|
|
142
|
+
/** Number of URLs observed in this feedpath at last snapshot. */
|
|
143
|
+
urlCount?: number;
|
|
144
|
+
/** Stable hash of the sorted normalized loc list at last snapshot. */
|
|
145
|
+
contentHash?: string;
|
|
146
|
+
/** Adaptive cadence state owned by `sitemapPolicy`. */
|
|
147
|
+
schedule?: ScheduleState;
|
|
103
148
|
}
|
|
104
149
|
interface SitemapIndex {
|
|
105
150
|
version: 1;
|
|
@@ -114,6 +159,57 @@ interface SitemapHistoryDoc {
|
|
|
114
159
|
}
|
|
115
160
|
declare function sitemapIndexKey(ctx: TenantCtx): string;
|
|
116
161
|
declare function sitemapHistoryKey(ctx: TenantCtx, feedpathHash: string, capturedAtMs: number): string;
|
|
162
|
+
declare function sitemapUrlsIndexKey(ctx: TenantCtx): string;
|
|
163
|
+
declare function sitemapUrlsDeltaKey(ctx: TenantCtx, feedpathHash: string, date: string): string;
|
|
164
|
+
/** Parsed URL entry from a sitemap XML. */
|
|
165
|
+
interface ParsedUrl {
|
|
166
|
+
loc: string;
|
|
167
|
+
/** ISO-8601 lastmod from the sitemap, if present. */
|
|
168
|
+
lastmod?: string;
|
|
169
|
+
}
|
|
170
|
+
/** A single URL row in the urls/index.parquet partition. */
|
|
171
|
+
interface SitemapUrlRecord {
|
|
172
|
+
feedpath: string;
|
|
173
|
+
feedpathHash: string;
|
|
174
|
+
urlHash: string;
|
|
175
|
+
loc: string;
|
|
176
|
+
lastmod?: string;
|
|
177
|
+
firstSeenAt: number;
|
|
178
|
+
lastSeenAt: number;
|
|
179
|
+
/** Set when the URL has been removed. Null/undefined = currently live. */
|
|
180
|
+
removedAt?: number;
|
|
181
|
+
}
|
|
182
|
+
interface SnapshotUrlsResult {
|
|
183
|
+
added: number;
|
|
184
|
+
removed: number;
|
|
185
|
+
kept: number;
|
|
186
|
+
contentHash: string;
|
|
187
|
+
/** True when contentHash matched prior; the call performed zero writes. */
|
|
188
|
+
unchanged: boolean;
|
|
189
|
+
}
|
|
190
|
+
interface DeltaEntry {
|
|
191
|
+
feedpath: string;
|
|
192
|
+
feedpathHash: string;
|
|
193
|
+
urlHash: string;
|
|
194
|
+
op: 'added' | 'removed';
|
|
195
|
+
loc: string;
|
|
196
|
+
lastmod?: string;
|
|
197
|
+
at: number;
|
|
198
|
+
}
|
|
199
|
+
interface DateRange {
|
|
200
|
+
/** YYYY-MM-DD inclusive. */
|
|
201
|
+
from?: string;
|
|
202
|
+
/** YYYY-MM-DD inclusive. */
|
|
203
|
+
to?: string;
|
|
204
|
+
}
|
|
205
|
+
interface LoadUrlsOptions {
|
|
206
|
+
includeRemoved?: boolean;
|
|
207
|
+
}
|
|
208
|
+
/**
|
|
209
|
+
* Hash a URL list for change detection. Sorts then folds via FNV-1a so it's
|
|
210
|
+
* deterministic, locale-free, and cheap on Workers.
|
|
211
|
+
*/
|
|
212
|
+
declare function hashUrlList(urls: readonly ParsedUrl[]): string;
|
|
117
213
|
interface SitemapStore {
|
|
118
214
|
/**
|
|
119
215
|
* Persist a snapshot run. Updates the index + writes one immutable
|
|
@@ -124,6 +220,24 @@ interface SitemapStore {
|
|
|
124
220
|
loadIndex: (ctx: TenantCtx) => Promise<SitemapIndex>;
|
|
125
221
|
/** Fetch the latest snapshot for a feedpath, or undefined. */
|
|
126
222
|
getLatest: (ctx: TenantCtx, path: string) => Promise<SitemapRecord | undefined>;
|
|
223
|
+
/**
|
|
224
|
+
* Diff incoming URLs against the prior `urls/index.parquet` partition for
|
|
225
|
+
* `feedpath`; on change, writes a single delta parquet under
|
|
226
|
+
* `urls/deltas/YYYY-MM-DD__{feedpathHash}.parquet`. Skipped (0 PUTs) when
|
|
227
|
+
* `contentHash` matches prior.
|
|
228
|
+
*/
|
|
229
|
+
snapshotUrls: (ctx: TenantCtx, feedpath: string, urls: readonly ParsedUrl[]) => Promise<SnapshotUrlsResult>;
|
|
230
|
+
/** Stream live (and optionally removed) URL rows for a feedpath. */
|
|
231
|
+
loadUrls: (ctx: TenantCtx, feedpath: string, opts?: LoadUrlsOptions) => AsyncIterable<SitemapUrlRecord>;
|
|
232
|
+
/** Stream all delta entries within `[from, to]` (YYYY-MM-DD inclusive). */
|
|
233
|
+
loadDeltas: (ctx: TenantCtx, dateRange?: DateRange) => AsyncIterable<DeltaEntry>;
|
|
234
|
+
/**
|
|
235
|
+
* Fold every accumulated delta into the prior index; writes a fresh
|
|
236
|
+
* `urls/index.parquet` and deletes the consumed delta files.
|
|
237
|
+
*/
|
|
238
|
+
compactUrls: (ctx: TenantCtx) => Promise<void>;
|
|
239
|
+
/** DuckDB-resolvable URI for the URLs index; `undefined` if backend lacks one. */
|
|
240
|
+
urlsParquetUri: (ctx: TenantCtx) => string | undefined;
|
|
127
241
|
}
|
|
128
242
|
interface CreateSitemapStoreOptions {
|
|
129
243
|
dataSource: DataSource;
|
|
@@ -175,4 +289,4 @@ interface CreateEmptyTypesStoreOptions {
|
|
|
175
289
|
now?: () => number;
|
|
176
290
|
}
|
|
177
291
|
declare function createEmptyTypesStore(opts: CreateEmptyTypesStoreOptions): EmptyTypesStore;
|
|
178
|
-
export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, EmptyTypesDoc, EmptyTypesStore, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionIndex, InspectionRecord, InspectionStore, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, indexingMetadataIndexKey, inspectionHistoryKey, inspectionIndexKey, sitemapHistoryKey, sitemapIndexKey };
|
|
292
|
+
export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionIndex, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey };
|
package/dist/entities.mjs
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { decodeParquetToRows, encodeRowsToParquetFlex } from "./adapters/hyparquet.mjs";
|
|
1
2
|
const YEAR_MONTH_RE = /^(\d{4})-(\d{2})-/;
|
|
2
3
|
function inspectionIndexKey(ctx) {
|
|
3
4
|
return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/inspections/index.json` : `u_${ctx.userId}/entities/inspections/index.json`;
|
|
@@ -5,6 +6,9 @@ function inspectionIndexKey(ctx) {
|
|
|
5
6
|
function emptyTypesKey(ctx) {
|
|
6
7
|
return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/empty-types.json` : `u_${ctx.userId}/entities/empty-types.json`;
|
|
7
8
|
}
|
|
9
|
+
function inspectionParquetKey(ctx) {
|
|
10
|
+
return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/inspections/index.parquet` : `u_${ctx.userId}/entities/inspections/index.parquet`;
|
|
11
|
+
}
|
|
8
12
|
function inspectionHistoryKey(ctx, yearMonth) {
|
|
9
13
|
return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/inspections/history/${yearMonth}.json` : `u_${ctx.userId}/entities/inspections/history/${yearMonth}.json`;
|
|
10
14
|
}
|
|
@@ -75,15 +79,251 @@ function createInspectionStore(opts) {
|
|
|
75
79
|
},
|
|
76
80
|
async loadHistory(ctx, yearMonth) {
|
|
77
81
|
return await readJson(inspectionHistoryKey(ctx, yearMonth));
|
|
82
|
+
},
|
|
83
|
+
async materialize(ctx) {
|
|
84
|
+
const index = await readJson(inspectionIndexKey(ctx)) ?? emptyIndex();
|
|
85
|
+
const rows = Object.entries(index.records).map(([urlHash, r]) => ({
|
|
86
|
+
urlHash,
|
|
87
|
+
url: r.url,
|
|
88
|
+
inspectedAt: r.inspectedAt,
|
|
89
|
+
indexStatus: r.indexStatus ?? null,
|
|
90
|
+
lastCrawlTime: r.lastCrawlTime ?? null,
|
|
91
|
+
googleCanonical: r.googleCanonical ?? null,
|
|
92
|
+
userCanonical: r.userCanonical ?? null,
|
|
93
|
+
coverageState: r.coverageState ?? null,
|
|
94
|
+
robotsTxtState: r.robotsTxtState ?? null,
|
|
95
|
+
indexingState: r.indexingState ?? null,
|
|
96
|
+
pageFetchState: r.pageFetchState ?? null,
|
|
97
|
+
mobileUsabilityVerdict: r.mobileUsabilityVerdict ?? null,
|
|
98
|
+
richResultsVerdict: r.richResultsVerdict ?? null,
|
|
99
|
+
scheduleNextAt: r.raw?.schedule?.nextAt ?? null,
|
|
100
|
+
scheduleConsecutiveUnchanged: r.raw?.schedule?.consecutiveUnchanged ?? null,
|
|
101
|
+
schedulePolicyVersion: r.raw?.schedule?.policyVersion ?? null
|
|
102
|
+
}));
|
|
103
|
+
const bytes = encodeRowsToParquetFlex(rows, {
|
|
104
|
+
columns: INSPECTION_PARQUET_COLUMNS,
|
|
105
|
+
sortKey: ["urlHash"]
|
|
106
|
+
});
|
|
107
|
+
const key = inspectionParquetKey(ctx);
|
|
108
|
+
await ds.write(key, bytes);
|
|
109
|
+
return {
|
|
110
|
+
key,
|
|
111
|
+
rowCount: rows.length,
|
|
112
|
+
bytes: bytes.byteLength
|
|
113
|
+
};
|
|
114
|
+
},
|
|
115
|
+
parquetUri(ctx) {
|
|
116
|
+
return ds.uri?.(inspectionParquetKey(ctx));
|
|
78
117
|
}
|
|
79
118
|
};
|
|
80
119
|
}
|
|
120
|
+
const INSPECTION_PARQUET_COLUMNS = [
|
|
121
|
+
{
|
|
122
|
+
name: "urlHash",
|
|
123
|
+
type: "VARCHAR",
|
|
124
|
+
nullable: false
|
|
125
|
+
},
|
|
126
|
+
{
|
|
127
|
+
name: "url",
|
|
128
|
+
type: "VARCHAR",
|
|
129
|
+
nullable: false
|
|
130
|
+
},
|
|
131
|
+
{
|
|
132
|
+
name: "inspectedAt",
|
|
133
|
+
type: "VARCHAR",
|
|
134
|
+
nullable: false
|
|
135
|
+
},
|
|
136
|
+
{
|
|
137
|
+
name: "indexStatus",
|
|
138
|
+
type: "VARCHAR",
|
|
139
|
+
nullable: true
|
|
140
|
+
},
|
|
141
|
+
{
|
|
142
|
+
name: "lastCrawlTime",
|
|
143
|
+
type: "VARCHAR",
|
|
144
|
+
nullable: true
|
|
145
|
+
},
|
|
146
|
+
{
|
|
147
|
+
name: "googleCanonical",
|
|
148
|
+
type: "VARCHAR",
|
|
149
|
+
nullable: true
|
|
150
|
+
},
|
|
151
|
+
{
|
|
152
|
+
name: "userCanonical",
|
|
153
|
+
type: "VARCHAR",
|
|
154
|
+
nullable: true
|
|
155
|
+
},
|
|
156
|
+
{
|
|
157
|
+
name: "coverageState",
|
|
158
|
+
type: "VARCHAR",
|
|
159
|
+
nullable: true
|
|
160
|
+
},
|
|
161
|
+
{
|
|
162
|
+
name: "robotsTxtState",
|
|
163
|
+
type: "VARCHAR",
|
|
164
|
+
nullable: true
|
|
165
|
+
},
|
|
166
|
+
{
|
|
167
|
+
name: "indexingState",
|
|
168
|
+
type: "VARCHAR",
|
|
169
|
+
nullable: true
|
|
170
|
+
},
|
|
171
|
+
{
|
|
172
|
+
name: "pageFetchState",
|
|
173
|
+
type: "VARCHAR",
|
|
174
|
+
nullable: true
|
|
175
|
+
},
|
|
176
|
+
{
|
|
177
|
+
name: "mobileUsabilityVerdict",
|
|
178
|
+
type: "VARCHAR",
|
|
179
|
+
nullable: true
|
|
180
|
+
},
|
|
181
|
+
{
|
|
182
|
+
name: "richResultsVerdict",
|
|
183
|
+
type: "VARCHAR",
|
|
184
|
+
nullable: true
|
|
185
|
+
},
|
|
186
|
+
{
|
|
187
|
+
name: "scheduleNextAt",
|
|
188
|
+
type: "BIGINT",
|
|
189
|
+
nullable: true
|
|
190
|
+
},
|
|
191
|
+
{
|
|
192
|
+
name: "scheduleConsecutiveUnchanged",
|
|
193
|
+
type: "INTEGER",
|
|
194
|
+
nullable: true
|
|
195
|
+
},
|
|
196
|
+
{
|
|
197
|
+
name: "schedulePolicyVersion",
|
|
198
|
+
type: "INTEGER",
|
|
199
|
+
nullable: true
|
|
200
|
+
}
|
|
201
|
+
];
|
|
81
202
|
function sitemapIndexKey(ctx) {
|
|
82
203
|
return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/sitemaps/index.json` : `u_${ctx.userId}/entities/sitemaps/index.json`;
|
|
83
204
|
}
|
|
84
205
|
function sitemapHistoryKey(ctx, feedpathHash, capturedAtMs) {
|
|
85
206
|
return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/sitemaps/history/${feedpathHash}__${capturedAtMs}.json` : `u_${ctx.userId}/entities/sitemaps/history/${feedpathHash}__${capturedAtMs}.json`;
|
|
86
207
|
}
|
|
208
|
+
function sitemapUrlsPrefix(ctx) {
|
|
209
|
+
return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/sitemaps/urls` : `u_${ctx.userId}/entities/sitemaps/urls`;
|
|
210
|
+
}
|
|
211
|
+
function sitemapUrlsIndexKey(ctx) {
|
|
212
|
+
return `${sitemapUrlsPrefix(ctx)}/index.parquet`;
|
|
213
|
+
}
|
|
214
|
+
function sitemapUrlsDeltaKey(ctx, feedpathHash, date) {
|
|
215
|
+
return `${sitemapUrlsPrefix(ctx)}/deltas/${date}__${feedpathHash}.parquet`;
|
|
216
|
+
}
|
|
217
|
+
const SITEMAP_URLS_DELTA_PREFIX_RE = /\/urls\/deltas\/(\d{4}-\d{2}-\d{2})__([0-9a-f]+)\.parquet$/;
|
|
218
|
+
const URLS_INDEX_COLUMNS = [
|
|
219
|
+
{
|
|
220
|
+
name: "feedpath",
|
|
221
|
+
type: "VARCHAR",
|
|
222
|
+
nullable: false
|
|
223
|
+
},
|
|
224
|
+
{
|
|
225
|
+
name: "feedpath_hash",
|
|
226
|
+
type: "VARCHAR",
|
|
227
|
+
nullable: false
|
|
228
|
+
},
|
|
229
|
+
{
|
|
230
|
+
name: "url_hash",
|
|
231
|
+
type: "VARCHAR",
|
|
232
|
+
nullable: false
|
|
233
|
+
},
|
|
234
|
+
{
|
|
235
|
+
name: "loc",
|
|
236
|
+
type: "VARCHAR",
|
|
237
|
+
nullable: false
|
|
238
|
+
},
|
|
239
|
+
{
|
|
240
|
+
name: "lastmod",
|
|
241
|
+
type: "VARCHAR",
|
|
242
|
+
nullable: true
|
|
243
|
+
},
|
|
244
|
+
{
|
|
245
|
+
name: "first_seen_at",
|
|
246
|
+
type: "BIGINT",
|
|
247
|
+
nullable: false
|
|
248
|
+
},
|
|
249
|
+
{
|
|
250
|
+
name: "last_seen_at",
|
|
251
|
+
type: "BIGINT",
|
|
252
|
+
nullable: false
|
|
253
|
+
},
|
|
254
|
+
{
|
|
255
|
+
name: "removed_at",
|
|
256
|
+
type: "BIGINT",
|
|
257
|
+
nullable: true
|
|
258
|
+
}
|
|
259
|
+
];
|
|
260
|
+
const URLS_DELTA_COLUMNS = [
|
|
261
|
+
{
|
|
262
|
+
name: "feedpath",
|
|
263
|
+
type: "VARCHAR",
|
|
264
|
+
nullable: false
|
|
265
|
+
},
|
|
266
|
+
{
|
|
267
|
+
name: "feedpath_hash",
|
|
268
|
+
type: "VARCHAR",
|
|
269
|
+
nullable: false
|
|
270
|
+
},
|
|
271
|
+
{
|
|
272
|
+
name: "url_hash",
|
|
273
|
+
type: "VARCHAR",
|
|
274
|
+
nullable: false
|
|
275
|
+
},
|
|
276
|
+
{
|
|
277
|
+
name: "op",
|
|
278
|
+
type: "VARCHAR",
|
|
279
|
+
nullable: false
|
|
280
|
+
},
|
|
281
|
+
{
|
|
282
|
+
name: "loc",
|
|
283
|
+
type: "VARCHAR",
|
|
284
|
+
nullable: false
|
|
285
|
+
},
|
|
286
|
+
{
|
|
287
|
+
name: "lastmod",
|
|
288
|
+
type: "VARCHAR",
|
|
289
|
+
nullable: true
|
|
290
|
+
},
|
|
291
|
+
{
|
|
292
|
+
name: "at",
|
|
293
|
+
type: "BIGINT",
|
|
294
|
+
nullable: false
|
|
295
|
+
}
|
|
296
|
+
];
|
|
297
|
+
function rowToUrlRecord(row) {
|
|
298
|
+
return {
|
|
299
|
+
feedpath: String(row.feedpath),
|
|
300
|
+
feedpathHash: String(row.feedpath_hash),
|
|
301
|
+
urlHash: String(row.url_hash),
|
|
302
|
+
loc: String(row.loc),
|
|
303
|
+
lastmod: row.lastmod == null ? void 0 : String(row.lastmod),
|
|
304
|
+
firstSeenAt: Number(row.first_seen_at),
|
|
305
|
+
lastSeenAt: Number(row.last_seen_at),
|
|
306
|
+
removedAt: row.removed_at == null ? void 0 : Number(row.removed_at)
|
|
307
|
+
};
|
|
308
|
+
}
|
|
309
|
+
function urlRecordToRow(r) {
|
|
310
|
+
return {
|
|
311
|
+
feedpath: r.feedpath,
|
|
312
|
+
feedpath_hash: r.feedpathHash,
|
|
313
|
+
url_hash: r.urlHash,
|
|
314
|
+
loc: r.loc,
|
|
315
|
+
lastmod: r.lastmod ?? null,
|
|
316
|
+
first_seen_at: r.firstSeenAt,
|
|
317
|
+
last_seen_at: r.lastSeenAt,
|
|
318
|
+
removed_at: r.removedAt ?? null
|
|
319
|
+
};
|
|
320
|
+
}
|
|
321
|
+
function isoDate(ms) {
|
|
322
|
+
return new Date(ms).toISOString().slice(0, 10);
|
|
323
|
+
}
|
|
324
|
+
function hashUrlList(urls) {
|
|
325
|
+
return hashUrl(urls.map((u) => u.loc).sort().join("\n"));
|
|
326
|
+
}
|
|
87
327
|
function createSitemapStore(opts) {
|
|
88
328
|
const ds = opts.dataSource;
|
|
89
329
|
const hash = opts.hash ?? hashUrl;
|
|
@@ -123,6 +363,218 @@ function createSitemapStore(opts) {
|
|
|
123
363
|
},
|
|
124
364
|
async getLatest(ctx, path) {
|
|
125
365
|
return (await readJson(sitemapIndexKey(ctx)))?.records[hash(path)];
|
|
366
|
+
},
|
|
367
|
+
async snapshotUrls(ctx, feedpath, urls) {
|
|
368
|
+
const fpHash = hash(feedpath);
|
|
369
|
+
const contentHash = hashUrlList(urls);
|
|
370
|
+
const at = now();
|
|
371
|
+
const priorByHash = /* @__PURE__ */ new Map();
|
|
372
|
+
for await (const rec of this.loadUrls(ctx, feedpath, { includeRemoved: true })) priorByHash.set(rec.urlHash, rec);
|
|
373
|
+
const livePrior = Array.from(priorByHash.values()).filter((r) => r.removedAt == null);
|
|
374
|
+
if (livePrior.length > 0) {
|
|
375
|
+
if (hashUrl(livePrior.map((r) => String(r.loc)).sort().join("\n")) === contentHash) return {
|
|
376
|
+
added: 0,
|
|
377
|
+
removed: 0,
|
|
378
|
+
kept: livePrior.length,
|
|
379
|
+
contentHash,
|
|
380
|
+
unchanged: true
|
|
381
|
+
};
|
|
382
|
+
}
|
|
383
|
+
const incomingByHash = /* @__PURE__ */ new Map();
|
|
384
|
+
for (const u of urls) incomingByHash.set(hash(u.loc), u);
|
|
385
|
+
const deltaRows = [];
|
|
386
|
+
let added = 0;
|
|
387
|
+
let removed = 0;
|
|
388
|
+
let kept = 0;
|
|
389
|
+
const date = isoDate(at);
|
|
390
|
+
for (const [urlHash, u] of incomingByHash) {
|
|
391
|
+
const prev = priorByHash.get(urlHash);
|
|
392
|
+
if (!prev || prev.removedAt != null) {
|
|
393
|
+
added++;
|
|
394
|
+
deltaRows.push({
|
|
395
|
+
feedpath,
|
|
396
|
+
feedpath_hash: fpHash,
|
|
397
|
+
url_hash: urlHash,
|
|
398
|
+
op: "added",
|
|
399
|
+
loc: u.loc,
|
|
400
|
+
lastmod: u.lastmod ?? null,
|
|
401
|
+
at
|
|
402
|
+
});
|
|
403
|
+
} else kept++;
|
|
404
|
+
}
|
|
405
|
+
for (const [urlHash, prev] of priorByHash) {
|
|
406
|
+
if (prev.removedAt != null) continue;
|
|
407
|
+
if (!incomingByHash.has(urlHash)) {
|
|
408
|
+
removed++;
|
|
409
|
+
deltaRows.push({
|
|
410
|
+
feedpath,
|
|
411
|
+
feedpath_hash: fpHash,
|
|
412
|
+
url_hash: urlHash,
|
|
413
|
+
op: "removed",
|
|
414
|
+
loc: prev.loc,
|
|
415
|
+
lastmod: prev.lastmod ?? null,
|
|
416
|
+
at
|
|
417
|
+
});
|
|
418
|
+
}
|
|
419
|
+
}
|
|
420
|
+
if (deltaRows.length > 0) {
|
|
421
|
+
const bytes = encodeRowsToParquetFlex(deltaRows, {
|
|
422
|
+
columns: URLS_DELTA_COLUMNS,
|
|
423
|
+
sortKey: ["url_hash"]
|
|
424
|
+
});
|
|
425
|
+
await ds.write(sitemapUrlsDeltaKey(ctx, fpHash, date), bytes);
|
|
426
|
+
}
|
|
427
|
+
return {
|
|
428
|
+
added,
|
|
429
|
+
removed,
|
|
430
|
+
kept,
|
|
431
|
+
contentHash,
|
|
432
|
+
unchanged: false
|
|
433
|
+
};
|
|
434
|
+
},
|
|
435
|
+
async *loadUrls(ctx, feedpath, opts) {
|
|
436
|
+
const fpHash = hash(feedpath);
|
|
437
|
+
const includeRemoved = opts?.includeRemoved ?? false;
|
|
438
|
+
const indexBytes = await ds.read(sitemapUrlsIndexKey(ctx)).catch(() => void 0);
|
|
439
|
+
const indexRows = indexBytes ? await decodeParquetToRows(indexBytes) : [];
|
|
440
|
+
const deltaKeys = (await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`)).sort();
|
|
441
|
+
const live = /* @__PURE__ */ new Map();
|
|
442
|
+
const removedMap = /* @__PURE__ */ new Map();
|
|
443
|
+
for (const row of indexRows) {
|
|
444
|
+
if (row.feedpath_hash !== fpHash) continue;
|
|
445
|
+
const rec = rowToUrlRecord(row);
|
|
446
|
+
if (rec.removedAt != null) removedMap.set(rec.urlHash, rec);
|
|
447
|
+
else live.set(rec.urlHash, rec);
|
|
448
|
+
}
|
|
449
|
+
for (const key of deltaKeys) {
|
|
450
|
+
const m = SITEMAP_URLS_DELTA_PREFIX_RE.exec(key);
|
|
451
|
+
if (!m || m[2] !== fpHash) continue;
|
|
452
|
+
const dBytes = await ds.read(key).catch(() => void 0);
|
|
453
|
+
if (!dBytes) continue;
|
|
454
|
+
const dRows = await decodeParquetToRows(dBytes);
|
|
455
|
+
for (const r of dRows) {
|
|
456
|
+
const op = String(r.op);
|
|
457
|
+
const urlHash = String(r.url_hash);
|
|
458
|
+
const at = Number(r.at);
|
|
459
|
+
if (op === "added") {
|
|
460
|
+
const prev = live.get(urlHash) ?? removedMap.get(urlHash);
|
|
461
|
+
removedMap.delete(urlHash);
|
|
462
|
+
live.set(urlHash, {
|
|
463
|
+
feedpath,
|
|
464
|
+
feedpathHash: fpHash,
|
|
465
|
+
urlHash,
|
|
466
|
+
loc: String(r.loc),
|
|
467
|
+
lastmod: r.lastmod == null ? void 0 : String(r.lastmod),
|
|
468
|
+
firstSeenAt: prev?.firstSeenAt ?? at,
|
|
469
|
+
lastSeenAt: at
|
|
470
|
+
});
|
|
471
|
+
} else if (op === "removed") {
|
|
472
|
+
const prev = live.get(urlHash);
|
|
473
|
+
live.delete(urlHash);
|
|
474
|
+
if (prev) removedMap.set(urlHash, {
|
|
475
|
+
...prev,
|
|
476
|
+
removedAt: at
|
|
477
|
+
});
|
|
478
|
+
}
|
|
479
|
+
}
|
|
480
|
+
}
|
|
481
|
+
for (const rec of live.values()) yield rec;
|
|
482
|
+
if (includeRemoved) for (const rec of removedMap.values()) yield rec;
|
|
483
|
+
},
|
|
484
|
+
async *loadDeltas(ctx, dateRange) {
|
|
485
|
+
const from = dateRange?.from;
|
|
486
|
+
const to = dateRange?.to;
|
|
487
|
+
const keys = (await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`)).sort();
|
|
488
|
+
for (const key of keys) {
|
|
489
|
+
const m = SITEMAP_URLS_DELTA_PREFIX_RE.exec(key);
|
|
490
|
+
if (!m) continue;
|
|
491
|
+
const date = m[1];
|
|
492
|
+
if (from && date < from) continue;
|
|
493
|
+
if (to && date > to) continue;
|
|
494
|
+
const bytes = await ds.read(key).catch(() => void 0);
|
|
495
|
+
if (!bytes) continue;
|
|
496
|
+
const rows = await decodeParquetToRows(bytes);
|
|
497
|
+
for (const r of rows) {
|
|
498
|
+
const op = String(r.op);
|
|
499
|
+
if (op !== "added" && op !== "removed") continue;
|
|
500
|
+
yield {
|
|
501
|
+
feedpath: String(r.feedpath),
|
|
502
|
+
feedpathHash: String(r.feedpath_hash),
|
|
503
|
+
urlHash: String(r.url_hash),
|
|
504
|
+
op,
|
|
505
|
+
loc: String(r.loc),
|
|
506
|
+
lastmod: r.lastmod == null ? void 0 : String(r.lastmod),
|
|
507
|
+
at: Number(r.at)
|
|
508
|
+
};
|
|
509
|
+
}
|
|
510
|
+
}
|
|
511
|
+
},
|
|
512
|
+
async compactUrls(ctx) {
|
|
513
|
+
const indexKey = sitemapUrlsIndexKey(ctx);
|
|
514
|
+
const indexBytes = await ds.read(indexKey).catch(() => void 0);
|
|
515
|
+
const indexRows = indexBytes ? await decodeParquetToRows(indexBytes) : [];
|
|
516
|
+
const stateKey = (fp, u) => `${fp}::${u}`;
|
|
517
|
+
const live = /* @__PURE__ */ new Map();
|
|
518
|
+
const removed = /* @__PURE__ */ new Map();
|
|
519
|
+
for (const row of indexRows) {
|
|
520
|
+
const rec = rowToUrlRecord(row);
|
|
521
|
+
const k = stateKey(rec.feedpathHash, rec.urlHash);
|
|
522
|
+
if (rec.removedAt != null) removed.set(k, rec);
|
|
523
|
+
else live.set(k, rec);
|
|
524
|
+
}
|
|
525
|
+
const deltaKeys = (await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`)).sort();
|
|
526
|
+
const consumed = [];
|
|
527
|
+
for (const key of deltaKeys) {
|
|
528
|
+
const m = SITEMAP_URLS_DELTA_PREFIX_RE.exec(key);
|
|
529
|
+
if (!m) continue;
|
|
530
|
+
const fpHash = m[2];
|
|
531
|
+
const bytes = await ds.read(key).catch(() => void 0);
|
|
532
|
+
if (!bytes) continue;
|
|
533
|
+
consumed.push(key);
|
|
534
|
+
const rows = await decodeParquetToRows(bytes);
|
|
535
|
+
for (const r of rows) {
|
|
536
|
+
const urlHash = String(r.url_hash);
|
|
537
|
+
const at = Number(r.at);
|
|
538
|
+
const k = stateKey(fpHash, urlHash);
|
|
539
|
+
const op = String(r.op);
|
|
540
|
+
if (op === "added") {
|
|
541
|
+
const prev = live.get(k) ?? removed.get(k);
|
|
542
|
+
removed.delete(k);
|
|
543
|
+
live.set(k, {
|
|
544
|
+
feedpath: String(r.feedpath),
|
|
545
|
+
feedpathHash: fpHash,
|
|
546
|
+
urlHash,
|
|
547
|
+
loc: String(r.loc),
|
|
548
|
+
lastmod: r.lastmod == null ? void 0 : String(r.lastmod),
|
|
549
|
+
firstSeenAt: prev?.firstSeenAt ?? at,
|
|
550
|
+
lastSeenAt: at
|
|
551
|
+
});
|
|
552
|
+
} else if (op === "removed") {
|
|
553
|
+
const prev = live.get(k);
|
|
554
|
+
live.delete(k);
|
|
555
|
+
if (prev) removed.set(k, {
|
|
556
|
+
...prev,
|
|
557
|
+
removedAt: at
|
|
558
|
+
});
|
|
559
|
+
}
|
|
560
|
+
}
|
|
561
|
+
}
|
|
562
|
+
const merged = [...live.values(), ...removed.values()];
|
|
563
|
+
merged.sort((a, b) => {
|
|
564
|
+
if (a.feedpathHash !== b.feedpathHash) return a.feedpathHash < b.feedpathHash ? -1 : 1;
|
|
565
|
+
if (a.urlHash !== b.urlHash) return a.urlHash < b.urlHash ? -1 : 1;
|
|
566
|
+
return 0;
|
|
567
|
+
});
|
|
568
|
+
const bytes = encodeRowsToParquetFlex(merged.map(urlRecordToRow), {
|
|
569
|
+
columns: URLS_INDEX_COLUMNS,
|
|
570
|
+
sortKey: ["feedpath_hash", "url_hash"]
|
|
571
|
+
});
|
|
572
|
+
await ds.write(indexKey, bytes);
|
|
573
|
+
if (consumed.length > 0) await ds.delete(consumed);
|
|
574
|
+
},
|
|
575
|
+
urlsParquetUri(ctx) {
|
|
576
|
+
const key = sitemapUrlsIndexKey(ctx);
|
|
577
|
+
return ds.uri ? ds.uri(key) : void 0;
|
|
126
578
|
}
|
|
127
579
|
};
|
|
128
580
|
}
|
|
@@ -206,4 +658,4 @@ function createEmptyTypesStore(opts) {
|
|
|
206
658
|
}
|
|
207
659
|
};
|
|
208
660
|
}
|
|
209
|
-
export { createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, indexingMetadataIndexKey, inspectionHistoryKey, inspectionIndexKey, sitemapHistoryKey, sitemapIndexKey };
|
|
661
|
+
export { createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey };
|
package/dist/index.d.mts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { A as SyncStateDetail, B as WriteResult, C as QueryExecutor, D as SearchType, E as RunSQLOptions, F as TenantCtx, G as CompactionThresholds, H as inferLegacyTier, I as Watermark, K as enumeratePartitions, L as WatermarkFilter, M as SyncStateKind, N as SyncStateScope, O as StorageEngine, P as TableName, R as WatermarkScope, S as QueryExecuteResult, T as Row, U as inferSearchType, V as dayPartition, W as objectKey, _ as PurgeFilter, a as DataSource, b as QueryCtx, c as FileSetRef, d as LockScope, f as ManifestEntry, g as ParquetCodec, h as OptimizedQueryResult, i as DEFAULT_SEARCH_TYPE, j as SyncStateFilter, k as SyncState, l as GcCtx, m as ManifestStore, n as CompactionTier, o as EngineOptions, p as ManifestPurgeResult, r as ComparisonResult, s as ExtraResult, t as CodecCtx, u as ListLiveFilter, v as PurgeResult, w as QueryResult, x as QueryExecuteOptions, y as PurgeUrlsResult, z as WriteCtx } from "./_chunks/storage.mjs";
|
|
2
2
|
import { a as createDuckDBExecutor, i as createDuckDBCodec, n as DuckDBHandle, r as canonicalEmptyParquetSchema, t as DuckDBFactory } from "./_chunks/duckdb.mjs";
|
|
3
3
|
import { _ as pages, a as allTables, c as inferTable, d as TABLE_METADATA, f as countries, g as page_keywords, h as keywords, i as TableSchema, m as drizzleSchema, n as ColumnType, o as currentSchemaVersion, p as devices, r as SCHEMAS, s as dimensionToColumn, t as ColumnDef, u as DrizzleSchema } from "./_chunks/schema.mjs";
|
|
4
|
+
import { InspectionVerdict, SchedulePolicy, ScheduleState, fixedPolicy, inspectionPolicy, sitemapPolicy } from "./schedule.mjs";
|
|
4
5
|
import { GscApiRow, IngestOptions, RowAccumulator, RowAccumulatorOptions, createRowAccumulator, toPath, toSumPosition, transformGscRow } from "./ingest.mjs";
|
|
5
6
|
import { a as substituteNamedFiles, i as resolveToSQL, n as ResolvedQuery, t as FILES_PLACEHOLDER } from "./_chunks/planner.mjs";
|
|
6
7
|
import { bindLiterals, formatLiteral } from "./sql-bind.mjs";
|
|
@@ -9,4 +10,4 @@ declare function coerceRow(row: Row$1): Row$1;
|
|
|
9
10
|
declare function coerceRows(rows: readonly Row$1[]): Row$1[];
|
|
10
11
|
declare const MAX_DAY_BYTES: number;
|
|
11
12
|
declare function createStorageEngine(opts: EngineOptions): StorageEngine;
|
|
12
|
-
export { type CodecCtx, type ColumnDef, type ColumnType, type CompactionThresholds, type CompactionTier, type ComparisonResult, DEFAULT_SEARCH_TYPE, type DataSource, type DrizzleSchema, type DuckDBFactory, type DuckDBHandle, type EngineOptions, type ExtraResult, FILES_PLACEHOLDER, type FileSetRef, type GcCtx, type GscApiRow, type IngestOptions, type ListLiveFilter, type LockScope, MAX_DAY_BYTES, type ManifestEntry, type ManifestPurgeResult, type ManifestStore, type OptimizedQueryResult, type ParquetCodec, type PurgeFilter, type PurgeResult, type PurgeUrlsResult, type QueryCtx, type QueryExecuteOptions, type QueryExecuteResult, type QueryExecutor, type QueryResult, type ResolvedQuery, type Row, type RowAccumulator, type RowAccumulatorOptions, type RunSQLOptions, SCHEMAS, type SearchType, type StorageEngine, type SyncState, type SyncStateDetail, type SyncStateFilter, type SyncStateKind, type SyncStateScope, TABLE_METADATA, type TableName, type TableSchema, type TenantCtx, type Watermark, type WatermarkFilter, type WatermarkScope, type WriteCtx, type WriteResult, allTables, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, countries, createDuckDBCodec, createDuckDBExecutor, createRowAccumulator, createStorageEngine, currentSchemaVersion, dayPartition, devices, dimensionToColumn, drizzleSchema, enumeratePartitions, formatLiteral, inferLegacyTier, inferSearchType, inferTable, keywords, objectKey, page_keywords, pages, resolveToSQL, substituteNamedFiles, toPath, toSumPosition, transformGscRow };
|
|
13
|
+
export { type CodecCtx, type ColumnDef, type ColumnType, type CompactionThresholds, type CompactionTier, type ComparisonResult, DEFAULT_SEARCH_TYPE, type DataSource, type DrizzleSchema, type DuckDBFactory, type DuckDBHandle, type EngineOptions, type ExtraResult, FILES_PLACEHOLDER, type FileSetRef, type GcCtx, type GscApiRow, type IngestOptions, type InspectionVerdict, type ListLiveFilter, type LockScope, MAX_DAY_BYTES, type ManifestEntry, type ManifestPurgeResult, type ManifestStore, type OptimizedQueryResult, type ParquetCodec, type PurgeFilter, type PurgeResult, type PurgeUrlsResult, type QueryCtx, type QueryExecuteOptions, type QueryExecuteResult, type QueryExecutor, type QueryResult, type ResolvedQuery, type Row, type RowAccumulator, type RowAccumulatorOptions, type RunSQLOptions, SCHEMAS, type SchedulePolicy, type ScheduleState, type SearchType, type StorageEngine, type SyncState, type SyncStateDetail, type SyncStateFilter, type SyncStateKind, type SyncStateScope, TABLE_METADATA, type TableName, type TableSchema, type TenantCtx, type Watermark, type WatermarkFilter, type WatermarkScope, type WriteCtx, type WriteResult, allTables, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, countries, createDuckDBCodec, createDuckDBExecutor, createRowAccumulator, createStorageEngine, currentSchemaVersion, dayPartition, devices, dimensionToColumn, drizzleSchema, enumeratePartitions, fixedPolicy, formatLiteral, inferLegacyTier, inferSearchType, inferTable, inspectionPolicy, keywords, objectKey, page_keywords, pages, resolveToSQL, sitemapPolicy, substituteNamedFiles, toPath, toSumPosition, transformGscRow };
|
package/dist/index.mjs
CHANGED
|
@@ -5,6 +5,7 @@ import { bindLiterals, formatLiteral } from "./sql-bind.mjs";
|
|
|
5
5
|
import { a as createDuckDBExecutor, i as createDuckDBCodec, n as createStorageEngine, r as canonicalEmptyParquetSchema, t as MAX_DAY_BYTES } from "./_chunks/engine.mjs";
|
|
6
6
|
import { createRowAccumulator, toPath, toSumPosition, transformGscRow } from "./ingest.mjs";
|
|
7
7
|
import "./planner.mjs";
|
|
8
|
+
import { fixedPolicy, inspectionPolicy, sitemapPolicy } from "./schedule.mjs";
|
|
8
9
|
function coerceRow(row) {
|
|
9
10
|
let mutated = null;
|
|
10
11
|
for (const [k, v] of Object.entries(row)) if (typeof v === "bigint") {
|
|
@@ -18,4 +19,4 @@ function coerceRows(rows) {
|
|
|
18
19
|
for (let i = 0; i < rows.length; i++) out[i] = coerceRow(rows[i]);
|
|
19
20
|
return out;
|
|
20
21
|
}
|
|
21
|
-
export { DEFAULT_SEARCH_TYPE, FILES_PLACEHOLDER, MAX_DAY_BYTES, SCHEMAS, TABLE_METADATA, allTables, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, countries, createDuckDBCodec, createDuckDBExecutor, createRowAccumulator, createStorageEngine, currentSchemaVersion, dayPartition, devices, dimensionToColumn, drizzleSchema, enumeratePartitions, formatLiteral, inferLegacyTier, inferSearchType, inferTable, keywords, objectKey, page_keywords, pages, resolveToSQL, substituteNamedFiles, toPath, toSumPosition, transformGscRow };
|
|
22
|
+
export { DEFAULT_SEARCH_TYPE, FILES_PLACEHOLDER, MAX_DAY_BYTES, SCHEMAS, TABLE_METADATA, allTables, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, countries, createDuckDBCodec, createDuckDBExecutor, createRowAccumulator, createStorageEngine, currentSchemaVersion, dayPartition, devices, dimensionToColumn, drizzleSchema, enumeratePartitions, fixedPolicy, formatLiteral, inferLegacyTier, inferSearchType, inferTable, inspectionPolicy, keywords, objectKey, page_keywords, pages, resolveToSQL, sitemapPolicy, substituteNamedFiles, toPath, toSumPosition, transformGscRow };
|
package/dist/rollups.d.mts
CHANGED
|
@@ -159,5 +159,38 @@ declare const topKeywords28dParquetRollup: RollupDef;
|
|
|
159
159
|
* so downstream readers don't have to special-case first-run sites.
|
|
160
160
|
*/
|
|
161
161
|
declare const indexingMetadataRollup: RollupDef;
|
|
162
|
+
/**
|
|
163
|
+
* Indexing-API health by day: per `inspectedAt` date, counts of indexed,
|
|
164
|
+
* soft-404, redirect, not-found, mobile passes, rich-results passes, and
|
|
165
|
+
* canonical mismatches. Sourced from the inspections parquet sidecar
|
|
166
|
+
* (`InspectionStore.parquetUri`), which holds the latest record per URL.
|
|
167
|
+
*
|
|
168
|
+
* Empty-payload no-op when the sidecar URI is unavailable (in-memory
|
|
169
|
+
* `DataSource`, or before `materialize` has run).
|
|
170
|
+
*/
|
|
171
|
+
declare const indexingHealthRollup: RollupDef;
|
|
172
|
+
/**
|
|
173
|
+
* Per-day index-percent: ratio of (sitemap URLs that received GSC clicks on
|
|
174
|
+
* that date) / (total live sitemap URLs). Uses a DuckDB JOIN between the
|
|
175
|
+
* sitemap urls parquet (`SitemapStore.urlsParquetUri`) and the `pages` fact
|
|
176
|
+
* parquet. Total denominator is the count of live URLs in the urls index;
|
|
177
|
+
* numerator is per-day distinct loc count where pages.clicks > 0.
|
|
178
|
+
*/
|
|
179
|
+
declare const indexPercentRollup: RollupDef;
|
|
180
|
+
/**
|
|
181
|
+
* Sitemap-health per-day series materialized from the sitemap-store JSON
|
|
182
|
+
* index. Each `SitemapRecord` carries `urlCount`, `errors`, `warnings`,
|
|
183
|
+
* `contentHash`, and `lastDownloaded`. We bucket records by the day of their
|
|
184
|
+
* `capturedAt` (or `lastDownloaded` fallback) and emit per-day aggregates plus
|
|
185
|
+
* a snapshot of per-feed stats at the most recent capture.
|
|
186
|
+
*/
|
|
187
|
+
declare const sitemapHealthRollup: RollupDef;
|
|
188
|
+
/**
|
|
189
|
+
* Trailing-28-day sitemap URL changes: per-day per-feedpath {added, removed}
|
|
190
|
+
* counts plus rolling top-200 added and removed URLs. Streams from
|
|
191
|
+
* `SitemapStore.loadDeltas()` so it scales independently of how many feeds
|
|
192
|
+
* exist on the site.
|
|
193
|
+
*/
|
|
194
|
+
declare const sitemapChanges28dRollup: RollupDef;
|
|
162
195
|
declare const DEFAULT_ROLLUPS: readonly RollupDef[];
|
|
163
|
-
export { DEFAULT_ROLLUPS, ParquetRollupPointer, RebuildRollupResult, RebuildRollupsOptions, RollupCtx, RollupDef, RollupEngine, RollupEnvelope, dailyTotalsRollup, indexingMetadataRollup, rebuildRollups, rollupKey, rollupParquetKey, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
|
|
196
|
+
export { DEFAULT_ROLLUPS, ParquetRollupPointer, RebuildRollupResult, RebuildRollupsOptions, RollupCtx, RollupDef, RollupEngine, RollupEnvelope, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, rebuildRollups, rollupKey, rollupParquetKey, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
|
package/dist/rollups.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { createIndexingMetadataStore } from "./entities.mjs";
|
|
2
1
|
import { encodeRowsToParquetFlex } from "./adapters/hyparquet.mjs";
|
|
2
|
+
import { createIndexingMetadataStore, createSitemapStore, inspectionParquetKey, sitemapUrlsIndexKey } from "./entities.mjs";
|
|
3
3
|
import { MS_PER_DAY } from "gscdump";
|
|
4
4
|
function rollupPrefix(ctx) {
|
|
5
5
|
return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/rollups` : `u_${ctx.userId}/rollups`;
|
|
@@ -335,12 +335,220 @@ const indexingMetadataRollup = {
|
|
|
335
335
|
};
|
|
336
336
|
}
|
|
337
337
|
};
|
|
338
|
+
/**
 * Rollup `indexing_health`: per-day counts derived from the inspections
 * parquet sidecar located at `inspectionParquetKey(ctx)`.
 *
 * Rows are bucketed by the first 10 characters of `inspectedAt` (the
 * `YYYY-MM-DD` prefix) and limited to the trailing 90 days relative to
 * `builtAt`. When the data source has no `head` method, or `head` reports
 * nothing for the sidecar key, the payload is `{ days: [] }` and no SQL runs.
 */
const indexingHealthRollup = {
  id: "indexing_health",
  windowDays: 90,
  async build({ engine, ctx, dataSource, builtAt }) {
    const parquetKey = inspectionParquetKey(ctx);
    const sidecar = await dataSource.head?.(parquetKey);
    if (!sidecar) return { days: [] };

    // Numeric columns produced by the query, in the order they are emitted.
    const countColumns = [
      "total_urls",
      "indexed_count",
      "soft_404",
      "redirect",
      "not_found",
      "mobile_passes",
      "rich_results_passes",
      "canonical_mismatches"
    ];

    const query = `
SELECT
substr(inspectedAt, 1, 10) AS date,
COUNT(*)::BIGINT AS total_urls,
SUM(CASE WHEN indexStatus = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS indexed_count,
SUM(CASE WHEN pageFetchState = 'SOFT_404' THEN 1 ELSE 0 END)::BIGINT AS soft_404,
SUM(CASE WHEN pageFetchState = 'REDIRECT_ERROR' THEN 1 ELSE 0 END)::BIGINT AS redirect,
SUM(CASE WHEN pageFetchState = 'NOT_FOUND' THEN 1 ELSE 0 END)::BIGINT AS not_found,
SUM(CASE WHEN mobileUsabilityVerdict = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS mobile_passes,
SUM(CASE WHEN richResultsVerdict = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS rich_results_passes,
SUM(CASE WHEN userCanonical IS NOT NULL AND googleCanonical IS NOT NULL AND userCanonical <> googleCanonical THEN 1 ELSE 0 END)::BIGINT AS canonical_mismatches
FROM read_parquet({{INSPECTIONS}}, union_by_name = true)
WHERE substr(inspectedAt, 1, 10) >= '${utcDateMinusDays(builtAt, 90)}'
GROUP BY 1
ORDER BY 1
`;

    // `table: "pages"` is only the schema sentinel; the explicit `keys`
    // entry points runSQL at the sidecar instead of the manifest.
    const result = await engine.runSQL({
      ctx,
      table: "pages",
      fileSets: {
        INSPECTIONS: {
          table: "pages",
          keys: [parquetKey]
        }
      },
      sql: query
    });

    // BIGINT columns are normalised to plain numbers for the payload.
    const days = result.rows.map((row) => {
      const day = { date: String(row.date) };
      for (const column of countColumns) day[column] = Number(row[column]);
      return day;
    });
    return { days };
  }
};
|
|
381
|
+
/**
 * Rollup `index_percent`: for each day in the trailing 90 days, the share of
 * live sitemap URLs that received at least one click.
 *
 * - Numerator: distinct `pages.url` values with `clicks > 0` on that date that
 *   match a live (`removed_at IS NULL`) row of the sitemap urls parquet.
 * - Denominator: distinct live `loc` values in the sitemap urls parquet.
 *
 * Returns `{ totalSitemapUrls: 0, days: [] }` without running SQL when the
 * data source has no `head` method or the urls parquet is absent.
 */
const indexPercentRollup = {
  id: "index_percent",
  windowDays: 90,
  async build({ engine, ctx, dataSource, builtAt }) {
    const urlsKey = sitemapUrlsIndexKey(ctx);
    if (!await dataSource.head?.(urlsKey)) return {
      totalSitemapUrls: 0,
      days: []
    };
    const cutoff = utcDateMinusDays(builtAt, 90);
    const numerator = await engine.runSQL({
      ctx,
      table: "pages",
      fileSets: {
        PAGES: { table: "pages" },
        // `table` is only the schema sentinel; `keys` bypasses the manifest.
        URLS: {
          table: "pages",
          keys: [urlsKey]
        }
      },
      sql: `
SELECT
p.date AS date,
COUNT(DISTINCT p.url)::BIGINT AS clicked_urls
FROM read_parquet({{PAGES}}, union_by_name = true) p
INNER JOIN read_parquet({{URLS}}, union_by_name = true) s
ON s.loc = p.url AND s.removed_at IS NULL
WHERE p.clicks > 0 AND p.date >= '${cutoff}'
GROUP BY p.date
ORDER BY p.date
`
    });
    // Count DISTINCT loc so the denominator measures the same thing as the
    // numerator (distinct URLs). NOTE(review): assumes the urls index may hold
    // the same loc more than once (e.g. one row per feed) — if loc is already
    // unique this is equivalent to the previous COUNT(*).
    const denom = await engine.runSQL({
      ctx,
      table: "pages",
      fileSets: { URLS: {
        table: "pages",
        keys: [urlsKey]
      } },
      sql: `
SELECT COUNT(DISTINCT loc)::BIGINT AS total
FROM read_parquet({{URLS}}, union_by_name = true)
WHERE removed_at IS NULL
`
    });
    const total = Number(denom.rows[0]?.total ?? 0);
    return {
      totalSitemapUrls: total,
      days: numerator.rows.map((r) => {
        const clicked = Number(r.clicked_urls);
        return {
          date: String(r.date),
          clicked_urls: clicked,
          total_sitemap_urls: total,
          // Guard the empty-sitemap case instead of emitting NaN/Infinity.
          ratio: total === 0 ? 0 : clicked / total
        };
      })
    };
  }
};
|
|
441
|
+
/**
 * Rollup `sitemap_health`: aggregates the sitemap-store index into per-day
 * totals plus a flat list of per-feed stats.
 *
 * Each record is assigned to the day given by the first 10 characters of
 * `capturedAt`, falling back to `lastDownloaded`. Records with no usable day,
 * or a day older than 90 days before `builtAt`, are ignored.
 */
const sitemapHealthRollup = {
  id: "sitemap_health",
  windowDays: 90,
  async build({ dataSource, ctx, builtAt }) {
    const sitemapStore = createSitemapStore({ dataSource });
    const sitemapIndex = await sitemapStore.loadIndex(ctx);
    const allRecords = Object.values(sitemapIndex.records);
    const oldestDay = utcDateMinusDays(builtAt, 90);

    const totalsByDay = /* @__PURE__ */ new Map();
    const feeds = [];

    allRecords.forEach((record) => {
      const stamp = record.capturedAt ?? record.lastDownloaded ?? "";
      const day = stamp.slice(0, 10);
      if (!day || day < oldestDay) return;

      const errors = Number(record.errors ?? 0);
      const warnings = Number(record.warnings ?? 0);
      const urlCount = Number(record.urlCount ?? 0);

      // Create the day's bucket on first sight, then accumulate into it.
      let totals = totalsByDay.get(day);
      if (totals == null) {
        totals = {
          day,
          feeds: 0,
          total_urls: 0,
          errors: 0,
          warnings: 0
        };
        totalsByDay.set(day, totals);
      }
      totals.feeds += 1;
      totals.total_urls += urlCount;
      totals.errors += errors;
      totals.warnings += warnings;

      feeds.push({
        path: record.path,
        urlCount,
        errors,
        warnings,
        contentHash: record.contentHash ?? null,
        lastDownloaded: record.lastDownloaded ?? null,
        capturedAt: record.capturedAt
      });
    });

    // Day keys are unique in the map, so the comparator never sees a tie.
    const days = [...totalsByDay.values()];
    days.sort((left, right) => left.day < right.day ? -1 : 1);
    return { days, feeds };
  }
};
|
|
484
|
+
/**
 * Rollup `sitemap_changes_28d`: per-day, per-feedpath `{added, removed}`
 * counts over the trailing 28 days, plus the 200 most recent added and
 * removed URLs.
 *
 * Deltas are consumed from `SitemapStore.loadDeltas()` as an async iterator.
 * The "top" lists are pruned while streaming so memory stays bounded by the
 * limit rather than by the total number of deltas in the window; because
 * `Array.prototype.sort` is stable, the final lists are identical to sorting
 * the full set once and slicing.
 */
const sitemapChanges28dRollup = {
  id: "sitemap_changes_28d",
  windowDays: 28,
  async build({ dataSource, ctx, builtAt }) {
    const TOP_LIMIT = 200;
    const store = createSitemapStore({ dataSource });
    const from = utcDateMinusDays(builtAt, 28);
    const to = utcDateMinusDays(builtAt, 0);
    const counts = /* @__PURE__ */ new Map();
    const addedTop = [];
    const removedTop = [];

    // Composite map key; NUL cannot collide with the day/feedpath text.
    function key(k) {
      return `${k.day}\x00${k.feedpath}`;
    }
    // Most recent first.
    function newestFirst(a, b) {
      return b.at - a.at;
    }
    // Append, and once the buffer reaches twice the limit, keep only the
    // newest TOP_LIMIT entries. An entry that belongs in the final top list is
    // always within the top TOP_LIMIT of any prefix, so nothing needed is lost.
    function pushBounded(list, entry) {
      list.push(entry);
      if (list.length >= TOP_LIMIT * 2) {
        list.sort(newestFirst);
        list.length = TOP_LIMIT;
      }
    }

    for await (const d of store.loadDeltas(ctx, {
      from,
      to
    })) {
      const day = new Date(d.at).toISOString().slice(0, 10);
      const k = key({
        day,
        feedpath: d.feedpath
      });
      const cur = counts.get(k) ?? {
        day,
        feedpath: d.feedpath,
        added: 0,
        removed: 0
      };
      const entry = {
        loc: d.loc,
        feedpath: d.feedpath,
        at: d.at
      };
      if (d.op === "added") {
        cur.added += 1;
        pushBounded(addedTop, entry);
      } else {
        // Any op other than "added" is counted as a removal, as before.
        cur.removed += 1;
        pushBounded(removedTop, entry);
      }
      counts.set(k, cur);
    }

    // (day, feedpath) pairs are unique map keys, so no comparator ties occur.
    const days = Array.from(counts.values()).sort((a, b) => {
      if (a.day !== b.day) return a.day < b.day ? -1 : 1;
      return a.feedpath < b.feedpath ? -1 : 1;
    });
    addedTop.sort(newestFirst);
    removedTop.sort(newestFirst);
    return {
      days,
      topAdded: addedTop.slice(0, TOP_LIMIT),
      topRemoved: removedTop.slice(0, TOP_LIMIT)
    };
  }
};
|
|
338
542
|
const DEFAULT_ROLLUPS = [
|
|
339
543
|
dailyTotalsRollup,
|
|
340
544
|
weeklyTotalsRollup,
|
|
341
545
|
topPages28dRollup,
|
|
342
546
|
topKeywords28dRollup,
|
|
343
547
|
topCountries28dRollup,
|
|
344
|
-
indexingMetadataRollup
|
|
548
|
+
indexingMetadataRollup,
|
|
549
|
+
indexingHealthRollup,
|
|
550
|
+
indexPercentRollup,
|
|
551
|
+
sitemapHealthRollup,
|
|
552
|
+
sitemapChanges28dRollup
|
|
345
553
|
];
|
|
346
|
-
export { DEFAULT_ROLLUPS, dailyTotalsRollup, indexingMetadataRollup, rebuildRollups, rollupKey, rollupParquetKey, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
|
|
554
|
+
export { DEFAULT_ROLLUPS, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, rebuildRollups, rollupKey, rollupParquetKey, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/** Scheduling cursor a policy produces for one tracked item. */
interface ScheduleState {
  /** Time at or after which the item is due; compared against `now` by `isDue`. */
  nextAt: number;
  /** Number of consecutive observations that reported no change. */
  consecutiveUnchanged: number;
  /** `version` of the policy that produced this state. */
  policyVersion: number;
}
/** Outcome of a single check, passed to `SchedulePolicy.observe`. */
interface ScheduleObservation {
  /** Whether the check detected a change. */
  changed: boolean;
  /** Time of the check; the next due time is computed relative to it. */
  at: number;
}
/** Strategy that decides when an item should next be checked. */
interface SchedulePolicy {
  /** Policy revision; a state carrying a different `policyVersion` is reset by `observe`. */
  readonly version: number;
  /** Builds the starting state for an item first seen at `now`. */
  initial: (now: number) => ScheduleState;
  /** Folds one observation into the previous state and returns the next state. */
  observe: (prev: ScheduleState, evt: ScheduleObservation) => ScheduleState;
  /** True when `now` has reached `state.nextAt`. */
  isDue: (state: ScheduleState, now: number) => boolean;
}
/** Back-off policy for sitemaps: the interval lengthens as unchanged checks accumulate. */
declare const sitemapPolicy: SchedulePolicy;
/** Verdict that selects the re-inspection interval used by `inspectionPolicy`. */
type InspectionVerdict = 'PASS' | 'FAIL' | 'NEUTRAL';
/** Builds a policy whose interval is fixed by the given inspection verdict. */
declare function inspectionPolicy(verdict: InspectionVerdict): SchedulePolicy;
/** Builds a policy that always schedules the next check `intervalMs` after the observation. */
declare function fixedPolicy(intervalMs: number): SchedulePolicy;
export { InspectionVerdict, SchedulePolicy, ScheduleState, fixedPolicy, inspectionPolicy, sitemapPolicy };
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
/** Milliseconds in one day. */
const DAY = 24 * 60 * 60 * 1e3;

/**
 * Shared due-check used by every policy.
 * @param state Schedule state holding `nextAt`.
 * @param now Current time, in the same unit as `nextAt`.
 * @returns True once `now` has reached `state.nextAt`.
 */
function isDue(state, now) {
  return now >= state.nextAt;
}

/**
 * Sitemap re-check interval for a given unchanged streak:
 * daily by default, weekly from 3 unchanged checks, every 30 days from 7.
 * @param consecutiveUnchanged Number of consecutive unchanged observations.
 * @returns Interval in milliseconds.
 */
function sitemapCadenceMs(consecutiveUnchanged) {
  if (consecutiveUnchanged >= 7) return 30 * DAY;
  if (consecutiveUnchanged >= 3) return 7 * DAY;
  return DAY;
}

/**
 * Builds a policy from a version number and a cadence function. All exported
 * policies share these transition rules and differ only in the cadence:
 *
 * - `initial(now)` starts with a streak of 0, due after `cadenceMs(0)`.
 * - `observe(prev, evt)` resets the streak to 0 when the stored state was
 *   written by a different policy version or when the observation reports a
 *   change; otherwise it increments the streak. The next due time is
 *   `evt.at + cadenceMs(streak)`.
 *
 * @param version Policy version stamped into every state it produces.
 * @param cadenceMs Maps the unchanged streak to an interval in milliseconds.
 * @returns A policy object with `version`, `initial`, `observe` and `isDue`.
 */
function createPolicy(version, cadenceMs) {
  return {
    version,
    initial(now) {
      return {
        nextAt: now + cadenceMs(0),
        consecutiveUnchanged: 0,
        policyVersion: version
      };
    },
    observe(prev, evt) {
      const reset = prev.policyVersion !== version || evt.changed;
      const streak = reset ? 0 : prev.consecutiveUnchanged + 1;
      return {
        nextAt: evt.at + cadenceMs(streak),
        consecutiveUnchanged: streak,
        policyVersion: version
      };
    },
    isDue
  };
}

const SITEMAP_VERSION = 1;
/** Sitemap policy: backs off from daily to weekly to every 30 days while nothing changes. */
const sitemapPolicy = createPolicy(SITEMAP_VERSION, sitemapCadenceMs);

const INSPECTION_VERSION = 1;
/**
 * Re-inspection interval per verdict: 30 days for "PASS", 7 days for "FAIL",
 * 14 days for anything else.
 * @param verdict Inspection verdict.
 * @returns Interval in milliseconds.
 */
function inspectionCadenceMs(verdict) {
  if (verdict === "PASS") return 30 * DAY;
  if (verdict === "FAIL") return 7 * DAY;
  return 14 * DAY;
}

/**
 * Policy whose interval is fixed by the inspection verdict; the unchanged
 * streak is tracked but does not alter the interval.
 * @param verdict Inspection verdict selecting the interval.
 * @returns The schedule policy for that verdict.
 */
function inspectionPolicy(verdict) {
  const cadence = inspectionCadenceMs(verdict);
  return createPolicy(INSPECTION_VERSION, () => cadence);
}

const FIXED_VERSION = 1;
/**
 * Policy that always schedules the next check `intervalMs` after the
 * observation.
 * @param intervalMs Interval in milliseconds.
 * @returns The schedule policy.
 * @throws {TypeError} When `intervalMs` is not a number or is NaN. Such a
 *   value would yield a `nextAt` that `isDue` can never reach, so the item
 *   would silently never be re-checked.
 */
function fixedPolicy(intervalMs) {
  if (typeof intervalMs !== "number" || Number.isNaN(intervalMs)) {
    throw new TypeError(`fixedPolicy: intervalMs must be a number, got ${String(intervalMs)}`);
  }
  return createPolicy(FIXED_VERSION, () => intervalMs);
}

export { fixedPolicy, inspectionPolicy, sitemapPolicy };
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@gscdump/engine",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.10.0",
|
|
4
|
+
"version": "0.11.1",
|
|
5
5
|
"description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -61,6 +61,11 @@
|
|
|
61
61
|
"import": "./dist/sql-fragments.mjs",
|
|
62
62
|
"default": "./dist/sql-fragments.mjs"
|
|
63
63
|
},
|
|
64
|
+
"./schedule": {
|
|
65
|
+
"types": "./dist/schedule.d.mts",
|
|
66
|
+
"import": "./dist/schedule.mjs",
|
|
67
|
+
"default": "./dist/schedule.mjs"
|
|
68
|
+
},
|
|
64
69
|
"./entities": {
|
|
65
70
|
"types": "./dist/entities.d.mts",
|
|
66
71
|
"import": "./dist/entities.mjs",
|
|
@@ -164,7 +169,7 @@
|
|
|
164
169
|
"dependencies": {
|
|
165
170
|
"drizzle-orm": "^0.45.2",
|
|
166
171
|
"proper-lockfile": "^4.1.2",
|
|
167
|
-
"gscdump": "0.
|
|
172
|
+
"gscdump": "0.11.1"
|
|
168
173
|
},
|
|
169
174
|
"devDependencies": {
|
|
170
175
|
"@duckdb/duckdb-wasm": "^1.32.0",
|