@gscdump/engine 0.9.2 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/_chunks/dispatch.mjs +11 -17
- package/dist/_chunks/engine.mjs +622 -0
- package/dist/_chunks/pg-adapter.mjs +1 -10
- package/dist/_chunks/registry.d.mts +137 -15
- package/dist/_chunks/resolver.mjs +2 -25
- package/dist/_chunks/snapshot.d.mts +14 -0
- package/dist/_chunks/storage.d.mts +1 -20
- package/dist/adapters/node.d.mts +91 -0
- package/dist/adapters/node.mjs +133 -0
- package/dist/analyzer/index.d.mts +4 -50
- package/dist/analyzer/index.mjs +17 -8
- package/dist/entities.d.mts +116 -2
- package/dist/entities.mjs +453 -1
- package/dist/index.d.mts +3 -2
- package/dist/index.mjs +7 -621
- package/dist/planner.d.mts +1 -1
- package/dist/planner.mjs +1 -1
- package/dist/resolver/index.d.mts +1 -23
- package/dist/resolver/index.mjs +3 -3
- package/dist/rollups.d.mts +196 -0
- package/dist/rollups.mjs +546 -0
- package/dist/schedule.d.mts +19 -0
- package/dist/schedule.mjs +100 -0
- package/dist/snapshot.d.mts +1 -13
- package/dist/source/index.d.mts +30 -8
- package/dist/source/index.mjs +42 -7
- package/package.json +15 -5
- package/dist/_chunks/source-types.d.mts +0 -31
- /package/dist/_chunks/{planner.mjs → compiler.mjs} +0 -0
package/dist/entities.d.mts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { a as DataSource } from "./_chunks/storage.mjs";
|
|
2
|
+
import { ScheduleState } from "./schedule.mjs";
|
|
2
3
|
import { TenantCtx } from "gscdump/contracts";
|
|
3
4
|
/**
|
|
4
5
|
* GSC URL inspection result fields we persist. Mirrors the
|
|
@@ -28,8 +29,17 @@ interface InspectionRecord {
|
|
|
28
29
|
* Free-form payload for fields we don't promote to first-class columns
|
|
29
30
|
* (e.g. `referringUrls`, `crawledAs`). Keeps the wire format forward-compat
|
|
30
31
|
* without bumping the schema for every API addition.
|
|
32
|
+
*
|
|
33
|
+
* Recognised keys:
|
|
34
|
+
* - `schedule`: optional `ScheduleState` from {@link inspectionPolicy}
|
|
35
|
+
* governing when this URL is next due for re-inspection. Undefined on
|
|
36
|
+
* pre-§0 records — readers must tolerate the missing field and fall
|
|
37
|
+
* back to default policy on first observe.
|
|
31
38
|
*/
|
|
32
|
-
raw?:
|
|
39
|
+
raw?: {
|
|
40
|
+
schedule?: ScheduleState;
|
|
41
|
+
[key: string]: unknown;
|
|
42
|
+
};
|
|
33
43
|
}
|
|
34
44
|
/** Wire shape persisted to disk/R2. */
|
|
35
45
|
interface InspectionIndex {
|
|
@@ -44,6 +54,7 @@ interface InspectionHistoryShard {
|
|
|
44
54
|
}
|
|
45
55
|
declare function inspectionIndexKey(ctx: TenantCtx): string;
|
|
46
56
|
declare function emptyTypesKey(ctx: TenantCtx): string;
|
|
57
|
+
/**
 * Object key of the inspections parquet sidecar:
 * `u_{userId}/{siteId}/entities/inspections/index.parquet`, with the
 * `{siteId}` segment omitted when `ctx.siteId` is falsy.
 */
declare function inspectionParquetKey(ctx: TenantCtx): string;
|
|
47
58
|
declare function inspectionHistoryKey(ctx: TenantCtx, yearMonth: string): string;
|
|
48
59
|
/**
|
|
49
60
|
* Stable URL hash used as the index key. Short, URL-safe, deterministic.
|
|
@@ -66,6 +77,34 @@ interface InspectionStore {
|
|
|
66
77
|
loadIndex: (ctx: TenantCtx) => Promise<InspectionIndex>;
|
|
67
78
|
/** Read the per-month history shard if it exists. */
|
|
68
79
|
loadHistory: (ctx: TenantCtx, yearMonth: string) => Promise<InspectionHistoryShard | undefined>;
|
|
80
|
+
/**
|
|
81
|
+
* Snapshot the current JSON index to a parquet sidecar at
|
|
82
|
+
* `entities/inspections/index.parquet`. One PUT. Sorted by `urlHash` so
|
|
83
|
+
* DuckDB row-group stats can prune URL-keyed JOINs efficiently.
|
|
84
|
+
*
|
|
85
|
+
* Internal seam: callers don't choose JSON-vs-parquet — the store materialises
|
|
86
|
+
* the parquet at end-of-batch (e.g. after `indexing/complete`) and readers
|
|
87
|
+
* pick the format that matches their access pattern (parquet for JOINs,
|
|
88
|
+
* JSON for full-index scans / point lookups).
|
|
89
|
+
*
|
|
90
|
+
* Returns the parquet object key together with the row count and encoded byte size; {@link parquetUri} resolves that same key to a URI.
|
|
91
|
+
*/
|
|
92
|
+
materialize: (ctx: TenantCtx) => Promise<{
|
|
93
|
+
key: string;
|
|
94
|
+
rowCount: number;
|
|
95
|
+
bytes: number;
|
|
96
|
+
}>;
|
|
97
|
+
/**
|
|
98
|
+
* DuckDB-resolvable URI for the materialised parquet sidecar, or
|
|
99
|
+
* `undefined` if the underlying `DataSource` has no native URI shape
|
|
100
|
+
* (in-memory tests). When defined, read paths can `read_parquet(<uri>)`
|
|
101
|
+
* directly without staging bytes through JS.
|
|
102
|
+
*
|
|
103
|
+
* Does not check existence — caller is responsible for ensuring
|
|
104
|
+
* `materialize` has run at least once. Returning a URI for a missing key
|
|
105
|
+
* is safe; DuckDB will surface a 404 / not-found at query time.
|
|
106
|
+
*/
|
|
107
|
+
parquetUri: (ctx: TenantCtx) => string | undefined;
|
|
69
108
|
}
|
|
70
109
|
interface CreateInspectionStoreOptions {
|
|
71
110
|
dataSource: DataSource;
|
|
@@ -100,6 +139,12 @@ interface SitemapRecord {
|
|
|
100
139
|
}>;
|
|
101
140
|
/** Raw payload for fields we don't promote to first-class columns. */
|
|
102
141
|
raw?: unknown;
|
|
142
|
+
/** Number of URLs observed in this feedpath at last snapshot. */
|
|
143
|
+
urlCount?: number;
|
|
144
|
+
/** Stable hash of the sorted normalized loc list at last snapshot. */
|
|
145
|
+
contentHash?: string;
|
|
146
|
+
/** Adaptive cadence state owned by `sitemapPolicy`. */
|
|
147
|
+
schedule?: ScheduleState;
|
|
103
148
|
}
|
|
104
149
|
interface SitemapIndex {
|
|
105
150
|
version: 1;
|
|
@@ -114,6 +159,57 @@ interface SitemapHistoryDoc {
|
|
|
114
159
|
}
|
|
115
160
|
declare function sitemapIndexKey(ctx: TenantCtx): string;
|
|
116
161
|
declare function sitemapHistoryKey(ctx: TenantCtx, feedpathHash: string, capturedAtMs: number): string;
|
|
162
|
+
/**
 * Object key of the sitemap URLs index:
 * `u_{userId}/{siteId}/entities/sitemaps/urls/index.parquet`, with the
 * `{siteId}` segment omitted when `ctx.siteId` is falsy.
 */
declare function sitemapUrlsIndexKey(ctx: TenantCtx): string;
|
|
163
|
+
/**
 * Object key of one sitemap URL delta file:
 * `<urls prefix>/deltas/{date}__{feedpathHash}.parquet`, where the prefix is
 * the same directory that holds `index.parquet`. The key depends only on
 * `date` and `feedpathHash`, so there is one key per feedpath per date.
 */
declare function sitemapUrlsDeltaKey(ctx: TenantCtx, feedpathHash: string, date: string): string;
|
|
164
|
+
/** A URL entry parsed from a sitemap XML document. */
interface ParsedUrl {
  /** The entry's URL; the store hashes this value to derive the row's `urlHash`. */
  loc: string;
  /** ISO-8601 lastmod from the sitemap, if present. */
  lastmod?: string;
}
|
|
170
|
+
/** One URL row of the `urls/index.parquet` partition, after delta replay. */
interface SitemapUrlRecord {
  /** Feedpath (sitemap path) the URL was observed in. */
  feedpath: string;
  /** Hash of `feedpath`. */
  feedpathHash: string;
  /** Hash of `loc`; identifies the URL within a feedpath. */
  urlHash: string;
  /** The URL itself. */
  loc: string;
  /** ISO-8601 lastmod carried over from the sitemap entry, if any. */
  lastmod?: string;
  /** Timestamp of the first "added" delta for this URL; preserved across re-adds. */
  firstSeenAt: number;
  /** Timestamp of the most recent "added" delta for this URL. */
  lastSeenAt: number;
  /** Set when the URL has been removed. Null/undefined = currently live. */
  removedAt?: number;
}
|
|
182
|
+
/** Outcome of a `snapshotUrls` call. */
interface SnapshotUrlsResult {
  /** Incoming URLs that were absent, or marked removed, in the prior state. */
  added: number;
  /** Previously live URLs that are missing from the incoming list. */
  removed: number;
  /** Incoming URLs that were already live in the prior state. */
  kept: number;
  /** Hash of the incoming URL list, as computed by `hashUrlList`. */
  contentHash: string;
  /** True when contentHash matched prior; the call performed zero writes. */
  unchanged: boolean;
}
|
|
190
|
+
/** One row of a sitemap URL delta file: a single add or remove event. */
interface DeltaEntry {
  /** Feedpath the event belongs to. */
  feedpath: string;
  /** Hash of `feedpath`. */
  feedpathHash: string;
  /** Hash of `loc`. */
  urlHash: string;
  /** Whether the URL appeared in, or disappeared from, the feedpath. */
  op: 'added' | 'removed';
  /** The URL the event refers to. */
  loc: string;
  /** ISO-8601 lastmod recorded with the event, if any. */
  lastmod?: string;
  /** Timestamp of the snapshot that produced the event. */
  at: number;
}
|
|
199
|
+
/**
 * Optional date window for `loadDeltas`. Bounds are compared as strings
 * against the date embedded in each delta file's key; an omitted bound
 * leaves that side of the window open.
 */
interface DateRange {
  /** YYYY-MM-DD inclusive. */
  from?: string;
  /** YYYY-MM-DD inclusive. */
  to?: string;
}
|
|
205
|
+
/** Options for `loadUrls`. */
interface LoadUrlsOptions {
  /** Also yield removed URL rows after the live ones. Treated as `false` when omitted. */
  includeRemoved?: boolean;
}
|
|
208
|
+
/**
|
|
209
|
+
* Hash a URL list for change detection. Sorts then folds via FNV-1a so it's
|
|
210
|
+
* deterministic, locale-free, and cheap on Workers.
|
|
211
|
+
*/
|
|
212
|
+
declare function hashUrlList(urls: readonly ParsedUrl[]): string;
|
|
117
213
|
interface SitemapStore {
|
|
118
214
|
/**
|
|
119
215
|
* Persist a snapshot run. Updates the index + writes one immutable
|
|
@@ -124,6 +220,24 @@ interface SitemapStore {
|
|
|
124
220
|
loadIndex: (ctx: TenantCtx) => Promise<SitemapIndex>;
|
|
125
221
|
/** Fetch the latest snapshot for a feedpath, or undefined. */
|
|
126
222
|
getLatest: (ctx: TenantCtx, path: string) => Promise<SitemapRecord | undefined>;
|
|
223
|
+
/**
|
|
224
|
+
* Diff incoming URLs against the prior `urls/index.parquet` partition for
|
|
225
|
+
* `feedpath`; on change, writes a single delta parquet under
|
|
226
|
+
* `urls/deltas/YYYY-MM-DD__{feedpathHash}.parquet`. Skipped (0 PUTs) when
|
|
227
|
+
* `contentHash` matches prior.
|
|
228
|
+
*/
|
|
229
|
+
snapshotUrls: (ctx: TenantCtx, feedpath: string, urls: readonly ParsedUrl[]) => Promise<SnapshotUrlsResult>;
|
|
230
|
+
/** Stream live (and optionally removed) URL rows for a feedpath. */
|
|
231
|
+
loadUrls: (ctx: TenantCtx, feedpath: string, opts?: LoadUrlsOptions) => AsyncIterable<SitemapUrlRecord>;
|
|
232
|
+
/** Stream all delta entries within `[from, to]` (YYYY-MM-DD inclusive). */
|
|
233
|
+
loadDeltas: (ctx: TenantCtx, dateRange?: DateRange) => AsyncIterable<DeltaEntry>;
|
|
234
|
+
/**
|
|
235
|
+
* Fold every accumulated delta into the prior index; writes a fresh
|
|
236
|
+
* `urls/index.parquet` and deletes the consumed delta files.
|
|
237
|
+
*/
|
|
238
|
+
compactUrls: (ctx: TenantCtx) => Promise<void>;
|
|
239
|
+
/** DuckDB-resolvable URI for the URLs index; `undefined` if backend lacks one. */
|
|
240
|
+
urlsParquetUri: (ctx: TenantCtx) => string | undefined;
|
|
127
241
|
}
|
|
128
242
|
interface CreateSitemapStoreOptions {
|
|
129
243
|
dataSource: DataSource;
|
|
@@ -175,4 +289,4 @@ interface CreateEmptyTypesStoreOptions {
|
|
|
175
289
|
now?: () => number;
|
|
176
290
|
}
|
|
177
291
|
declare function createEmptyTypesStore(opts: CreateEmptyTypesStoreOptions): EmptyTypesStore;
|
|
178
|
-
export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, EmptyTypesDoc, EmptyTypesStore, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionIndex, InspectionRecord, InspectionStore, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, indexingMetadataIndexKey, inspectionHistoryKey, inspectionIndexKey, sitemapHistoryKey, sitemapIndexKey };
|
|
292
|
+
export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionIndex, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey };
|
package/dist/entities.mjs
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { decodeParquetToRows, encodeRowsToParquetFlex } from "./adapters/hyparquet.mjs";
|
|
1
2
|
const YEAR_MONTH_RE = /^(\d{4})-(\d{2})-/;
|
|
2
3
|
function inspectionIndexKey(ctx) {
|
|
3
4
|
return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/inspections/index.json` : `u_${ctx.userId}/entities/inspections/index.json`;
|
|
@@ -5,6 +6,9 @@ function inspectionIndexKey(ctx) {
|
|
|
5
6
|
function emptyTypesKey(ctx) {
|
|
6
7
|
return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/empty-types.json` : `u_${ctx.userId}/entities/empty-types.json`;
|
|
7
8
|
}
|
|
9
|
+
/**
 * Object key of the inspections parquet sidecar for a tenant.
 * Site-scoped when `ctx.siteId` is truthy, otherwise user-scoped.
 */
function inspectionParquetKey(ctx) {
  const segments = [`u_${ctx.userId}`];
  if (ctx.siteId) segments.push(`${ctx.siteId}`);
  segments.push("entities/inspections/index.parquet");
  return segments.join("/");
}
|
|
8
12
|
function inspectionHistoryKey(ctx, yearMonth) {
|
|
9
13
|
return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/inspections/history/${yearMonth}.json` : `u_${ctx.userId}/entities/inspections/history/${yearMonth}.json`;
|
|
10
14
|
}
|
|
@@ -75,15 +79,251 @@ function createInspectionStore(opts) {
|
|
|
75
79
|
},
|
|
76
80
|
async loadHistory(ctx, yearMonth) {
|
|
77
81
|
return await readJson(inspectionHistoryKey(ctx, yearMonth));
|
|
82
|
+
},
|
|
83
|
+
async materialize(ctx) {
|
|
84
|
+
const index = await readJson(inspectionIndexKey(ctx)) ?? emptyIndex();
|
|
85
|
+
const rows = Object.entries(index.records).map(([urlHash, r]) => ({
|
|
86
|
+
urlHash,
|
|
87
|
+
url: r.url,
|
|
88
|
+
inspectedAt: r.inspectedAt,
|
|
89
|
+
indexStatus: r.indexStatus ?? null,
|
|
90
|
+
lastCrawlTime: r.lastCrawlTime ?? null,
|
|
91
|
+
googleCanonical: r.googleCanonical ?? null,
|
|
92
|
+
userCanonical: r.userCanonical ?? null,
|
|
93
|
+
coverageState: r.coverageState ?? null,
|
|
94
|
+
robotsTxtState: r.robotsTxtState ?? null,
|
|
95
|
+
indexingState: r.indexingState ?? null,
|
|
96
|
+
pageFetchState: r.pageFetchState ?? null,
|
|
97
|
+
mobileUsabilityVerdict: r.mobileUsabilityVerdict ?? null,
|
|
98
|
+
richResultsVerdict: r.richResultsVerdict ?? null,
|
|
99
|
+
scheduleNextAt: r.raw?.schedule?.nextAt ?? null,
|
|
100
|
+
scheduleConsecutiveUnchanged: r.raw?.schedule?.consecutiveUnchanged ?? null,
|
|
101
|
+
schedulePolicyVersion: r.raw?.schedule?.policyVersion ?? null
|
|
102
|
+
}));
|
|
103
|
+
const bytes = encodeRowsToParquetFlex(rows, {
|
|
104
|
+
columns: INSPECTION_PARQUET_COLUMNS,
|
|
105
|
+
sortKey: ["urlHash"]
|
|
106
|
+
});
|
|
107
|
+
const key = inspectionParquetKey(ctx);
|
|
108
|
+
await ds.write(key, bytes);
|
|
109
|
+
return {
|
|
110
|
+
key,
|
|
111
|
+
rowCount: rows.length,
|
|
112
|
+
bytes: bytes.byteLength
|
|
113
|
+
};
|
|
114
|
+
},
|
|
115
|
+
parquetUri(ctx) {
|
|
116
|
+
return ds.uri?.(inspectionParquetKey(ctx));
|
|
78
117
|
}
|
|
79
118
|
};
|
|
80
119
|
}
|
|
120
|
+
/**
 * Column layout of the inspections parquet sidecar, in write order.
 * Each tuple is `[name, type, nullable]` and is expanded to the
 * `{ name, type, nullable }` shape passed to `encodeRowsToParquetFlex`.
 */
const INSPECTION_PARQUET_COLUMNS = [
  ["urlHash", "VARCHAR", false],
  ["url", "VARCHAR", false],
  ["inspectedAt", "VARCHAR", false],
  ["indexStatus", "VARCHAR", true],
  ["lastCrawlTime", "VARCHAR", true],
  ["googleCanonical", "VARCHAR", true],
  ["userCanonical", "VARCHAR", true],
  ["coverageState", "VARCHAR", true],
  ["robotsTxtState", "VARCHAR", true],
  ["indexingState", "VARCHAR", true],
  ["pageFetchState", "VARCHAR", true],
  ["mobileUsabilityVerdict", "VARCHAR", true],
  ["richResultsVerdict", "VARCHAR", true],
  ["scheduleNextAt", "BIGINT", true],
  ["scheduleConsecutiveUnchanged", "INTEGER", true],
  ["schedulePolicyVersion", "INTEGER", true]
].map(([name, type, nullable]) => ({ name, type, nullable }));
|
|
81
202
|
function sitemapIndexKey(ctx) {
|
|
82
203
|
return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/sitemaps/index.json` : `u_${ctx.userId}/entities/sitemaps/index.json`;
|
|
83
204
|
}
|
|
84
205
|
function sitemapHistoryKey(ctx, feedpathHash, capturedAtMs) {
|
|
85
206
|
return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/sitemaps/history/${feedpathHash}__${capturedAtMs}.json` : `u_${ctx.userId}/entities/sitemaps/history/${feedpathHash}__${capturedAtMs}.json`;
|
|
86
207
|
}
|
|
208
|
+
/**
 * Directory-like key prefix (no trailing slash) under which the sitemap URL
 * index and its delta files are stored for a tenant. Site-scoped when
 * `ctx.siteId` is truthy, otherwise user-scoped.
 */
function sitemapUrlsPrefix(ctx) {
  const scope = ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}` : `u_${ctx.userId}`;
  return `${scope}/entities/sitemaps/urls`;
}
|
|
211
|
+
/** Object key of the compacted sitemap URL index: `<urls prefix>/index.parquet`. */
function sitemapUrlsIndexKey(ctx) {
  const prefix = sitemapUrlsPrefix(ctx);
  return prefix + "/index.parquet";
}
|
|
214
|
+
/**
 * Object key of one delta file: `<urls prefix>/deltas/{date}__{feedpathHash}.parquet`.
 * The key depends only on `date` and `feedpathHash`.
 */
function sitemapUrlsDeltaKey(ctx, feedpathHash, date) {
  const prefix = sitemapUrlsPrefix(ctx);
  const fileName = `${date}__${feedpathHash}.parquet`;
  return `${prefix}/deltas/${fileName}`;
}
|
|
217
|
+
// Matches delta object keys ending in `/urls/deltas/YYYY-MM-DD__<hash>.parquet`.
// Capture 1 is the date, capture 2 the feedpath hash. Readers skip any key
// that does not match.
// NOTE(review): the hash group only accepts lowercase hex; if `hashUrl` or a
// caller-supplied `opts.hash` can emit other characters, those delta files
// would be silently ignored — confirm the hash alphabet.
const SITEMAP_URLS_DELTA_PREFIX_RE = /\/urls\/deltas\/(\d{4}-\d{2}-\d{2})__([0-9a-f]+)\.parquet$/;
|
|
218
|
+
/**
 * Column layout of `urls/index.parquet`, in write order.
 * Each tuple is `[name, type, nullable]` and is expanded to the
 * `{ name, type, nullable }` shape passed to `encodeRowsToParquetFlex`.
 */
const URLS_INDEX_COLUMNS = [
  ["feedpath", "VARCHAR", false],
  ["feedpath_hash", "VARCHAR", false],
  ["url_hash", "VARCHAR", false],
  ["loc", "VARCHAR", false],
  ["lastmod", "VARCHAR", true],
  ["first_seen_at", "BIGINT", false],
  ["last_seen_at", "BIGINT", false],
  ["removed_at", "BIGINT", true]
].map(([name, type, nullable]) => ({ name, type, nullable }));
|
|
260
|
+
/**
 * Column layout of a sitemap URL delta file, in write order.
 * Each tuple is `[name, type, nullable]` and is expanded to the
 * `{ name, type, nullable }` shape passed to `encodeRowsToParquetFlex`.
 */
const URLS_DELTA_COLUMNS = [
  ["feedpath", "VARCHAR", false],
  ["feedpath_hash", "VARCHAR", false],
  ["url_hash", "VARCHAR", false],
  ["op", "VARCHAR", false],
  ["loc", "VARCHAR", false],
  ["lastmod", "VARCHAR", true],
  ["at", "BIGINT", false]
].map(([name, type, nullable]) => ({ name, type, nullable }));
|
|
297
|
+
/**
 * Convert a decoded `urls/index.parquet` row (snake_case columns) into a
 * camelCase URL record. String columns go through `String`, timestamp
 * columns through `Number`; a null/undefined `lastmod` or `removed_at`
 * becomes `undefined`.
 */
function rowToUrlRecord(row) {
  const optional = (value, convert) => (value == null ? void 0 : convert(value));
  return {
    feedpath: String(row.feedpath),
    feedpathHash: String(row.feedpath_hash),
    urlHash: String(row.url_hash),
    loc: String(row.loc),
    lastmod: optional(row.lastmod, String),
    firstSeenAt: Number(row.first_seen_at),
    lastSeenAt: Number(row.last_seen_at),
    removedAt: optional(row.removed_at, Number)
  };
}
|
|
309
|
+
/**
 * Convert a camelCase URL record into the snake_case row shape written to
 * `urls/index.parquet`. A missing `lastmod` or `removedAt` is stored as `null`.
 */
function urlRecordToRow(r) {
  const { feedpath, feedpathHash, urlHash, loc, lastmod, firstSeenAt, lastSeenAt, removedAt } = r;
  return {
    feedpath,
    feedpath_hash: feedpathHash,
    url_hash: urlHash,
    loc,
    lastmod: lastmod ?? null,
    first_seen_at: firstSeenAt,
    last_seen_at: lastSeenAt,
    removed_at: removedAt ?? null
  };
}
|
|
321
|
+
/**
 * First ten characters of the UTC ISO-8601 rendering of an epoch-millisecond
 * timestamp, i.e. `YYYY-MM-DD` for four-digit years.
 * Throws `RangeError` (from `toISOString`) when `ms` is not a valid time value.
 */
function isoDate(ms) {
  const iso = new Date(ms).toISOString();
  return iso.substring(0, 10);
}
|
|
324
|
+
/**
 * Hash a URL list for change detection: the `loc` values are sorted with the
 * default (code-unit) ordering, joined with newlines, and passed to `hashUrl`.
 * The input array itself is not reordered.
 */
function hashUrlList(urls) {
  const locs = urls.map(({ loc }) => loc);
  locs.sort();
  const joined = locs.join("\n");
  return hashUrl(joined);
}
|
|
87
327
|
function createSitemapStore(opts) {
|
|
88
328
|
const ds = opts.dataSource;
|
|
89
329
|
const hash = opts.hash ?? hashUrl;
|
|
@@ -123,6 +363,218 @@ function createSitemapStore(opts) {
|
|
|
123
363
|
},
|
|
124
364
|
async getLatest(ctx, path) {
|
|
125
365
|
return (await readJson(sitemapIndexKey(ctx)))?.records[hash(path)];
|
|
366
|
+
},
|
|
367
|
+
async snapshotUrls(ctx, feedpath, urls) {
|
|
368
|
+
const fpHash = hash(feedpath);
|
|
369
|
+
const contentHash = hashUrlList(urls);
|
|
370
|
+
const at = now();
|
|
371
|
+
const priorByHash = /* @__PURE__ */ new Map();
|
|
372
|
+
for await (const rec of this.loadUrls(ctx, feedpath, { includeRemoved: true })) priorByHash.set(rec.urlHash, rec);
|
|
373
|
+
const livePrior = Array.from(priorByHash.values()).filter((r) => r.removedAt == null);
|
|
374
|
+
if (livePrior.length > 0) {
|
|
375
|
+
if (hashUrl(livePrior.map((r) => String(r.loc)).sort().join("\n")) === contentHash) return {
|
|
376
|
+
added: 0,
|
|
377
|
+
removed: 0,
|
|
378
|
+
kept: livePrior.length,
|
|
379
|
+
contentHash,
|
|
380
|
+
unchanged: true
|
|
381
|
+
};
|
|
382
|
+
}
|
|
383
|
+
const incomingByHash = /* @__PURE__ */ new Map();
|
|
384
|
+
for (const u of urls) incomingByHash.set(hash(u.loc), u);
|
|
385
|
+
const deltaRows = [];
|
|
386
|
+
let added = 0;
|
|
387
|
+
let removed = 0;
|
|
388
|
+
let kept = 0;
|
|
389
|
+
const date = isoDate(at);
|
|
390
|
+
for (const [urlHash, u] of incomingByHash) {
|
|
391
|
+
const prev = priorByHash.get(urlHash);
|
|
392
|
+
if (!prev || prev.removedAt != null) {
|
|
393
|
+
added++;
|
|
394
|
+
deltaRows.push({
|
|
395
|
+
feedpath,
|
|
396
|
+
feedpath_hash: fpHash,
|
|
397
|
+
url_hash: urlHash,
|
|
398
|
+
op: "added",
|
|
399
|
+
loc: u.loc,
|
|
400
|
+
lastmod: u.lastmod ?? null,
|
|
401
|
+
at
|
|
402
|
+
});
|
|
403
|
+
} else kept++;
|
|
404
|
+
}
|
|
405
|
+
for (const [urlHash, prev] of priorByHash) {
|
|
406
|
+
if (prev.removedAt != null) continue;
|
|
407
|
+
if (!incomingByHash.has(urlHash)) {
|
|
408
|
+
removed++;
|
|
409
|
+
deltaRows.push({
|
|
410
|
+
feedpath,
|
|
411
|
+
feedpath_hash: fpHash,
|
|
412
|
+
url_hash: urlHash,
|
|
413
|
+
op: "removed",
|
|
414
|
+
loc: prev.loc,
|
|
415
|
+
lastmod: prev.lastmod ?? null,
|
|
416
|
+
at
|
|
417
|
+
});
|
|
418
|
+
}
|
|
419
|
+
}
|
|
420
|
+
if (deltaRows.length > 0) {
|
|
421
|
+
const bytes = encodeRowsToParquetFlex(deltaRows, {
|
|
422
|
+
columns: URLS_DELTA_COLUMNS,
|
|
423
|
+
sortKey: ["url_hash"]
|
|
424
|
+
});
|
|
425
|
+
await ds.write(sitemapUrlsDeltaKey(ctx, fpHash, date), bytes);
|
|
426
|
+
}
|
|
427
|
+
return {
|
|
428
|
+
added,
|
|
429
|
+
removed,
|
|
430
|
+
kept,
|
|
431
|
+
contentHash,
|
|
432
|
+
unchanged: false
|
|
433
|
+
};
|
|
434
|
+
},
|
|
435
|
+
async *loadUrls(ctx, feedpath, opts) {
|
|
436
|
+
const fpHash = hash(feedpath);
|
|
437
|
+
const includeRemoved = opts?.includeRemoved ?? false;
|
|
438
|
+
const indexBytes = await ds.read(sitemapUrlsIndexKey(ctx)).catch(() => void 0);
|
|
439
|
+
const indexRows = indexBytes ? await decodeParquetToRows(indexBytes) : [];
|
|
440
|
+
const deltaKeys = (await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`)).sort();
|
|
441
|
+
const live = /* @__PURE__ */ new Map();
|
|
442
|
+
const removedMap = /* @__PURE__ */ new Map();
|
|
443
|
+
for (const row of indexRows) {
|
|
444
|
+
if (row.feedpath_hash !== fpHash) continue;
|
|
445
|
+
const rec = rowToUrlRecord(row);
|
|
446
|
+
if (rec.removedAt != null) removedMap.set(rec.urlHash, rec);
|
|
447
|
+
else live.set(rec.urlHash, rec);
|
|
448
|
+
}
|
|
449
|
+
for (const key of deltaKeys) {
|
|
450
|
+
const m = SITEMAP_URLS_DELTA_PREFIX_RE.exec(key);
|
|
451
|
+
if (!m || m[2] !== fpHash) continue;
|
|
452
|
+
const dBytes = await ds.read(key).catch(() => void 0);
|
|
453
|
+
if (!dBytes) continue;
|
|
454
|
+
const dRows = await decodeParquetToRows(dBytes);
|
|
455
|
+
for (const r of dRows) {
|
|
456
|
+
const op = String(r.op);
|
|
457
|
+
const urlHash = String(r.url_hash);
|
|
458
|
+
const at = Number(r.at);
|
|
459
|
+
if (op === "added") {
|
|
460
|
+
const prev = live.get(urlHash) ?? removedMap.get(urlHash);
|
|
461
|
+
removedMap.delete(urlHash);
|
|
462
|
+
live.set(urlHash, {
|
|
463
|
+
feedpath,
|
|
464
|
+
feedpathHash: fpHash,
|
|
465
|
+
urlHash,
|
|
466
|
+
loc: String(r.loc),
|
|
467
|
+
lastmod: r.lastmod == null ? void 0 : String(r.lastmod),
|
|
468
|
+
firstSeenAt: prev?.firstSeenAt ?? at,
|
|
469
|
+
lastSeenAt: at
|
|
470
|
+
});
|
|
471
|
+
} else if (op === "removed") {
|
|
472
|
+
const prev = live.get(urlHash);
|
|
473
|
+
live.delete(urlHash);
|
|
474
|
+
if (prev) removedMap.set(urlHash, {
|
|
475
|
+
...prev,
|
|
476
|
+
removedAt: at
|
|
477
|
+
});
|
|
478
|
+
}
|
|
479
|
+
}
|
|
480
|
+
}
|
|
481
|
+
for (const rec of live.values()) yield rec;
|
|
482
|
+
if (includeRemoved) for (const rec of removedMap.values()) yield rec;
|
|
483
|
+
},
|
|
484
|
+
async *loadDeltas(ctx, dateRange) {
|
|
485
|
+
const from = dateRange?.from;
|
|
486
|
+
const to = dateRange?.to;
|
|
487
|
+
const keys = (await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`)).sort();
|
|
488
|
+
for (const key of keys) {
|
|
489
|
+
const m = SITEMAP_URLS_DELTA_PREFIX_RE.exec(key);
|
|
490
|
+
if (!m) continue;
|
|
491
|
+
const date = m[1];
|
|
492
|
+
if (from && date < from) continue;
|
|
493
|
+
if (to && date > to) continue;
|
|
494
|
+
const bytes = await ds.read(key).catch(() => void 0);
|
|
495
|
+
if (!bytes) continue;
|
|
496
|
+
const rows = await decodeParquetToRows(bytes);
|
|
497
|
+
for (const r of rows) {
|
|
498
|
+
const op = String(r.op);
|
|
499
|
+
if (op !== "added" && op !== "removed") continue;
|
|
500
|
+
yield {
|
|
501
|
+
feedpath: String(r.feedpath),
|
|
502
|
+
feedpathHash: String(r.feedpath_hash),
|
|
503
|
+
urlHash: String(r.url_hash),
|
|
504
|
+
op,
|
|
505
|
+
loc: String(r.loc),
|
|
506
|
+
lastmod: r.lastmod == null ? void 0 : String(r.lastmod),
|
|
507
|
+
at: Number(r.at)
|
|
508
|
+
};
|
|
509
|
+
}
|
|
510
|
+
}
|
|
511
|
+
},
|
|
512
|
+
async compactUrls(ctx) {
|
|
513
|
+
const indexKey = sitemapUrlsIndexKey(ctx);
|
|
514
|
+
const indexBytes = await ds.read(indexKey).catch(() => void 0);
|
|
515
|
+
const indexRows = indexBytes ? await decodeParquetToRows(indexBytes) : [];
|
|
516
|
+
const stateKey = (fp, u) => `${fp}::${u}`;
|
|
517
|
+
const live = /* @__PURE__ */ new Map();
|
|
518
|
+
const removed = /* @__PURE__ */ new Map();
|
|
519
|
+
for (const row of indexRows) {
|
|
520
|
+
const rec = rowToUrlRecord(row);
|
|
521
|
+
const k = stateKey(rec.feedpathHash, rec.urlHash);
|
|
522
|
+
if (rec.removedAt != null) removed.set(k, rec);
|
|
523
|
+
else live.set(k, rec);
|
|
524
|
+
}
|
|
525
|
+
const deltaKeys = (await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`)).sort();
|
|
526
|
+
const consumed = [];
|
|
527
|
+
for (const key of deltaKeys) {
|
|
528
|
+
const m = SITEMAP_URLS_DELTA_PREFIX_RE.exec(key);
|
|
529
|
+
if (!m) continue;
|
|
530
|
+
const fpHash = m[2];
|
|
531
|
+
const bytes = await ds.read(key).catch(() => void 0);
|
|
532
|
+
if (!bytes) continue;
|
|
533
|
+
consumed.push(key);
|
|
534
|
+
const rows = await decodeParquetToRows(bytes);
|
|
535
|
+
for (const r of rows) {
|
|
536
|
+
const urlHash = String(r.url_hash);
|
|
537
|
+
const at = Number(r.at);
|
|
538
|
+
const k = stateKey(fpHash, urlHash);
|
|
539
|
+
const op = String(r.op);
|
|
540
|
+
if (op === "added") {
|
|
541
|
+
const prev = live.get(k) ?? removed.get(k);
|
|
542
|
+
removed.delete(k);
|
|
543
|
+
live.set(k, {
|
|
544
|
+
feedpath: String(r.feedpath),
|
|
545
|
+
feedpathHash: fpHash,
|
|
546
|
+
urlHash,
|
|
547
|
+
loc: String(r.loc),
|
|
548
|
+
lastmod: r.lastmod == null ? void 0 : String(r.lastmod),
|
|
549
|
+
firstSeenAt: prev?.firstSeenAt ?? at,
|
|
550
|
+
lastSeenAt: at
|
|
551
|
+
});
|
|
552
|
+
} else if (op === "removed") {
|
|
553
|
+
const prev = live.get(k);
|
|
554
|
+
live.delete(k);
|
|
555
|
+
if (prev) removed.set(k, {
|
|
556
|
+
...prev,
|
|
557
|
+
removedAt: at
|
|
558
|
+
});
|
|
559
|
+
}
|
|
560
|
+
}
|
|
561
|
+
}
|
|
562
|
+
const merged = [...live.values(), ...removed.values()];
|
|
563
|
+
merged.sort((a, b) => {
|
|
564
|
+
if (a.feedpathHash !== b.feedpathHash) return a.feedpathHash < b.feedpathHash ? -1 : 1;
|
|
565
|
+
if (a.urlHash !== b.urlHash) return a.urlHash < b.urlHash ? -1 : 1;
|
|
566
|
+
return 0;
|
|
567
|
+
});
|
|
568
|
+
const bytes = encodeRowsToParquetFlex(merged.map(urlRecordToRow), {
|
|
569
|
+
columns: URLS_INDEX_COLUMNS,
|
|
570
|
+
sortKey: ["feedpath_hash", "url_hash"]
|
|
571
|
+
});
|
|
572
|
+
await ds.write(indexKey, bytes);
|
|
573
|
+
if (consumed.length > 0) await ds.delete(consumed);
|
|
574
|
+
},
|
|
575
|
+
urlsParquetUri(ctx) {
|
|
576
|
+
const key = sitemapUrlsIndexKey(ctx);
|
|
577
|
+
return ds.uri ? ds.uri(key) : void 0;
|
|
126
578
|
}
|
|
127
579
|
};
|
|
128
580
|
}
|
|
@@ -206,4 +658,4 @@ function createEmptyTypesStore(opts) {
|
|
|
206
658
|
}
|
|
207
659
|
};
|
|
208
660
|
}
|
|
209
|
-
export { createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, indexingMetadataIndexKey, inspectionHistoryKey, inspectionIndexKey, sitemapHistoryKey, sitemapIndexKey };
|
|
661
|
+
export { createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey };
|
package/dist/index.d.mts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
|
-
import { A as SyncStateDetail, B as WriteResult, C as QueryExecutor, D as SearchType, E as RunSQLOptions, F as TenantCtx, G as
|
|
1
|
+
import { A as SyncStateDetail, B as WriteResult, C as QueryExecutor, D as SearchType, E as RunSQLOptions, F as TenantCtx, G as CompactionThresholds, H as inferLegacyTier, I as Watermark, K as enumeratePartitions, L as WatermarkFilter, M as SyncStateKind, N as SyncStateScope, O as StorageEngine, P as TableName, R as WatermarkScope, S as QueryExecuteResult, T as Row, U as inferSearchType, V as dayPartition, W as objectKey, _ as PurgeFilter, a as DataSource, b as QueryCtx, c as FileSetRef, d as LockScope, f as ManifestEntry, g as ParquetCodec, h as OptimizedQueryResult, i as DEFAULT_SEARCH_TYPE, j as SyncStateFilter, k as SyncState, l as GcCtx, m as ManifestStore, n as CompactionTier, o as EngineOptions, p as ManifestPurgeResult, r as ComparisonResult, s as ExtraResult, t as CodecCtx, u as ListLiveFilter, v as PurgeResult, w as QueryResult, x as QueryExecuteOptions, y as PurgeUrlsResult, z as WriteCtx } from "./_chunks/storage.mjs";
|
|
2
2
|
import { a as createDuckDBExecutor, i as createDuckDBCodec, n as DuckDBHandle, r as canonicalEmptyParquetSchema, t as DuckDBFactory } from "./_chunks/duckdb.mjs";
|
|
3
3
|
import { _ as pages, a as allTables, c as inferTable, d as TABLE_METADATA, f as countries, g as page_keywords, h as keywords, i as TableSchema, m as drizzleSchema, n as ColumnType, o as currentSchemaVersion, p as devices, r as SCHEMAS, s as dimensionToColumn, t as ColumnDef, u as DrizzleSchema } from "./_chunks/schema.mjs";
|
|
4
|
+
import { InspectionVerdict, SchedulePolicy, ScheduleState, fixedPolicy, inspectionPolicy, sitemapPolicy } from "./schedule.mjs";
|
|
4
5
|
import { GscApiRow, IngestOptions, RowAccumulator, RowAccumulatorOptions, createRowAccumulator, toPath, toSumPosition, transformGscRow } from "./ingest.mjs";
|
|
5
6
|
import { a as substituteNamedFiles, i as resolveToSQL, n as ResolvedQuery, t as FILES_PLACEHOLDER } from "./_chunks/planner.mjs";
|
|
6
7
|
import { bindLiterals, formatLiteral } from "./sql-bind.mjs";
|
|
@@ -9,4 +10,4 @@ declare function coerceRow(row: Row$1): Row$1;
|
|
|
9
10
|
declare function coerceRows(rows: readonly Row$1[]): Row$1[];
|
|
10
11
|
declare const MAX_DAY_BYTES: number;
|
|
11
12
|
declare function createStorageEngine(opts: EngineOptions): StorageEngine;
|
|
12
|
-
export { type CodecCtx, type ColumnDef, type ColumnType, type CompactionThresholds, type CompactionTier, type ComparisonResult, DEFAULT_SEARCH_TYPE, type DataSource, type DrizzleSchema, type DuckDBFactory, type DuckDBHandle, type EngineOptions, type ExtraResult, FILES_PLACEHOLDER, type FileSetRef, type GcCtx, type GscApiRow, type IngestOptions, type ListLiveFilter, type LockScope, MAX_DAY_BYTES, type ManifestEntry, type ManifestPurgeResult, type ManifestStore, type OptimizedQueryResult, type ParquetCodec, type PurgeFilter, type PurgeResult, type PurgeUrlsResult, type QueryCtx, type QueryExecuteOptions, type QueryExecuteResult, type QueryExecutor, type QueryResult, type ResolvedQuery, type Row, type RowAccumulator, type RowAccumulatorOptions, type RunSQLOptions, SCHEMAS, type SearchType, type StorageEngine, type SyncState, type SyncStateDetail, type SyncStateFilter, type SyncStateKind, type SyncStateScope, TABLE_METADATA, type TableName, type TableSchema, type TenantCtx, type Watermark, type WatermarkFilter, type WatermarkScope, type WriteCtx, type WriteResult, allTables, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, countries, createDuckDBCodec, createDuckDBExecutor, createRowAccumulator, createStorageEngine, currentSchemaVersion, dayPartition, devices, dimensionToColumn, drizzleSchema, enumeratePartitions, formatLiteral, inferLegacyTier, inferSearchType, inferTable,
|
|
13
|
+
export { type CodecCtx, type ColumnDef, type ColumnType, type CompactionThresholds, type CompactionTier, type ComparisonResult, DEFAULT_SEARCH_TYPE, type DataSource, type DrizzleSchema, type DuckDBFactory, type DuckDBHandle, type EngineOptions, type ExtraResult, FILES_PLACEHOLDER, type FileSetRef, type GcCtx, type GscApiRow, type IngestOptions, type InspectionVerdict, type ListLiveFilter, type LockScope, MAX_DAY_BYTES, type ManifestEntry, type ManifestPurgeResult, type ManifestStore, type OptimizedQueryResult, type ParquetCodec, type PurgeFilter, type PurgeResult, type PurgeUrlsResult, type QueryCtx, type QueryExecuteOptions, type QueryExecuteResult, type QueryExecutor, type QueryResult, type ResolvedQuery, type Row, type RowAccumulator, type RowAccumulatorOptions, type RunSQLOptions, SCHEMAS, type SchedulePolicy, type ScheduleState, type SearchType, type StorageEngine, type SyncState, type SyncStateDetail, type SyncStateFilter, type SyncStateKind, type SyncStateScope, TABLE_METADATA, type TableName, type TableSchema, type TenantCtx, type Watermark, type WatermarkFilter, type WatermarkScope, type WriteCtx, type WriteResult, allTables, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, countries, createDuckDBCodec, createDuckDBExecutor, createRowAccumulator, createStorageEngine, currentSchemaVersion, dayPartition, devices, dimensionToColumn, drizzleSchema, enumeratePartitions, fixedPolicy, formatLiteral, inferLegacyTier, inferSearchType, inferTable, inspectionPolicy, keywords, objectKey, page_keywords, pages, resolveToSQL, sitemapPolicy, substituteNamedFiles, toPath, toSumPosition, transformGscRow };
|