@gscdump/engine 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +53 -0
- package/dist/adapters/duckdb-node.d.mts +19 -0
- package/dist/adapters/duckdb-node.mjs +78 -0
- package/dist/adapters/filesystem.d.mts +206 -0
- package/dist/adapters/filesystem.mjs +320 -0
- package/dist/adapters/http.d.mts +227 -0
- package/dist/adapters/http.mjs +119 -0
- package/dist/adapters/hyparquet.d.mts +107 -0
- package/dist/adapters/hyparquet.mjs +250 -0
- package/dist/adapters/inspection-sqlite-browser.d.mts +9 -0
- package/dist/adapters/inspection-sqlite-browser.mjs +42 -0
- package/dist/adapters/inspection-sqlite-node.d.mts +9 -0
- package/dist/adapters/inspection-sqlite-node.mjs +32 -0
- package/dist/adapters/node-harness.d.mts +334 -0
- package/dist/adapters/node-harness.mjs +1907 -0
- package/dist/adapters/r2-manifest.d.mts +227 -0
- package/dist/adapters/r2-manifest.mjs +355 -0
- package/dist/adapters/r2.d.mts +93 -0
- package/dist/adapters/r2.mjs +65 -0
- package/dist/arrow-utils.d.mts +14 -0
- package/dist/arrow-utils.mjs +8 -0
- package/dist/contracts.d.mts +436 -0
- package/dist/contracts.mjs +1 -0
- package/dist/entities.d.mts +238 -0
- package/dist/entities.mjs +359 -0
- package/dist/index.d.mts +1849 -0
- package/dist/index.mjs +1976 -0
- package/dist/ingest.d.mts +96 -0
- package/dist/ingest.mjs +187 -0
- package/dist/planner.d.mts +16 -0
- package/dist/planner.mjs +321 -0
- package/dist/resolver/index.d.mts +207 -0
- package/dist/resolver/index.mjs +869 -0
- package/dist/rollups.d.mts +207 -0
- package/dist/rollups.mjs +553 -0
- package/dist/schema.d.mts +1258 -0
- package/dist/schema.mjs +139 -0
- package/dist/scope.d.mts +38 -0
- package/dist/scope.mjs +28 -0
- package/dist/snapshot.d.mts +14 -0
- package/dist/snapshot.mjs +1 -0
- package/dist/sql-bind.d.mts +19 -0
- package/dist/sql-bind.mjs +92 -0
- package/dist/sql-fragments.d.mts +21 -0
- package/dist/sql-fragments.mjs +13 -0
- package/package.json +168 -0
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
import { TenantCtx } from "gscdump/contracts";
|
|
2
|
+
/**
 * Storage backend the entity stores read and write through: a flat space of
 * string keys, each holding one byte blob.
 */
interface DataSource {
  /**
   * Fetch the bytes stored under `key`. `range` and `signal` are optional.
   * NOTE(review): the JSON and SQLite entity stores shipped with this package
   * treat a rejected `read` as "key absent" — confirm adapters reject for
   * missing keys.
   */
  read: (key: string, range?: {
    offset: number;
    length: number;
  }, signal?: AbortSignal) => Promise<Uint8Array>;
  /** Store `bytes` under `key`. */
  write: (key: string, bytes: Uint8Array) => Promise<void>;
  /** Remove every key in `keys`. */
  delete: (keys: string[]) => Promise<void>;
  /**
   * One-shot listing under a prefix. Implementations may cap the number of
   * returned keys (typically 10k) — callers iterating full tenant space
   * should prefer `streamList` when available or narrow the prefix.
   */
  list: (prefix: string) => Promise<string[]>;
  /**
   * Per-key URI probe. Returns a URI string DuckDB's `httpfs` (or an
   * equivalent engine that fetches its own I/O) can read directly, or
   * `undefined` if the key isn't URI-resolvable on this backend and the
   * caller must fall back to `read(key)` for the bytes.
   *
   * Contracts:
   * - When defined, the returned URI MUST yield byte-identical content to
   *   `read(key)`. Callers rely on this for correctness.
   * - Backends with a native URI for every key (filesystem: absolute path,
   *   R2 via `httpfs`: signed URL) may always return a string.
   * - Backends without a native URI shape (in-memory) omit the method or
   *   return `undefined` per call.
   * - Mixed-per-query is allowed: some keys in one query may return a URI,
   *   others may not; the executor branches per key.
   */
  uri?: (key: string) => string | undefined;
  /**
   * Optional — probe the byte size of a key without reading it. Used by
   * the engine to fill in `WriteResult.bytes` when a codec reports 0 or
   * unknown but the file is non-trivial.
   */
  head?: (key: string) => Promise<{
    bytes: number;
  } | undefined>;
  /**
   * Optional streaming variant of `list`. Implementations that page
   * backing-store results (R2, S3) should implement this and yield keys
   * lazily. `list` may return up to an adapter-defined cap (typically
   * 10k keys); callers iterating full tenant space must prefer
   * `streamList` when available, or chunk by narrower prefixes.
   */
  streamList?: (prefix: string) => AsyncIterable<string>;
}
|
|
49
|
+
/**
 * GSC URL inspection result fields we persist. Mirrors the
 * `searchconsole_v1.Schema$UrlInspectionResult` shape but as plain JSON
 * so storage doesn't depend on the googleapis type tree.
 *
 * Only `url` and `inspectedAt` are required; every other field is optional.
 */
interface InspectionRecord {
  /** The inspected URL; the stores hash it to derive the index key. */
  url: string;
  /**
   * ISO-8601 timestamp of when we ran the inspection. Its leading `YYYY-MM-`
   * selects the history shard the record is appended to.
   */
  inspectedAt: string;
  /** PASS / NEUTRAL / FAIL — the headline verdict from indexStatusResult. */
  indexStatus?: string;
  /** Last-crawl timestamp the API reports (ISO-8601). */
  lastCrawlTime?: string;
  /** Canonical URL Google selected. */
  googleCanonical?: string;
  /** Canonical URL the page declares. */
  userCanonical?: string;
  /** Crawl/index/serving disposition strings as the API returns them. */
  coverageState?: string;
  robotsTxtState?: string;
  indexingState?: string;
  pageFetchState?: string;
  mobileUsabilityVerdict?: string;
  richResultsVerdict?: string;
  /**
   * Free-form payload for fields we don't promote to first-class columns
   * (e.g. `referringUrls`, `crawledAs`). Keeps the wire format forward-compat
   * without bumping the schema for every API addition.
   */
  raw?: unknown;
}
|
|
80
|
+
/**
 * Wire shape persisted to disk/R2: the "latest inspection per URL" document
 * stored under `inspectionIndexKey(ctx)`.
 */
interface InspectionIndex {
  /** Format version; always the literal `1`. */
  version: 1;
  /** Map of urlHash → InspectionRecord (latest only). */
  records: Record<string, InspectionRecord>;
}
|
|
86
|
+
/** One month of inspection history, stored under `inspectionHistoryKey(ctx, yearMonth)`. */
interface InspectionHistoryShard {
  /** Format version; always the literal `1`. */
  version: 1;
  /** Append-only list of inspection records for the YYYY-MM bucket. */
  records: InspectionRecord[];
}
|
|
91
|
+
/** Storage key of the tenant's inspection index document (`…/entities/inspections/index.json`). */
declare function inspectionIndexKey(ctx: TenantCtx): string;
/** Storage key of the tenant's empty-search-types document (`…/entities/empty-types.json`). */
declare function emptyTypesKey(ctx: TenantCtx): string;
/** Storage key of one inspection history shard (`…/entities/inspections/history/<yearMonth>.json`). */
declare function inspectionHistoryKey(ctx: TenantCtx, yearMonth: string): string;
|
|
94
|
+
/**
 * Stable URL hash used as the index key. Short, URL-safe, deterministic.
 * Uses a 64-bit FNV-1a; collisions vanishingly unlikely at the scales we
 * care about (≤100k URLs/site).
 *
 * Returns 16 lowercase hexadecimal characters.
 *
 * NOTE(review): the bundled implementation's seed words and multiply step do
 * not appear to match the reference FNV-1a 64 constants, so do not expect
 * parity with other FNV libraries. The output is persisted as record keys, so
 * it must stay stable — confirm before changing the algorithm.
 */
declare function hashUrl(url: string): string;
|
|
100
|
+
/** Read/write API for persisted URL inspection results of one tenant. */
interface InspectionStore {
  /**
   * Persist a batch of fresh inspection results. Updates the index +
   * appends to the per-month history shard.
   *
   * The bundled implementations return without touching storage when
   * `records` is empty.
   */
  writeBatch: (ctx: TenantCtx, records: readonly InspectionRecord[]) => Promise<void>;
  /** Fetch the latest inspection record for a URL, or undefined. */
  getLatest: (ctx: TenantCtx, url: string) => Promise<InspectionRecord | undefined>;
  /**
   * Read the full index for a site (latest record per URL). Cheap on
   * Workers; on big tenants the dashboard reads this once per page load.
   */
  loadIndex: (ctx: TenantCtx) => Promise<InspectionIndex>;
  /** Read the per-month history shard if it exists. */
  loadHistory: (ctx: TenantCtx, yearMonth: string) => Promise<InspectionHistoryShard | undefined>;
}
|
|
116
|
+
/** Options for `createInspectionStore`. */
interface CreateInspectionStoreOptions {
  /** Backend that holds the index and history documents. */
  dataSource: DataSource;
  /**
   * Override the FNV hash with a callable (test seam, or to swap in
   * SHA-256 if hash collisions become a concern at extreme scale).
   */
  hash?: (url: string) => string;
  /**
   * Clock override.
   * NOTE(review): the bundled `createInspectionStore` implementation does not
   * read this option — confirm whether it is reserved for future use.
   */
  now?: () => number;
}
/** Build an `InspectionStore` that keeps its index and history as JSON documents in `opts.dataSource`. */
declare function createInspectionStore(opts: CreateInspectionStoreOptions): InspectionStore;
|
|
126
|
+
/**
 * Minimal SQL driver surface the SQLite-backed inspection store needs.
 * Every method may be synchronous or return a promise.
 */
interface InspectionSqlDriver {
  /** Run SQL text with no bound parameters (the store uses it to apply its schema). */
  exec: (sql: string) => void | Promise<void>;
  /** Run one statement with positional `?` parameters, discarding any rows. */
  run: (sql: string, params: unknown[]) => void | Promise<void>;
  /** Run one query with positional `?` parameters and return all rows. */
  all: (sql: string, params: unknown[]) => unknown[] | Promise<unknown[]>;
  /** Produce the database as bytes; the store writes them back to the data source. */
  serialize: () => Uint8Array | Promise<Uint8Array>;
  /** Release the driver. */
  close: () => void | Promise<void>;
}
|
|
133
|
+
/** Options for `createInspectionStoreSqlite`. */
interface CreateInspectionStoreSqliteOptions {
  /** Backend that holds the serialized database blob. */
  dataSource: DataSource;
  /**
   * Open a driver over previously serialized database bytes. Receives
   * `undefined` when the store could not read an existing blob.
   */
  openDriver: (bytes: Uint8Array | undefined) => InspectionSqlDriver | Promise<InspectionSqlDriver>;
  /** Override the URL hash used for the `url_hash` column. */
  hash?: (url: string) => string;
}
/** Storage key of the tenant's serialized inspection database (`…/entities/inspections/inspections.db`). */
declare function inspectionSqliteKey(ctx: TenantCtx): string;
/** Build an `InspectionStore` backed by a single SQLite database blob kept in `opts.dataSource`. */
declare function createInspectionStoreSqlite(opts: CreateInspectionStoreSqliteOptions): InspectionStore;
|
|
140
|
+
/**
 * GSC sitemap record we persist. Matches `Schema$WmxSitemap` but as plain JSON.
 * Only `path` and `capturedAt` are required.
 */
interface SitemapRecord {
  /** The sitemap URL (feedpath) as returned by GSC. */
  path: string;
  /** ISO-8601 timestamp of the snapshot run that captured this record. */
  capturedAt: string;
  /** Last time Google downloaded this sitemap (RFC 3339, from the API). */
  lastDownloaded?: string;
  /** Last time the sitemap was submitted. */
  lastSubmitted?: string;
  type?: string;
  isPending?: boolean;
  isSitemapsIndex?: boolean;
  /** NOTE(review): typed as a string here — presumably a stringified count from the API; confirm. */
  errors?: string;
  /** NOTE(review): typed as a string here — presumably a stringified count from the API; confirm. */
  warnings?: string;
  /** Per-content-type counts (web, image, video, news). */
  contents?: Array<{
    type?: string;
    submitted?: string;
    indexed?: string;
  }>;
  /** Raw payload for fields we don't promote to first-class columns. */
  raw?: unknown;
}
|
|
164
|
+
/** "Latest snapshot per sitemap" document stored under `sitemapIndexKey(ctx)`. */
interface SitemapIndex {
  /** Format version; always the literal `1`. */
  version: 1;
  /** Map of feedpathHash → latest SitemapRecord. */
  records: Record<string, SitemapRecord>;
}
|
|
169
|
+
/** One history document: a single sitemap record as captured by one snapshot run. */
interface SitemapHistoryDoc {
  /** Format version; always the literal `1`. */
  version: 1;
  /** Copy of `record.path`. */
  path: string;
  /** Copy of `record.capturedAt`. */
  capturedAt: string;
  /** The full record as passed to `writeSnapshot`. */
  record: SitemapRecord;
}
|
|
175
|
+
/** Storage key of the tenant's sitemap index document (`…/entities/sitemaps/index.json`). */
declare function sitemapIndexKey(ctx: TenantCtx): string;
/** Storage key of one sitemap history document (`…/entities/sitemaps/history/<feedpathHash>__<capturedAtMs>.json`). */
declare function sitemapHistoryKey(ctx: TenantCtx, feedpathHash: string, capturedAtMs: number): string;
|
|
177
|
+
/** Read/write API for persisted sitemap snapshots of one tenant. */
interface SitemapStore {
  /**
   * Persist a snapshot run. Updates the index + writes one immutable
   * history doc per record under `history/<feedpathHash>__<capturedAtMs>.json`.
   *
   * In the bundled implementation the `<capturedAtMs>` part comes from the
   * store's clock (`opts.now`, default `Date.now`), sampled once per call —
   * not from each record's `capturedAt`. An empty `records` array is a no-op.
   */
  writeSnapshot: (ctx: TenantCtx, records: readonly SitemapRecord[]) => Promise<void>;
  /** Load the full site index (latest record per feedpath). */
  loadIndex: (ctx: TenantCtx) => Promise<SitemapIndex>;
  /** Fetch the latest snapshot for a feedpath, or undefined. */
  getLatest: (ctx: TenantCtx, path: string) => Promise<SitemapRecord | undefined>;
}
|
|
188
|
+
/** Options for `createSitemapStore`. */
interface CreateSitemapStoreOptions {
  /** Backend that holds the index and history documents. */
  dataSource: DataSource;
  /** Override the feedpath hash (test seam). */
  hash?: (path: string) => string;
  /** Clock returning unix milliseconds; used to stamp history document keys. */
  now?: () => number;
}
/** Build a `SitemapStore` that keeps its index and history as JSON documents in `opts.dataSource`. */
declare function createSitemapStore(opts: CreateSitemapStoreOptions): SitemapStore;
|
|
195
|
+
/**
 * Per-URL indexing notification metadata we persist.
 * NOTE(review): field names suggest this mirrors the Google Indexing API's
 * URL notification metadata — confirm against the producer.
 */
interface IndexingMetadataRecord {
  /** The URL the metadata describes; the store hashes it to derive the index key. */
  url: string;
  /** NOTE(review): presumably an ISO-8601 timestamp of when we captured this record — confirm. */
  capturedAt: string;
  /** ISO-8601 notifyTime of the latest `URL_UPDATED` notification we've seen. */
  latestUpdateAt?: string;
  /** ISO-8601 notifyTime of the latest `URL_REMOVED` notification we've seen. */
  latestRemoveAt?: string;
  /** Unstructured extra payload stored alongside the typed fields. */
  raw?: unknown;
}
|
|
204
|
+
/** "Latest metadata per URL" document stored under `indexingMetadataIndexKey(ctx)`. */
interface IndexingMetadataIndex {
  /** Format version; always the literal `1`. */
  version: 1;
  /** Map of urlHash → latest IndexingMetadataRecord. */
  records: Record<string, IndexingMetadataRecord>;
}
/** Storage key of the tenant's indexing metadata index document (`…/entities/indexing/index.json`). */
declare function indexingMetadataIndexKey(ctx: TenantCtx): string;
|
|
209
|
+
/** Read/write API for persisted indexing metadata of one tenant. */
interface IndexingMetadataStore {
  /**
   * Upsert the given records into the index, keyed by URL hash. The bundled
   * implementation is a no-op for an empty `records` array.
   */
  writeBatch: (ctx: TenantCtx, records: readonly IndexingMetadataRecord[]) => Promise<void>;
  /** Load the full index (latest record per URL). */
  loadIndex: (ctx: TenantCtx) => Promise<IndexingMetadataIndex>;
  /** Fetch the latest record for a URL, or undefined. */
  getLatest: (ctx: TenantCtx, url: string) => Promise<IndexingMetadataRecord | undefined>;
}
|
|
214
|
+
/** Options for `createIndexingMetadataStore`. */
interface CreateIndexingMetadataStoreOptions {
  /** Backend that holds the index document. */
  dataSource: DataSource;
  /** Override the URL hash used for index keys. */
  hash?: (url: string) => string;
}
/** Build an `IndexingMetadataStore` that keeps its index as one JSON document in `opts.dataSource`. */
declare function createIndexingMetadataStore(opts: CreateIndexingMetadataStoreOptions): IndexingMetadataStore;
|
|
219
|
+
/** Document stored under `emptyTypesKey(ctx)` recording which search types are known to be empty. */
interface EmptyTypesDoc {
  /** Format version; always the literal `1`. */
  version: 1;
  /** SearchType strings detected as empty for this (user, site). */
  emptyTypes: string[];
  /** When each type was last marked empty (unix ms). Helps debug stale skips. */
  markedAt: Record<string, number>;
}
|
|
226
|
+
/** Read/write API for the tenant's empty-search-types document. */
interface EmptyTypesStore {
  /** Load the current document; the bundled implementation yields an empty doc when the read fails. */
  load: (ctx: TenantCtx) => Promise<EmptyTypesDoc>;
  /**
   * Add types to the empty set, preserving existing markers. No-op if all already present.
   * `now` overrides the timestamp recorded for newly marked types.
   */
  mark: (ctx: TenantCtx, types: readonly string[], now?: number) => Promise<EmptyTypesDoc>;
  /** Remove types from the empty set. Returns the updated doc. */
  clear: (ctx: TenantCtx, types: readonly string[]) => Promise<EmptyTypesDoc>;
}
|
|
233
|
+
/** Options for `createEmptyTypesStore`. */
interface CreateEmptyTypesStoreOptions {
  /** Backend that holds the document. */
  dataSource: DataSource;
  /** Clock returning unix milliseconds; used when `mark` is called without an explicit timestamp. */
  now?: () => number;
}
/** Build an `EmptyTypesStore` that keeps its state as one JSON document in `opts.dataSource`. */
declare function createEmptyTypesStore(opts: CreateEmptyTypesStoreOptions): EmptyTypesStore;
|
|
238
|
+
export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateInspectionStoreSqliteOptions, CreateSitemapStoreOptions, EmptyTypesDoc, EmptyTypesStore, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionIndex, InspectionRecord, InspectionSqlDriver, InspectionStore, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createInspectionStoreSqlite, createSitemapStore, emptyTypesKey, hashUrl, indexingMetadataIndexKey, inspectionHistoryKey, inspectionIndexKey, inspectionSqliteKey, sitemapHistoryKey, sitemapIndexKey };
|
|
@@ -0,0 +1,359 @@
|
|
|
1
|
+
// Captures the four-digit year and two-digit month from a string that starts
// with "YYYY-MM-"; used to choose the history shard for an inspection record.
const YEAR_MONTH_RE = /^(\d{4})-(\d{2})-/;
|
|
2
|
+
/**
 * Build the storage key of the inspection index document.
 *
 * @param ctx Tenant context; `userId` is always used, `siteId` only when truthy.
 * @returns `u_<userId>/<siteId>/entities/inspections/index.json`, or the same
 *   path without the site segment when `ctx.siteId` is falsy.
 */
function inspectionIndexKey(ctx) {
  const tenantRoot = ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}` : `u_${ctx.userId}`;
  return `${tenantRoot}/entities/inspections/index.json`;
}
|
|
5
|
+
/**
 * Build the storage key of the empty-search-types document.
 *
 * @param ctx Tenant context; `userId` is always used, `siteId` only when truthy.
 * @returns `u_<userId>[/<siteId>]/entities/empty-types.json`.
 */
function emptyTypesKey(ctx) {
  if (ctx.siteId) {
    return `u_${ctx.userId}/${ctx.siteId}/entities/empty-types.json`;
  }
  return `u_${ctx.userId}/entities/empty-types.json`;
}
|
|
8
|
+
/**
 * Build the storage key of one inspection history shard.
 *
 * @param ctx Tenant context; `userId` is always used, `siteId` only when truthy.
 * @param yearMonth Shard name, interpolated into the key as-is.
 * @returns `u_<userId>[/<siteId>]/entities/inspections/history/<yearMonth>.json`.
 */
function inspectionHistoryKey(ctx, yearMonth) {
  const shardFile = `entities/inspections/history/${yearMonth}.json`;
  return ctx.siteId
    ? `u_${ctx.userId}/${ctx.siteId}/${shardFile}`
    : `u_${ctx.userId}/${shardFile}`;
}
|
|
11
|
+
/**
 * Deterministic hash of a string, rendered as 16 lowercase hex characters.
 *
 * The state is two 32-bit words. For every UTF-16 code unit of `url` the unit
 * is XOR-ed into the low word, then both words are multiplied by 435 with a
 * carry term fed from the low word into the high word.
 *
 * The stores use the result as a persisted record key, so the exact
 * arithmetic (including the signed intermediate produced by `^=`) must not
 * change.
 *
 * @param url Text to hash.
 * @returns High word followed by low word, each zero-padded to 8 hex digits.
 */
function hashUrl(url) {
  let high = 2166136261;
  let low = 3421674724;
  let pos = 0;
  while (pos < url.length) {
    // `^=` leaves `low` as a signed 32-bit value; the steps below rely on that.
    low ^= url.charCodeAt(pos);
    pos += 1;
    const nextLow = Math.imul(low, 435) >>> 0;
    const overflow = Math.floor(low * 435 / 4294967296);
    const nextHigh = (Math.imul(high, 435) + Math.imul(low, 1) + overflow) >>> 0;
    low = nextLow;
    high = nextHigh;
  }
  const toHex32 = (word) => (word >>> 0).toString(16).padStart(8, "0");
  return toHex32(high) + toHex32(low);
}
|
|
25
|
+
/**
 * Create an inspection store that keeps one JSON index document plus one JSON
 * history document per month in `opts.dataSource`.
 *
 * A rejected `dataSource.read` is treated as "document absent"; a JSON parse
 * failure is not swallowed and propagates to the caller.
 *
 * @param opts `dataSource` (required) and optional `hash` (defaults to `hashUrl`).
 * @returns Object with `writeBatch`, `getLatest`, `loadIndex`, `loadHistory`.
 */
function createInspectionStore(opts) {
  const hashOf = opts.hash ?? hashUrl;
  const dataSource = opts.dataSource;

  const decodeJson = (bytes) => JSON.parse(new TextDecoder().decode(bytes));
  const absent = () => void 0;
  // Two-argument `then`: only a failed read maps to undefined, not a failed parse.
  const fetchJson = async (key) => await dataSource.read(key).then(decodeJson, absent);
  const storeJson = async (key, value) => {
    const encoded = new TextEncoder().encode(JSON.stringify(value));
    await dataSource.write(key, encoded);
  };

  const blankIndex = () => ({ version: 1, records: {} });
  const blankShard = () => ({ version: 1, records: [] });

  // Records whose timestamp lacks a "YYYY-MM-" prefix land in the "unknown" shard.
  const monthOf = (record) => {
    const match = YEAR_MONTH_RE.exec(record.inspectedAt);
    if (!match) return "unknown";
    return `${match[1]}-${match[2]}`;
  };

  return {
    /** Upsert every record into the index, then append each to its month shard. */
    async writeBatch(ctx, records) {
      if (records.length === 0) return;
      const indexKey = inspectionIndexKey(ctx);
      const index = (await fetchJson(indexKey)) ?? blankIndex();
      const grouped = /* @__PURE__ */ new Map();
      for (const record of records) {
        index.records[hashOf(record.url)] = record;
        const month = monthOf(record);
        const bucket = grouped.get(month);
        if (bucket) bucket.push(record);
        else grouped.set(month, [record]);
      }
      // The index is written before the shards, one shard at a time.
      await storeJson(indexKey, index);
      for (const [yearMonth, batch] of grouped) {
        const historyKey = inspectionHistoryKey(ctx, yearMonth);
        const shard = (await fetchJson(historyKey)) ?? blankShard();
        shard.records.push(...batch);
        await storeJson(historyKey, shard);
      }
    },
    /** Latest record for `url`, or undefined when the index or the entry is missing. */
    async getLatest(ctx, url) {
      const index = await fetchJson(inspectionIndexKey(ctx));
      if (index === null || index === void 0) return void 0;
      return index.records[hashOf(url)];
    },
    /** Whole index, or a fresh empty one when nothing is stored. */
    async loadIndex(ctx) {
      const index = await fetchJson(inspectionIndexKey(ctx));
      return index ?? blankIndex();
    },
    /** History shard for `yearMonth`, or undefined when it cannot be read. */
    async loadHistory(ctx, yearMonth) {
      const historyKey = inspectionHistoryKey(ctx, yearMonth);
      return await fetchJson(historyKey);
    }
  };
}
|
|
81
|
+
/**
 * Build the storage key of the serialized inspection database.
 *
 * @param ctx Tenant context; `userId` is always used, `siteId` only when truthy.
 * @returns `u_<userId>[/<siteId>]/entities/inspections/inspections.db`.
 */
function inspectionSqliteKey(ctx) {
  const dbFile = "entities/inspections/inspections.db";
  if (!ctx.siteId) {
    return `u_${ctx.userId}/${dbFile}`;
  }
  return `u_${ctx.userId}/${ctx.siteId}/${dbFile}`;
}
|
|
84
|
+
const INSPECTION_SCHEMA_SQL = `
|
|
85
|
+
CREATE TABLE IF NOT EXISTS inspections (
|
|
86
|
+
url_hash TEXT PRIMARY KEY,
|
|
87
|
+
url TEXT NOT NULL,
|
|
88
|
+
inspected_at TEXT NOT NULL,
|
|
89
|
+
index_status TEXT,
|
|
90
|
+
last_crawl_time TEXT,
|
|
91
|
+
google_canonical TEXT,
|
|
92
|
+
user_canonical TEXT,
|
|
93
|
+
coverage_state TEXT,
|
|
94
|
+
robots_txt_state TEXT,
|
|
95
|
+
indexing_state TEXT,
|
|
96
|
+
page_fetch_state TEXT,
|
|
97
|
+
mobile_usability_verdict TEXT,
|
|
98
|
+
rich_results_verdict TEXT,
|
|
99
|
+
raw TEXT
|
|
100
|
+
);
|
|
101
|
+
CREATE TABLE IF NOT EXISTS inspection_history (
|
|
102
|
+
year_month TEXT NOT NULL,
|
|
103
|
+
url_hash TEXT NOT NULL,
|
|
104
|
+
url TEXT NOT NULL,
|
|
105
|
+
inspected_at TEXT NOT NULL,
|
|
106
|
+
payload TEXT NOT NULL,
|
|
107
|
+
PRIMARY KEY (year_month, url_hash, inspected_at)
|
|
108
|
+
);
|
|
109
|
+
CREATE INDEX IF NOT EXISTS inspection_history_by_month ON inspection_history(year_month);
|
|
110
|
+
`;
|
|
111
|
+
/**
 * Convert one row of the `inspections` table into an inspection record.
 *
 * `url` and `inspected_at` are always copied. Each optional column is copied
 * to its camelCase field only when the column value is neither null nor
 * undefined; `raw` is additionally JSON-parsed.
 *
 * @param r Row object keyed by snake_case column name.
 * @returns Record object containing only the populated fields.
 */
function rowToRecord(r) {
  const asIs = (value) => value;
  // [column, record field, converter]; order fixes the key order of the result.
  const optionalColumns = [
    ["index_status", "indexStatus", asIs],
    ["last_crawl_time", "lastCrawlTime", asIs],
    ["google_canonical", "googleCanonical", asIs],
    ["user_canonical", "userCanonical", asIs],
    ["coverage_state", "coverageState", asIs],
    ["robots_txt_state", "robotsTxtState", asIs],
    ["indexing_state", "indexingState", asIs],
    ["page_fetch_state", "pageFetchState", asIs],
    ["mobile_usability_verdict", "mobileUsabilityVerdict", asIs],
    ["rich_results_verdict", "richResultsVerdict", asIs],
    ["raw", "raw", (text) => JSON.parse(text)]
  ];
  const record = {
    url: r.url,
    inspectedAt: r.inspected_at
  };
  for (const [column, field, convert] of optionalColumns) {
    const value = r[column];
    if (value === null || value === void 0) continue;
    record[field] = convert(value);
  }
  return record;
}
|
|
129
|
+
/**
 * Pick the history shard name for an inspection record.
 *
 * @param record Record whose `inspectedAt` is matched against `YEAR_MONTH_RE`.
 * @returns `"YYYY-MM"` taken from the start of `inspectedAt`, or `"unknown"`
 *   when the value does not start with a `YYYY-MM-` prefix.
 */
function shardForRecord(record) {
  const match = YEAR_MONTH_RE.exec(record.inspectedAt);
  if (!match) return "unknown";
  const [, year, month] = match;
  return `${year}-${month}`;
}
|
|
133
|
+
function createInspectionStoreSqlite(opts) {
|
|
134
|
+
const ds = opts.dataSource;
|
|
135
|
+
const hash = opts.hash ?? hashUrl;
|
|
136
|
+
async function withDriver(ctx, fn, persist) {
|
|
137
|
+
const key = inspectionSqliteKey(ctx);
|
|
138
|
+
const bytes = await ds.read(key).catch(() => void 0);
|
|
139
|
+
const driver = await opts.openDriver(bytes);
|
|
140
|
+
await driver.exec(INSPECTION_SCHEMA_SQL);
|
|
141
|
+
const result = await fn(driver);
|
|
142
|
+
if (persist) {
|
|
143
|
+
const out = await driver.serialize();
|
|
144
|
+
await ds.write(key, out);
|
|
145
|
+
}
|
|
146
|
+
await driver.close();
|
|
147
|
+
return result;
|
|
148
|
+
}
|
|
149
|
+
return {
|
|
150
|
+
async writeBatch(ctx, records) {
|
|
151
|
+
if (records.length === 0) return;
|
|
152
|
+
await withDriver(ctx, async (driver) => {
|
|
153
|
+
for (const r of records) {
|
|
154
|
+
const h = hash(r.url);
|
|
155
|
+
await driver.run(`INSERT INTO inspections (
|
|
156
|
+
url_hash, url, inspected_at, index_status, last_crawl_time,
|
|
157
|
+
google_canonical, user_canonical, coverage_state, robots_txt_state,
|
|
158
|
+
indexing_state, page_fetch_state, mobile_usability_verdict,
|
|
159
|
+
rich_results_verdict, raw
|
|
160
|
+
) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)
|
|
161
|
+
ON CONFLICT(url_hash) DO UPDATE SET
|
|
162
|
+
url = excluded.url,
|
|
163
|
+
inspected_at = excluded.inspected_at,
|
|
164
|
+
index_status = excluded.index_status,
|
|
165
|
+
last_crawl_time = excluded.last_crawl_time,
|
|
166
|
+
google_canonical = excluded.google_canonical,
|
|
167
|
+
user_canonical = excluded.user_canonical,
|
|
168
|
+
coverage_state = excluded.coverage_state,
|
|
169
|
+
robots_txt_state = excluded.robots_txt_state,
|
|
170
|
+
indexing_state = excluded.indexing_state,
|
|
171
|
+
page_fetch_state = excluded.page_fetch_state,
|
|
172
|
+
mobile_usability_verdict = excluded.mobile_usability_verdict,
|
|
173
|
+
rich_results_verdict = excluded.rich_results_verdict,
|
|
174
|
+
raw = excluded.raw`, [
|
|
175
|
+
h,
|
|
176
|
+
r.url,
|
|
177
|
+
r.inspectedAt,
|
|
178
|
+
r.indexStatus ?? null,
|
|
179
|
+
r.lastCrawlTime ?? null,
|
|
180
|
+
r.googleCanonical ?? null,
|
|
181
|
+
r.userCanonical ?? null,
|
|
182
|
+
r.coverageState ?? null,
|
|
183
|
+
r.robotsTxtState ?? null,
|
|
184
|
+
r.indexingState ?? null,
|
|
185
|
+
r.pageFetchState ?? null,
|
|
186
|
+
r.mobileUsabilityVerdict ?? null,
|
|
187
|
+
r.richResultsVerdict ?? null,
|
|
188
|
+
r.raw === void 0 ? null : JSON.stringify(r.raw)
|
|
189
|
+
]);
|
|
190
|
+
await driver.run(`INSERT OR REPLACE INTO inspection_history
|
|
191
|
+
(year_month, url_hash, url, inspected_at, payload)
|
|
192
|
+
VALUES (?,?,?,?,?)`, [
|
|
193
|
+
shardForRecord(r),
|
|
194
|
+
h,
|
|
195
|
+
r.url,
|
|
196
|
+
r.inspectedAt,
|
|
197
|
+
JSON.stringify(r)
|
|
198
|
+
]);
|
|
199
|
+
}
|
|
200
|
+
}, true);
|
|
201
|
+
},
|
|
202
|
+
async getLatest(ctx, url) {
|
|
203
|
+
return await withDriver(ctx, async (driver) => {
|
|
204
|
+
const rows = await driver.all("SELECT * FROM inspections WHERE url_hash = ? LIMIT 1", [hash(url)]);
|
|
205
|
+
return rows.length === 0 ? void 0 : rowToRecord(rows[0]);
|
|
206
|
+
}, false);
|
|
207
|
+
},
|
|
208
|
+
async loadIndex(ctx) {
|
|
209
|
+
return await withDriver(ctx, async (driver) => {
|
|
210
|
+
const rows = await driver.all("SELECT * FROM inspections", []);
|
|
211
|
+
const records = {};
|
|
212
|
+
for (const r of rows) records[r.url_hash] = rowToRecord(r);
|
|
213
|
+
return {
|
|
214
|
+
version: 1,
|
|
215
|
+
records
|
|
216
|
+
};
|
|
217
|
+
}, false);
|
|
218
|
+
},
|
|
219
|
+
async loadHistory(ctx, yearMonth) {
|
|
220
|
+
return await withDriver(ctx, async (driver) => {
|
|
221
|
+
const rows = await driver.all("SELECT * FROM inspection_history WHERE year_month = ? ORDER BY inspected_at ASC", [yearMonth]);
|
|
222
|
+
if (rows.length === 0) return void 0;
|
|
223
|
+
return {
|
|
224
|
+
version: 1,
|
|
225
|
+
records: rows.map((r) => JSON.parse(r.payload))
|
|
226
|
+
};
|
|
227
|
+
}, false);
|
|
228
|
+
}
|
|
229
|
+
};
|
|
230
|
+
}
|
|
231
|
+
/**
 * Build the storage key of the sitemap index document.
 *
 * @param ctx Tenant context; `userId` is always used, `siteId` only when truthy.
 * @returns `u_<userId>[/<siteId>]/entities/sitemaps/index.json`.
 */
function sitemapIndexKey(ctx) {
  const tenantRoot = ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}` : `u_${ctx.userId}`;
  return `${tenantRoot}/entities/sitemaps/index.json`;
}
|
|
234
|
+
/**
 * Build the storage key of one sitemap history document.
 *
 * @param ctx Tenant context; `userId` is always used, `siteId` only when truthy.
 * @param feedpathHash Hash of the sitemap path, interpolated as-is.
 * @param capturedAtMs Timestamp component of the file name, interpolated as-is.
 * @returns `u_<userId>[/<siteId>]/entities/sitemaps/history/<feedpathHash>__<capturedAtMs>.json`.
 */
function sitemapHistoryKey(ctx, feedpathHash, capturedAtMs) {
  const docFile = `entities/sitemaps/history/${feedpathHash}__${capturedAtMs}.json`;
  if (ctx.siteId) {
    return `u_${ctx.userId}/${ctx.siteId}/${docFile}`;
  }
  return `u_${ctx.userId}/${docFile}`;
}
|
|
237
|
+
/**
 * Create a sitemap store that keeps one JSON index document plus one JSON
 * history document per (sitemap, snapshot run) in `opts.dataSource`.
 *
 * A rejected `dataSource.read` is treated as "document absent"; a JSON parse
 * failure propagates to the caller.
 *
 * @param opts `dataSource` (required), optional `hash` (defaults to `hashUrl`)
 *   and `now` (defaults to `Date.now`).
 * @returns Object with `writeSnapshot`, `loadIndex`, `getLatest`.
 */
function createSitemapStore(opts) {
  const dataSource = opts.dataSource;
  const hashOf = opts.hash ?? hashUrl;
  const clock = opts.now ?? (() => Date.now());

  const blankIndex = () => ({ version: 1, records: {} });
  const decodeJson = (bytes) => JSON.parse(new TextDecoder().decode(bytes));
  const absent = () => void 0;
  // Two-argument `then`: only a failed read maps to undefined, not a failed parse.
  const fetchJson = async (key) => await dataSource.read(key).then(decodeJson, absent);
  const storeJson = async (key, value) => {
    const encoded = new TextEncoder().encode(JSON.stringify(value));
    await dataSource.write(key, encoded);
  };

  return {
    /** Write one history document per record, then the updated index. */
    async writeSnapshot(ctx, records) {
      if (records.length === 0) return;
      const indexKey = sitemapIndexKey(ctx);
      const index = (await fetchJson(indexKey)) ?? blankIndex();
      // One clock reading per call: every history key of this run shares it.
      const stamp = clock();
      for (const record of records) {
        const pathHash = hashOf(record.path);
        index.records[pathHash] = record;
        const historyDoc = {
          version: 1,
          path: record.path,
          capturedAt: record.capturedAt,
          record
        };
        await storeJson(sitemapHistoryKey(ctx, pathHash, stamp), historyDoc);
      }
      await storeJson(indexKey, index);
    },
    /** Whole index, or a fresh empty one when nothing is stored. */
    async loadIndex(ctx) {
      const index = await fetchJson(sitemapIndexKey(ctx));
      return index ?? blankIndex();
    },
    /** Latest record for `path`, or undefined when the index or the entry is missing. */
    async getLatest(ctx, path) {
      const index = await fetchJson(sitemapIndexKey(ctx));
      if (index === null || index === void 0) return void 0;
      return index.records[hashOf(path)];
    }
  };
}
|
|
279
|
+
/**
 * Build the storage key of the indexing metadata index document.
 *
 * @param ctx Tenant context; `userId` is always used, `siteId` only when truthy.
 * @returns `u_<userId>[/<siteId>]/entities/indexing/index.json`.
 */
function indexingMetadataIndexKey(ctx) {
  if (!ctx.siteId) {
    return `u_${ctx.userId}/entities/indexing/index.json`;
  }
  return `u_${ctx.userId}/${ctx.siteId}/entities/indexing/index.json`;
}
|
|
282
|
+
/**
 * Create an indexing metadata store that keeps a single JSON index document
 * in `opts.dataSource`.
 *
 * A rejected `dataSource.read` yields a fresh empty index; a JSON parse
 * failure propagates to the caller.
 *
 * @param opts `dataSource` (required) and optional `hash` (defaults to `hashUrl`).
 * @returns Object with `writeBatch`, `loadIndex`, `getLatest`.
 */
function createIndexingMetadataStore(opts) {
  const dataSource = opts.dataSource;
  const hashOf = opts.hash ?? hashUrl;

  const decodeJson = (bytes) => JSON.parse(new TextDecoder().decode(bytes));
  const blankIndex = () => ({ version: 1, records: {} });
  // Two-argument `then`: only a failed read falls back to the empty index.
  const fetchIndex = async (key) => await dataSource.read(key).then(decodeJson, blankIndex);

  return {
    /** Upsert every record by URL hash and write the index back. */
    async writeBatch(ctx, records) {
      if (records.length === 0) return;
      const key = indexingMetadataIndexKey(ctx);
      const index = await fetchIndex(key);
      for (const record of records) {
        index.records[hashOf(record.url)] = record;
      }
      const encoded = new TextEncoder().encode(JSON.stringify(index));
      await dataSource.write(key, encoded);
    },
    /** Whole index, or a fresh empty one when the read fails. */
    async loadIndex(ctx) {
      return fetchIndex(indexingMetadataIndexKey(ctx));
    },
    /** Latest record for `url`, or undefined when there is no entry. */
    async getLatest(ctx, url) {
      const index = await fetchIndex(indexingMetadataIndexKey(ctx));
      return index.records[hashOf(url)];
    }
  };
}
|
|
307
|
+
function createEmptyTypesStore(opts) {
|
|
308
|
+
const ds = opts.dataSource;
|
|
309
|
+
const now = opts.now ?? (() => Date.now());
|
|
310
|
+
async function readDoc(key) {
|
|
311
|
+
return await ds.read(key).then((bytes) => JSON.parse(new TextDecoder().decode(bytes)), () => ({
|
|
312
|
+
version: 1,
|
|
313
|
+
emptyTypes: [],
|
|
314
|
+
markedAt: {}
|
|
315
|
+
}));
|
|
316
|
+
}
|
|
317
|
+
async function writeDoc(key, doc) {
|
|
318
|
+
await ds.write(key, new TextEncoder().encode(JSON.stringify(doc)));
|
|
319
|
+
}
|
|
320
|
+
return {
|
|
321
|
+
async load(ctx) {
|
|
322
|
+
return readDoc(emptyTypesKey(ctx));
|
|
323
|
+
},
|
|
324
|
+
async mark(ctx, types, at) {
|
|
325
|
+
if (types.length === 0) return readDoc(emptyTypesKey(ctx));
|
|
326
|
+
const key = emptyTypesKey(ctx);
|
|
327
|
+
const doc = await readDoc(key);
|
|
328
|
+
const stamp = at ?? now();
|
|
329
|
+
let changed = false;
|
|
330
|
+
for (const t of types) {
|
|
331
|
+
if (!doc.emptyTypes.includes(t)) {
|
|
332
|
+
doc.emptyTypes.push(t);
|
|
333
|
+
changed = true;
|
|
334
|
+
}
|
|
335
|
+
if (doc.markedAt[t] === void 0) {
|
|
336
|
+
doc.markedAt[t] = stamp;
|
|
337
|
+
changed = true;
|
|
338
|
+
}
|
|
339
|
+
}
|
|
340
|
+
if (changed) {
|
|
341
|
+
doc.emptyTypes.sort();
|
|
342
|
+
await writeDoc(key, doc);
|
|
343
|
+
}
|
|
344
|
+
return doc;
|
|
345
|
+
},
|
|
346
|
+
async clear(ctx, types) {
|
|
347
|
+
if (types.length === 0) return readDoc(emptyTypesKey(ctx));
|
|
348
|
+
const key = emptyTypesKey(ctx);
|
|
349
|
+
const doc = await readDoc(key);
|
|
350
|
+
const drop = new Set(types);
|
|
351
|
+
const before = doc.emptyTypes.length;
|
|
352
|
+
doc.emptyTypes = doc.emptyTypes.filter((t) => !drop.has(t));
|
|
353
|
+
for (const t of drop) delete doc.markedAt[t];
|
|
354
|
+
if (doc.emptyTypes.length !== before) await writeDoc(key, doc);
|
|
355
|
+
return doc;
|
|
356
|
+
}
|
|
357
|
+
};
|
|
358
|
+
}
|
|
359
|
+
export { createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createInspectionStoreSqlite, createSitemapStore, emptyTypesKey, hashUrl, indexingMetadataIndexKey, inspectionHistoryKey, inspectionIndexKey, inspectionSqliteKey, sitemapHistoryKey, sitemapIndexKey };
|