@gscdump/engine 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +53 -0
- package/dist/adapters/duckdb-node.d.mts +19 -0
- package/dist/adapters/duckdb-node.mjs +78 -0
- package/dist/adapters/filesystem.d.mts +206 -0
- package/dist/adapters/filesystem.mjs +320 -0
- package/dist/adapters/http.d.mts +227 -0
- package/dist/adapters/http.mjs +119 -0
- package/dist/adapters/hyparquet.d.mts +107 -0
- package/dist/adapters/hyparquet.mjs +250 -0
- package/dist/adapters/inspection-sqlite-browser.d.mts +9 -0
- package/dist/adapters/inspection-sqlite-browser.mjs +42 -0
- package/dist/adapters/inspection-sqlite-node.d.mts +9 -0
- package/dist/adapters/inspection-sqlite-node.mjs +32 -0
- package/dist/adapters/node-harness.d.mts +334 -0
- package/dist/adapters/node-harness.mjs +1907 -0
- package/dist/adapters/r2-manifest.d.mts +227 -0
- package/dist/adapters/r2-manifest.mjs +355 -0
- package/dist/adapters/r2.d.mts +93 -0
- package/dist/adapters/r2.mjs +65 -0
- package/dist/arrow-utils.d.mts +14 -0
- package/dist/arrow-utils.mjs +8 -0
- package/dist/contracts.d.mts +436 -0
- package/dist/contracts.mjs +1 -0
- package/dist/entities.d.mts +238 -0
- package/dist/entities.mjs +359 -0
- package/dist/index.d.mts +1849 -0
- package/dist/index.mjs +1976 -0
- package/dist/ingest.d.mts +96 -0
- package/dist/ingest.mjs +187 -0
- package/dist/planner.d.mts +16 -0
- package/dist/planner.mjs +321 -0
- package/dist/resolver/index.d.mts +207 -0
- package/dist/resolver/index.mjs +869 -0
- package/dist/rollups.d.mts +207 -0
- package/dist/rollups.mjs +553 -0
- package/dist/schema.d.mts +1258 -0
- package/dist/schema.mjs +139 -0
- package/dist/scope.d.mts +38 -0
- package/dist/scope.mjs +28 -0
- package/dist/snapshot.d.mts +14 -0
- package/dist/snapshot.mjs +1 -0
- package/dist/sql-bind.d.mts +19 -0
- package/dist/sql-bind.mjs +92 -0
- package/dist/sql-fragments.d.mts +21 -0
- package/dist/sql-fragments.mjs +13 -0
- package/package.json +168 -0
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
import { TableName, TableName as TableName$1 } from "gscdump/contracts";
|
|
2
|
+
import { SearchType } from "gscdump/query";
|
|
3
|
+
/**
 * Compaction tier of a manifest entry. Determines which compactor stage may
 * pick it up as input:
 * - `raw`: per-day file produced by `writeDay`. Eligible for raw→d7 merge at 7d.
 * - `d7`: weekly compaction output. Eligible for d7→d30 merge at 30d.
 * - `d30`: monthly compaction output (matches the legacy `monthly/` partition
 *   shape — pre-tier entries are read as `d30`). Eligible for d30→d90 at 90d.
 * - `d90`: quarterly cold-tier output. Terminal; never recompacted.
 *
 * Without an explicit tier, entries written before this field landed default
 * to `raw` for `daily/` partitions and `d30` for `monthly/` partitions, so
 * the tiered compactor picks the right inputs without a backfill rewrite.
 * The legacy mapping is decided from the prefix of the entry's `partition`.
 */
type CompactionTier = 'raw' | 'd7' | 'd30' | 'd90';
|
|
17
|
+
/**
 * One record in the manifest describing a stored data object for a tenant,
 * table and partition.
 */
interface ManifestEntry {
  /** Tenant the entry belongs to. */
  userId: string;
  /** Site the entry belongs to. Optional in the contract; the R2 manifest store rejects entries without it. */
  siteId?: string;
  /** Table the object holds rows for. */
  table: TableName;
  /** Partition label; legacy tier inference looks at its `daily/` / `monthly/` prefix. */
  partition: string;
  /** Key of the underlying data object. The R2 manifest store uses it as the entry's identity within a shard. */
  objectKey: string;
  /** Number of rows in the object (presumably as reported by the writer — confirm). */
  rowCount: number;
  /** Size of the object in bytes (presumably as reported by the writer — confirm). */
  bytes: number;
  /** Creation timestamp; presumably epoch milliseconds — confirm against the writer. */
  createdAt: number;
  /** Set once the entry has been superseded. Entries with this field set are excluded from `listLive`. */
  retiredAt?: number;
  /** Table schema version at write time. Omitted on pre-#27 entries — treat as 1. */
  schemaVersion?: number;
  /**
   * Compaction tier. Omitted on entries written before tiered compaction —
   * treat as `raw` for `daily/` partitions and `d30` for `monthly/` partitions
   * (see {@link inferLegacyTier}).
   */
  tier?: CompactionTier;
  /**
   * GSC search-type this entry covers (web | discover | news | googleNews |
   * image | video). Omitted on entries written before per-type partitioning
   * landed — treat as `web` (see {@link inferSearchType}). Compaction merges
   * only entries with the same searchType.
   */
  searchType?: SearchType;
}
|
|
43
|
+
/** Filter accepted by `ManifestStore.listLive` and `ManifestStore.listAll`. Every optional field narrows the result. */
interface ListLiveFilter {
  /** Tenant to list entries for. */
  userId: string;
  /** When set, only entries of this site match. */
  siteId?: string;
  /** When set, only entries of this table match. */
  table?: TableName;
  /** When set, only entries whose `partition` is one of these values match. */
  partitions?: string[];
  /**
   * Narrow to a single compaction tier. Tier-aware compaction stages set this
   * so the store doesn't have to return (and the caller doesn't have to scan)
   * the entire manifest just to compact the raw cohort. Legacy entries without
   * an explicit `tier` field match on {@link inferLegacyTier}.
   */
  tier?: CompactionTier;
}
|
|
56
|
+
/** Identifies the watermark to bump: one per tenant, site and table. */
interface WatermarkScope {
  /** Tenant the watermark belongs to. */
  userId: string;
  /** Site the watermark belongs to. Optional in the contract; the R2 manifest store throws when it is missing. */
  siteId?: string;
  /** Table the watermark tracks. */
  table: TableName;
}
|
|
61
|
+
/** Sync progress for one {@link WatermarkScope}: the range of dates synced so far and when the last sync happened. */
interface Watermark extends WatermarkScope {
  /** Greatest date passed to `bumpWatermark` so far. Dates are compared as strings, so a sortable format (presumably `YYYY-MM-DD`) is assumed — confirm. */
  newestDateSynced: string;
  /** Smallest date passed to `bumpWatermark` so far (same string comparison). */
  oldestDateSynced: string;
  /** Latest sync timestamp recorded for this scope. */
  lastSyncAt: number;
}
|
|
66
|
+
/** Filter accepted by `ManifestStore.getWatermarks`. */
interface WatermarkFilter {
  /** Tenant to read watermarks for. */
  userId: string;
  /** When set, only watermarks of this site match. */
  siteId?: string;
  /** When set, only watermarks of this table match. */
  table?: TableName;
}
|
|
71
|
+
/** Lifecycle state of a per-day sync record. */
type SyncStateKind = 'pending' | 'inflight' | 'done' | 'failed';
|
|
72
|
+
/** Identifies a single sync-state record: tenant, site, table, date and search type. */
interface SyncStateScope {
  /** Tenant the record belongs to. */
  userId: string;
  /** Site the record belongs to. Optional in the contract; the R2 manifest store throws when it is missing. */
  siteId?: string;
  /** Table being synced. */
  table: TableName;
  /** Date the record covers (format not fixed by this declaration; presumably `YYYY-MM-DD` — confirm). */
  date: string;
  /**
   * GSC search-type this sync state covers. Omitted = `web` (the legacy
   * default; matches pre-#5 sync states stored before per-type sync landed).
   * Lookups must compare via {@link inferSearchType} so a missing field
   * matches an explicit `'web'` and vice versa.
   */
  searchType?: SearchType;
}
|
|
85
|
+
/** Stored sync-state record for one {@link SyncStateScope}. */
interface SyncState extends SyncStateScope {
  /** Current lifecycle state. */
  state: SyncStateKind;
  /** Timestamp of the most recent state change. */
  updatedAt: number;
  /** Attempt counter. In the R2 store it starts at 1 and increases when the record moves into `inflight` from another state. */
  attempts: number;
  /** Last recorded error message. The R2 store clears it when the state becomes `done`. */
  error?: string;
}
|
|
91
|
+
/** Filter accepted by `ManifestStore.getSyncStates`. Every optional field narrows the result. */
interface SyncStateFilter {
  /** Tenant to read sync states for. */
  userId: string;
  /** When set, only records of this site match. */
  siteId?: string;
  /** When set, only records of this table match. */
  table?: TableName;
  /** When set, only records in this state match. */
  state?: SyncStateKind;
  /** When set, only records of this search type match; records without the field count as `web`. */
  searchType?: SearchType;
}
|
|
98
|
+
/** Optional extras passed to `ManifestStore.setSyncState`. */
interface SyncStateDetail {
  /** Timestamp to record as `updatedAt`. The R2 store falls back to its clock when omitted. */
  at?: number;
  /** Error message to store with the record. */
  error?: string;
}
|
|
102
|
+
/** Scope passed to `ManifestStore.withLock`: tenant × table × partition. */
interface LockScope {
  /** Tenant being written. */
  userId: string;
  /** Site being written, when applicable. */
  siteId?: string;
  /** Table being written. */
  table: TableName;
  /** Partition being written. */
  partition: string;
}
|
|
108
|
+
/** Selects what `ManifestStore.purgeTenant` removes. */
interface PurgeFilter {
  /** Tenant to purge. The R2 store throws if this differs from the userId it was created with. */
  userId: string;
  /** When set, only this site's manifest data is purged; otherwise the whole tenant is. */
  siteId?: string;
}
|
|
112
|
+
/** Counts of records removed by `ManifestStore.purgeTenant`. */
interface ManifestPurgeResult {
  /** Number of manifest entries removed. */
  entriesRemoved: number;
  /** Number of watermarks removed. */
  watermarksRemoved: number;
  /** Number of sync-state records removed. */
  syncStatesRemoved: number;
}
|
|
117
|
+
/**
 * Storage contract for manifest entries, watermarks and sync states.
 * Per-method notes marked "R2 store" describe the R2 implementation in this
 * package; other implementations may differ.
 */
interface ManifestStore {
  /** List entries matching the filter. R2 store: entries with `retiredAt` set are skipped. */
  listLive: (filter: ListLiveFilter) => Promise<ManifestEntry[]>;
  /** List entries matching the filter. R2 store: retired entries are included. */
  listAll: (filter: ListLiveFilter) => Promise<ManifestEntry[]>;
  /** Register one new entry and optionally mark the `superseding` entries as retired. */
  registerVersion: (entry: ManifestEntry, superseding?: ManifestEntry[]) => Promise<void>;
  /** Batch form of `registerVersion`. */
  registerVersions: (entries: ManifestEntry[], superseding?: ManifestEntry[]) => Promise<void>;
  /** List retired entries. R2 store: returns entries whose `retiredAt` is less than or equal to `olderThan`. */
  listRetired: (olderThan: number) => Promise<ManifestEntry[]>;
  /** Remove the given entries from the manifest. R2 store: entries are matched by `objectKey`. */
  delete: (entries: ManifestEntry[]) => Promise<void>;
  /** Read watermarks matching the filter. */
  getWatermarks: (filter: WatermarkFilter) => Promise<Watermark[]>;
  /** Widen the scope's watermark to include `date` and record the sync time `at` (R2 store: defaults to its clock). */
  bumpWatermark: (scope: WatermarkScope, date: string, at?: number) => Promise<void>;
  /** Read sync-state records matching the filter. */
  getSyncStates: (filter: SyncStateFilter) => Promise<SyncState[]>;
  /** Create or update the sync-state record identified by `scope`. */
  setSyncState: (scope: SyncStateScope, state: SyncStateKind, detail?: SyncStateDetail) => Promise<void>;
  /**
   * Serialize concurrent writers against the same scope. Held across the
   * write+register window so GC (orphan sweep) won't delete bytes that are
   * midway between `dataSource.write` and `manifestStore.registerVersion`.
   * Scope = tenant × table × partition.
   */
  withLock: <T>(scope: LockScope, fn: () => Promise<T>) => Promise<T>;
  /**
   * GDPR-grade tenant purge. Removes every manifest entry, watermark, and
   * sync-state record matching the filter. Does NOT touch the underlying
   * data-source bytes; callers (typically {@link StorageEngine.purgeTenant})
   * must sweep the tenant prefix separately before invoking this so that
   * mid-flight failures can't leave orphan parquet with no manifest record.
   *
   * On stores with CAS-backed sharding (R2 manifest) this may issue one
   * mutation per shard. On read-only stores (HTTP) this throws.
   */
  purgeTenant: (filter: PurgeFilter) => Promise<ManifestPurgeResult>;
}
|
|
147
|
+
/** Minimal object metadata the manifest store needs from the bucket. */
interface R2ObjectMetadata {
  /** Entity tag of the object; used as the precondition for conditional HEAD writes. */
  etag: string;
}
|
|
150
|
+
/** Object returned by `get`: metadata plus a way to read the body as text. */
interface R2ObjectBody extends R2ObjectMetadata {
  /** Read the object's content as a string. */
  text: () => Promise<string>;
}
|
|
153
|
+
/** One page of a bucket listing. */
interface R2ListResult {
  /** Objects on this page; only the key is needed here. */
  objects: Array<{
    key: string;
  }>;
  /** True when more pages remain. */
  truncated: boolean;
  /** Cursor to pass to the next `list` call when `truncated` is true. */
  cursor?: string;
}
|
|
160
|
+
/** Options for a conditional `put`, used by the manifest store's compare-and-swap on the HEAD pointer. */
interface R2ConditionalPutOptions {
  /**
   * Workers-binding-style precondition. `etagMatches` rejects with `null`
   * return on mismatch; `etagDoesNotMatch: '*'` rejects if the key exists.
   */
  onlyIf?: {
    etagMatches?: string;
    etagDoesNotMatch?: string;
  };
}
|
|
170
|
+
/**
 * Minimal Cloudflare R2 binding shape needed for the manifest CAS loop.
 * Structurally compatible with Cloudflare's `R2Bucket` Workers API.
 */
interface R2ManifestBucketLike {
  /** Fetch an object; resolves to `null` when the key does not exist. */
  get: (key: string) => Promise<R2ObjectBody | null>;
  /** Write an object. With `options.onlyIf`, resolves to `null` when the precondition fails. */
  put: (key: string, bytes: string | Uint8Array, options?: R2ConditionalPutOptions) => Promise<R2ObjectMetadata | null>;
  /** List objects, optionally under a prefix, one page at a time. */
  list: (options?: {
    prefix?: string;
    cursor?: string;
    limit?: number;
  }) => Promise<R2ListResult>;
  /**
   * Bulk delete. Required by {@link ManifestStore.purgeTenant}. Cloudflare's
   * `R2Bucket.delete` accepts a single key or a string[] batch; both shapes
   * work here.
   */
  delete: (keys: string | string[]) => Promise<void>;
}
|
|
189
|
+
/**
 * CAS lifecycle events emitted by the manifest store. Consumers wire these
 * into metrics (prom-client, console.table, the contention harness) to
 * measure rejection rate and latency under real R2 load.
 *
 * - `cas-attempt`: fired before each read–mutate–write try; `attempt` is zero-based.
 * - `cas-rejected`: the conditional HEAD write lost the race; `attempt` is the
 *   zero-based index of the try that was rejected.
 * - `cas-committed`: the HEAD write succeeded; `attempts` is the total number
 *   of tries it took (one-based).
 */
type R2ManifestEvent = {
  kind: 'cas-attempt';
  siteId: string;
  table: TableName$1;
  attempt: number;
} | {
  kind: 'cas-rejected';
  siteId: string;
  table: TableName$1;
  attempt: number;
} | {
  kind: 'cas-committed';
  siteId: string;
  table: TableName$1;
  attempts: number;
};
|
|
210
|
+
/** Options for {@link createR2ManifestStore}. */
interface CreateR2ManifestStoreOptions {
  /** Bucket that holds the manifest shards. */
  bucket: R2ManifestBucketLike;
  /** Tenant scope. All shard keys are prefixed `u_<userId>/manifest/...`. */
  userId: string;
  /** Override the snapshot version-id generator. Defaults to `${ts}-${random}`. */
  newSnapshotId?: () => string;
  /** Clock override used for default timestamps. Defaults to `Date.now()`. */
  now?: () => number;
  /**
   * Maximum number of CAS attempts per mutation before giving up with an
   * error. Counts the first try as well, so the mutation is tried at most
   * `maxRetries` times in total. Defaults to 8.
   */
  maxRetries?: number;
  /**
   * Optional telemetry hook. Fired synchronously from the CAS loop on each
   * attempt, rejection, and successful commit. Must not throw; exceptions
   * propagate and will fail the mutation.
   */
  onEvent?: (event: R2ManifestEvent) => void;
}
|
|
226
|
+
/**
 * Create a {@link ManifestStore} that keeps its state in the given R2-style
 * bucket, scoped to `opts.userId`.
 */
declare function createR2ManifestStore(opts: CreateR2ManifestStoreOptions): ManifestStore;
|
|
227
|
+
export { CreateR2ManifestStoreOptions, R2ManifestBucketLike, R2ManifestEvent, createR2ManifestStore };
|
|
@@ -0,0 +1,355 @@
|
|
|
1
|
+
/**
 * Resolve the search type of a record.
 *
 * @param entry Any object that may carry a `searchType` field.
 * @returns The record's `searchType`, or `"web"` when the field is `null` or `undefined`.
 */
function inferSearchType(entry) {
  const declared = entry.searchType;
  if (declared === null || declared === undefined) return "web";
  return declared;
}
|
|
4
|
+
/**
 * Resolve the compaction tier of a manifest entry.
 *
 * An explicit `tier` always wins. Entries written before the field existed
 * are classified by partition prefix: `monthly/` is `d30`, everything else
 * (including `daily/`) is `raw`.
 *
 * Previously an entry with neither a tier nor a recognised prefix fell off
 * the end of the function and yielded `undefined`, so it matched no tier
 * filter at all. It is now treated as the lowest tier.
 *
 * @param entry Object with an optional `tier` and a string `partition`.
 * @returns One of `"raw"`, `"d7"`, `"d30"`, `"d90"`.
 */
function inferLegacyTier(entry) {
  if (entry.tier !== void 0) return entry.tier;
  if (entry.partition.startsWith("monthly/")) return "d30";
  // `daily/` partitions and any unrecognised legacy shape are raw input.
  return "raw";
}
|
|
9
|
+
// Matches a shard HEAD pointer key of the form `u_<userId>/manifest/<siteId>/<table>/HEAD`
// and captures the `siteId` and `table` path segments as named groups.
const SHARD_RE = /^u_[^/]+\/manifest\/(?<siteId>[^/]+)\/(?<table>[^/]+)\/HEAD$/;
|
|
10
|
+
/**
 * Default snapshot id generator: the current epoch-millisecond timestamp,
 * a dash, then up to eight base-36 characters taken from `Math.random()`.
 *
 * @returns A string such as `1700000000000-k3j9x0ab`.
 */
function defaultSnapshotId() {
  const stamp = Date.now();
  const randomPart = Math.random().toString(36).slice(2, 10);
  return "".concat(stamp, "-", randomPart);
}
|
|
13
|
+
/**
 * Build the key prefix under which one shard (tenant, site, table) keeps its
 * HEAD pointer and snapshot objects.
 *
 * @returns `u_<userId>/manifest/<siteId>/<table>/` (with a trailing slash).
 */
function shardPrefix(userId, siteId, table) {
  return "u_".concat(userId, "/manifest/", siteId, "/", table, "/");
}
|
|
16
|
+
/**
 * Key of the shard's HEAD pointer object: the shard prefix followed by `HEAD`.
 */
function headKey(userId, siteId, table) {
  const prefix = shardPrefix(userId, siteId, table);
  return "".concat(prefix, "HEAD");
}
|
|
19
|
+
/**
 * Key of one snapshot object inside a shard: `<shard prefix>v<snapshotId>.json`.
 */
function snapshotKey(userId, siteId, table, snapshotId) {
  const prefix = shardPrefix(userId, siteId, table);
  return "".concat(prefix, "v", snapshotId, ".json");
}
|
|
22
|
+
/**
 * Create a fresh version-1 snapshot with no entries, watermarks or sync states.
 * Every call returns new arrays, so callers may mutate the result freely.
 */
function emptySnapshot() {
  const entries = [];
  const watermarks = [];
  const syncStates = [];
  return { version: 1, entries, watermarks, syncStates };
}
|
|
30
|
+
/**
 * Collect the distinct shards touched by a list of entries.
 *
 * @param entries Iterable of objects carrying `siteId` and `table`.
 * @returns A Set of `<siteId>\0<table>` strings, one per distinct shard.
 * @throws Error when any entry has no `siteId`, since such an entry cannot
 *   be assigned to a shard.
 */
function shardScopesFromEntries(entries) {
  const scopes = /* @__PURE__ */ new Set();
  for (const entry of entries) {
    if (entry.siteId === undefined) {
      throw new Error("R2 manifest store requires entries to carry siteId; cross-site entries are unshardable");
    }
    scopes.add("".concat(entry.siteId, "\0", entry.table));
  }
  return scopes;
}
|
|
38
|
+
/**
 * Decide whether a manifest entry satisfies a list filter.
 *
 * Each filter field is ignored when unset. `partitions` matches when the
 * entry's partition is in the list; `tier` is compared against the entry's
 * resolved tier (explicit or legacy-inferred). Checks run in order and stop
 * at the first failure, so the tier is only resolved when the earlier checks
 * have passed.
 *
 * @returns `true` when every set filter field matches.
 */
function matchesEntryFilter(entry, filter) {
  const siteMatches = () => filter.siteId === undefined || entry.siteId === filter.siteId;
  const tableMatches = () => filter.table === undefined || entry.table === filter.table;
  const partitionMatches = () => !filter.partitions || filter.partitions.includes(entry.partition);
  const tierMatches = () => filter.tier === undefined || inferLegacyTier(entry) === filter.tier;
  return siteMatches() && tableMatches() && partitionMatches() && tierMatches();
}
|
|
45
|
+
/**
 * Decide whether a watermark satisfies a filter. `siteId` and `table` are
 * each ignored when unset on the filter.
 *
 * @returns `true` when every set filter field equals the watermark's field.
 */
function matchesWatermarkFilter(w, filter) {
  const siteMatches = filter.siteId === undefined || w.siteId === filter.siteId;
  const tableMatches = filter.table === undefined || w.table === filter.table;
  return siteMatches && tableMatches;
}
|
|
50
|
+
/**
 * Decide whether a sync-state record satisfies a filter.
 *
 * `siteId`, `table`, `state` and `searchType` are each ignored when unset on
 * the filter. The search-type comparison uses the record's resolved search
 * type, so a record without the field matches a `web` filter. Checks stop
 * at the first failure.
 *
 * @returns `true` when every set filter field matches.
 */
function matchesSyncStateFilter(s, filter) {
  const siteMatches = () => filter.siteId === undefined || s.siteId === filter.siteId;
  const tableMatches = () => filter.table === undefined || s.table === filter.table;
  const stateMatches = () => filter.state === undefined || s.state === filter.state;
  const searchTypeMatches = () => filter.searchType === undefined || inferSearchType(s) === filter.searchType;
  return siteMatches() && tableMatches() && stateMatches() && searchTypeMatches();
}
|
|
57
|
+
/**
 * Create a manifest store that keeps its state in an R2-style bucket.
 *
 * Layout: every (siteId, table) pair is a shard under
 * `u_<userId>/manifest/<siteId>/<table>/`. A shard has a `HEAD` object whose
 * text is the id of the current snapshot, and `v<id>.json` snapshot objects
 * holding `{ version, entries, watermarks, syncStates }`.
 *
 * Mutations are optimistic: read the current snapshot, apply the change,
 * write a new snapshot object, then swing `HEAD` with a conditional put on
 * the etag that was read. A rejected put restarts the cycle, up to
 * `maxRetries` attempts in total.
 *
 * @param opts See `CreateR2ManifestStoreOptions`: `bucket`, `userId`, and the
 *   optional `newSnapshotId`, `now`, `maxRetries` (default 8) and `onEvent`.
 * @returns An object implementing the manifest-store methods.
 */
function createR2ManifestStore(opts) {
  const { bucket, userId } = opts;
  const newSnapshotId = opts.newSnapshotId ?? defaultSnapshotId;
  const now = opts.now ?? (() => Date.now());
  const maxRetries = opts.maxRetries ?? 8;
  const onEvent = opts.onEvent;

  /**
   * Load a shard's current snapshot together with the HEAD etag it was read at.
   * A missing HEAD yields an empty snapshot and no etag. A blank HEAD, or a
   * HEAD that points at a snapshot object that does not exist, yields an
   * empty snapshot with the HEAD's etag.
   * Throws when the snapshot's `version` is not 1.
   */
  async function readShard(siteId, table) {
    const head = await bucket.get(headKey(userId, siteId, table));
    if (!head) return {
      snapshot: emptySnapshot(),
      headEtag: void 0
    };
    const snapshotId = (await head.text()).trim();
    if (!snapshotId) return {
      snapshot: emptySnapshot(),
      headEtag: head.etag
    };
    const snap = await bucket.get(snapshotKey(userId, siteId, table, snapshotId));
    // NOTE(review): a dangling HEAD is read as an empty shard, so the next
    // mutation would commit a snapshot without the previous contents.
    if (!snap) return {
      snapshot: emptySnapshot(),
      headEtag: head.etag
    };
    const parsed = JSON.parse(await snap.text());
    if (parsed.version !== 1) throw new Error(`unsupported manifest snapshot version: ${parsed.version}`);
    return {
      snapshot: parsed,
      headEtag: head.etag
    };
  }

  /**
   * Write `snapshot` as a new snapshot object, then try to point HEAD at it.
   * With a known etag the HEAD put requires that etag to still match;
   * without one it requires that HEAD does not exist yet.
   * Resolves to `{ ok }`, where `ok` is false when the conditional put was rejected.
   */
  async function writeShard(siteId, table, snapshot, headEtag) {
    const id = newSnapshotId();
    const snapKey = snapshotKey(userId, siteId, table, id);
    await bucket.put(snapKey, JSON.stringify(snapshot));
    const conditional = headEtag ? { onlyIf: { etagMatches: headEtag } } : { onlyIf: { etagDoesNotMatch: "*" } };
    return { ok: await bucket.put(headKey(userId, siteId, table), id, conditional) !== null };
  }

  /**
   * Run the read → mutate → conditional-write cycle for one shard until it
   * commits. `mutate` receives the freshly read snapshot, edits it in place,
   * and is re-run on every attempt. Emits `cas-attempt` / `cas-rejected`
   * (zero-based `attempt`) and `cas-committed` (one-based `attempts`).
   * Throws once `maxRetries` attempts have all been rejected.
   */
  async function mutateShard(siteId, table, mutate) {
    let attempt = 0;
    while (attempt < maxRetries) {
      onEvent?.({
        kind: "cas-attempt",
        siteId,
        table,
        attempt
      });
      const { snapshot, headEtag } = await readShard(siteId, table);
      await mutate(snapshot);
      const { ok } = await writeShard(siteId, table, snapshot, headEtag);
      if (ok) {
        onEvent?.({
          kind: "cas-committed",
          siteId,
          table,
          attempts: attempt + 1
        });
        return;
      }
      onEvent?.({
        kind: "cas-rejected",
        siteId,
        table,
        attempt
      });
      attempt++;
    }
    throw new Error(`R2 manifest CAS exceeded ${maxRetries} retries for ${siteId}/${table}`);
  }

  /** Enumerate every shard of this tenant by paging through the bucket listing and keeping HEAD keys. */
  async function listShards() {
    const shards = [];
    let cursor;
    do {
      const res = await bucket.list({
        prefix: `u_${userId}/manifest/`,
        cursor,
        limit: 1e3
      });
      for (const obj of res.objects) {
        const m = SHARD_RE.exec(obj.key);
        if (m?.groups) shards.push({
          siteId: m.groups.siteId,
          table: m.groups.table
        });
      }
      cursor = res.truncated ? res.cursor : void 0;
    } while (cursor);
    return shards;
  }

  /**
   * Resolve the shards a filter can touch. When both `siteId` and `table`
   * are given the shard is addressed directly without listing; otherwise
   * the tenant's shards are listed and narrowed by whichever field is set.
   */
  async function shardsForFilter(filter) {
    if (filter.siteId !== void 0 && filter.table !== void 0) return [{
      siteId: filter.siteId,
      table: filter.table
    }];
    return (await listShards()).filter((s) => (filter.siteId === void 0 || s.siteId === filter.siteId) && (filter.table === void 0 || s.table === filter.table));
  }

  /** Read the matching shards one after another and collect entries that pass the filter. */
  async function readEntriesAcrossShards(filter, includeRetired) {
    const shards = await shardsForFilter(filter);
    const all = [];
    for (const { siteId, table } of shards) {
      const { snapshot } = await readShard(siteId, table);
      for (const entry of snapshot.entries) {
        if (!includeRetired && entry.retiredAt !== void 0) continue;
        if (matchesEntryFilter(entry, filter)) all.push(entry);
      }
    }
    return all;
  }

  /** Group entries by shard, keyed `<siteId>\0<table>`. */
  function groupBySiteTable(entries) {
    const out = /* @__PURE__ */ new Map();
    for (const e of entries) {
      const key = `${e.siteId}\0${e.table}`;
      if (!out.has(key)) out.set(key, []);
      out.get(key).push(e);
    }
    return out;
  }

  /**
   * Register new entries and retire superseded ones, one CAS mutation per
   * affected shard. Superseded entries that are still live get `retiredAt`
   * set to the first new entry's `createdAt` (or the clock when there are
   * no new entries). New entries replace any existing entry with the same
   * `objectKey`. All entries are assigned to shards (and validated for
   * `siteId`) before the first mutation is issued.
   */
  async function registerVersionsImpl(newEntries, superseding) {
    if (newEntries.length === 0 && (!superseding || superseding.length === 0)) return;
    const supersededAt = newEntries[0]?.createdAt ?? now();
    const byShard = /* @__PURE__ */ new Map();
    // Named so it does not shadow the outer `bucket` binding.
    function assignToShard(entry, kind) {
      if (entry.siteId === void 0) throw new Error("R2 manifest store requires entries to carry siteId");
      const key = `${entry.siteId}\0${entry.table}`;
      let bag = byShard.get(key);
      if (!bag) {
        bag = {
          newEntries: [],
          superseding: []
        };
        byShard.set(key, bag);
      }
      if (kind === "new") bag.newEntries.push(entry);
      else bag.superseding.push(entry);
    }
    for (const e of newEntries) assignToShard(e, "new");
    if (superseding) for (const e of superseding) assignToShard(e, "super");
    for (const [shardKey, { newEntries: news, superseding: supers }] of byShard) {
      const [siteId, table] = shardKey.split("\0");
      await mutateShard(siteId, table, (snap) => {
        const byObjectKey = new Map(snap.entries.map((e) => [e.objectKey, e]));
        for (const s of supers) {
          const existing = byObjectKey.get(s.objectKey);
          if (existing && existing.retiredAt === void 0) byObjectKey.set(s.objectKey, {
            ...existing,
            retiredAt: supersededAt
          });
        }
        for (const n of news) byObjectKey.set(n.objectKey, n);
        snap.entries = Array.from(byObjectKey.values());
      });
    }
  }

  return {
    /** List entries matching the filter, skipping retired ones. */
    async listLive(filter) {
      return readEntriesAcrossShards(filter, false);
    },
    /** List entries matching the filter, retired ones included. */
    async listAll(filter) {
      return readEntriesAcrossShards(filter, true);
    },
    /** Register a single entry, optionally retiring `superseding`. */
    async registerVersion(entry, superseding) {
      return registerVersionsImpl([entry], superseding);
    },
    /** Register several entries, optionally retiring `superseding`. Rejects entries without `siteId` up front. */
    async registerVersions(entries, superseding) {
      shardScopesFromEntries(entries);
      return registerVersionsImpl(entries, superseding);
    },
    /** Return every entry, across all of the tenant's shards, whose `retiredAt` is set and `<= olderThan`. */
    async listRetired(olderThan) {
      const shards = await listShards();
      const out = [];
      for (const { siteId, table } of shards) {
        const { snapshot } = await readShard(siteId, table);
        for (const e of snapshot.entries) if (e.retiredAt !== void 0 && e.retiredAt <= olderThan) out.push(e);
      }
      return out;
    },
    /**
     * Drop the given entries (matched by `objectKey`) from their shards,
     * one CAS mutation per shard.
     * Throws before touching the bucket when any entry lacks `siteId`.
     */
    async delete(toDelete) {
      // Without this check an entry lacking siteId would be routed to a shard
      // literally named "undefined", and the mutation would create a HEAD for it.
      shardScopesFromEntries(toDelete);
      const grouped = groupBySiteTable(toDelete);
      for (const [shardKey, entries] of grouped) {
        const [siteId, table] = shardKey.split("\0");
        await mutateShard(siteId, table, (snap) => {
          const drop = new Set(entries.map((e) => e.objectKey));
          snap.entries = snap.entries.filter((e) => !drop.has(e.objectKey));
        });
      }
    },
    /** Read watermarks from the shards selected by the filter. */
    async getWatermarks(filter) {
      const shards = await shardsForFilter(filter);
      const out = [];
      for (const { siteId, table } of shards) {
        const { snapshot } = await readShard(siteId, table);
        for (const w of snapshot.watermarks) if (matchesWatermarkFilter(w, filter)) out.push(w);
      }
      return out;
    },
    /**
     * Create the scope's watermark, or widen it so that `date` falls inside
     * `[oldestDateSynced, newestDateSynced]`. Dates are compared as strings.
     * `lastSyncAt` only ever moves forward. Throws when `scope.siteId` is missing.
     */
    async bumpWatermark(scope, date, at) {
      if (scope.siteId === void 0) throw new Error("R2 manifest store requires watermarks to carry siteId");
      const ts = at ?? now();
      await mutateShard(scope.siteId, scope.table, (snap) => {
        const idx = snap.watermarks.findIndex((w) => w.userId === userId && w.siteId === scope.siteId && w.table === scope.table);
        if (idx === -1) {
          snap.watermarks.push({
            userId,
            siteId: scope.siteId,
            table: scope.table,
            newestDateSynced: date,
            oldestDateSynced: date,
            lastSyncAt: ts
          });
          return;
        }
        const w = snap.watermarks[idx];
        const newest = date > w.newestDateSynced ? date : w.newestDateSynced;
        const oldest = date < w.oldestDateSynced ? date : w.oldestDateSynced;
        const lastSyncAt = ts > w.lastSyncAt ? ts : w.lastSyncAt;
        snap.watermarks[idx] = {
          ...w,
          newestDateSynced: newest,
          oldestDateSynced: oldest,
          lastSyncAt
        };
      });
    },
    /** Read sync-state records from the shards selected by the filter. */
    async getSyncStates(filter) {
      const shards = await shardsForFilter(filter);
      const out = [];
      for (const { siteId, table } of shards) {
        const { snapshot } = await readShard(siteId, table);
        for (const s of snapshot.syncStates) if (matchesSyncStateFilter(s, filter)) out.push(s);
      }
      return out;
    },
    /**
     * Create or update the sync-state record for `scope`. Records are matched
     * on site, table, date and resolved search type, so a missing `searchType`
     * matches `web`. A new record starts with `attempts: 1`. On update,
     * `attempts` grows only when moving into `inflight` from another state,
     * and `error` is cleared on `done`, otherwise replaced by `detail.error`
     * or carried over. Throws when `scope.siteId` is missing.
     */
    async setSyncState(scope, state, detail) {
      if (scope.siteId === void 0) throw new Error("R2 manifest store requires sync states to carry siteId");
      const at = detail?.at ?? now();
      const scopeSearchType = inferSearchType(scope);
      await mutateShard(scope.siteId, scope.table, (snap) => {
        const idx = snap.syncStates.findIndex((s) => s.userId === userId && s.siteId === scope.siteId && s.table === scope.table && s.date === scope.date && inferSearchType(s) === scopeSearchType);
        if (idx === -1) {
          snap.syncStates.push({
            userId,
            siteId: scope.siteId,
            table: scope.table,
            date: scope.date,
            state,
            updatedAt: at,
            attempts: 1,
            error: detail?.error,
            // Only persist searchType when the caller supplied one, keeping legacy records shape-compatible.
            ...scope.searchType !== void 0 ? { searchType: scope.searchType } : {}
          });
          return;
        }
        const prev = snap.syncStates[idx];
        const attempts = state === "inflight" && prev.state !== "inflight" ? prev.attempts + 1 : prev.attempts;
        const error = state === "done" ? void 0 : detail?.error ?? prev.error;
        snap.syncStates[idx] = {
          ...prev,
          state,
          updatedAt: at,
          attempts,
          error
        };
      });
    },
    /** Runs `fn` directly; this store takes no lock of its own. */
    async withLock(_scope, fn) {
      return fn();
    },
    /**
     * Remove manifest data for the tenant (or for one site when
     * `filter.siteId` is set). For each matching shard, counts the records in
     * its current snapshot, then lists and deletes every object under the
     * shard prefix (HEAD and all snapshots).
     * Throws when `filter.userId` differs from this store's userId.
     */
    async purgeTenant(filter) {
      if (filter.userId !== userId) throw new Error(`purgeTenant: store is scoped to userId=${userId}, got ${filter.userId}`);
      const shards = await shardsForFilter({ siteId: filter.siteId });
      let entriesRemoved = 0;
      let watermarksRemoved = 0;
      let syncStatesRemoved = 0;
      for (const { siteId, table } of shards) {
        const { snapshot } = await readShard(siteId, table);
        entriesRemoved += snapshot.entries.length;
        watermarksRemoved += snapshot.watermarks.length;
        syncStatesRemoved += snapshot.syncStates.length;
        const prefix = shardPrefix(userId, siteId, table);
        const keys = [];
        let cursor;
        do {
          const res = await bucket.list({
            prefix,
            cursor,
            limit: 1e3
          });
          for (const obj of res.objects) keys.push(obj.key);
          cursor = res.truncated ? res.cursor : void 0;
        } while (cursor);
        if (keys.length > 0) await bucket.delete(keys);
      }
      return {
        entriesRemoved,
        watermarksRemoved,
        syncStatesRemoved
      };
    }
  };
}
|
|
355
|
+
export { createR2ManifestStore };
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
/** Byte-level storage backend contract: read, write, delete and list objects by key. */
interface DataSource {
  /**
   * Read the bytes stored at `key`. `range` presumably restricts the read to
   * `length` bytes starting at `offset`, and `signal` presumably allows the
   * caller to abort — confirm against the implementations.
   */
  read: (key: string, range?: {
    offset: number;
    length: number;
  }, signal?: AbortSignal) => Promise<Uint8Array>;
  /** Store `bytes` at `key`. */
  write: (key: string, bytes: Uint8Array) => Promise<void>;
  /** Delete the given keys. */
  delete: (keys: string[]) => Promise<void>;
  /**
   * One-shot listing under a prefix. Implementations may cap the number of
   * returned keys (typically 10k) — callers iterating full tenant space
   * should prefer `streamList` when available or narrow the prefix.
   */
  list: (prefix: string) => Promise<string[]>;
  /**
   * Per-key URI probe. Returns a URI string DuckDB's `httpfs` (or an
   * equivalent engine that fetches its own I/O) can read directly, or
   * `undefined` if the key isn't URI-resolvable on this backend and the
   * caller must fall back to `read(key)` for the bytes.
   *
   * Contracts:
   * - When defined, the returned URI MUST yield byte-identical content to
   *   `read(key)`. Callers rely on this for correctness.
   * - Backends with a native URI for every key (filesystem: absolute path,
   *   R2 via `httpfs`: signed URL) may always return a string.
   * - Backends without a native URI shape (in-memory) omit the method or
   *   return `undefined` per call.
   * - Mixed-per-query is allowed: some keys in one query may return a URI,
   *   others may not; the executor branches per key.
   */
  uri?: (key: string) => string | undefined;
  /**
   * Optional — probe the byte size of a key without reading it. Used by
   * the engine to fill in `WriteResult.bytes` when a codec reports 0 or
   * unknown but the file is non-trivial.
   */
  head?: (key: string) => Promise<{
    bytes: number;
  } | undefined>;
  /**
   * Optional streaming variant of `list`. Implementations that page
   * backing-store results (R2, S3) should implement this and yield keys
   * lazily. `list` may return up to an adapter-defined cap (typically
   * 10k keys); callers iterating full tenant space must prefer
   * `streamList` when available, or chunk by narrower prefixes.
   */
  streamList?: (prefix: string) => AsyncIterable<string>;
}
|
|
48
|
+
/** Options accepted by `R2BucketLike.get`. */
interface R2GetOptions {
  /** Optional byte range to fetch, given as a start offset and a length. */
  range?: {
    offset: number;
    length: number;
  };
}
|
|
54
|
+
/** Object returned by `R2BucketLike.get`; only binary body access is required here. */
interface R2ObjectBody {
  /** Read the object's content as an `ArrayBuffer`. */
  arrayBuffer: () => Promise<ArrayBuffer>;
}
|
|
57
|
+
/** Metadata returned by `R2BucketLike.head`. */
interface R2HeadObject {
  /** Object size (presumably in bytes — confirm against the R2 binding). */
  size: number;
}
|
|
60
|
+
/** One page of a bucket listing. */
interface R2ListResult {
  /** Objects on this page; only the key is needed here. */
  objects: Array<{
    key: string;
  }>;
  /** True when more pages remain. */
  truncated: boolean;
  /** Cursor for fetching the next page, when there is one. */
  cursor?: string;
}
|
|
67
|
+
/** Options accepted by `R2BucketLike.list`. */
interface R2ListOptions {
  /** Only list keys starting with this prefix. */
  prefix?: string;
  /** Continue a previous listing from this cursor. */
  cursor?: string;
  /** Maximum number of objects to return per page. */
  limit?: number;
}
|
|
72
|
+
/** Structural subset of an R2 bucket binding used by the R2 data source. */
interface R2BucketLike {
  /** Fetch an object, optionally a byte range of it; resolves to `null` when the key does not exist. */
  get: (key: string, options?: R2GetOptions) => Promise<R2ObjectBody | null>;
  /** Store bytes at a key. The resolved value is not used by this contract. */
  put: (key: string, bytes: Uint8Array) => Promise<unknown>;
  /** Delete one key or a batch of keys. */
  delete: (keys: string | string[]) => Promise<void>;
  /** List objects one page at a time. */
  list: (options?: R2ListOptions) => Promise<R2ListResult>;
  /** Fetch object metadata without the body; resolves to `null` when the key does not exist. */
  head: (key: string) => Promise<R2HeadObject | null>;
}
|
|
79
|
+
/** Options for {@link createR2DataSource}. */
interface R2DataSourceOptions {
  /**
   * R2 bucket binding. Structurally typed — pass a Cloudflare `R2Bucket` or
   * any object with the same get/put/delete/list/head surface.
   */
  bucket: R2BucketLike;
  /**
   * When set, `uri(key)` returns `r2://{bucketName}/{key}` for DuckDB httpfs
   * reads. Omit if the backing DuckDB can't resolve `r2://` URIs and the
   * caller should fall back to `read(key)` for bytes.
   */
  bucketName?: string;
}
|
|
92
|
+
/** Create a {@link DataSource} on top of an R2-style bucket binding. */
declare function createR2DataSource(options: R2DataSourceOptions): DataSource;
|
|
93
|
+
export { R2DataSourceOptions, createR2DataSource };
|