@gscdump/engine 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +53 -0
- package/dist/adapters/duckdb-node.d.mts +19 -0
- package/dist/adapters/duckdb-node.mjs +78 -0
- package/dist/adapters/filesystem.d.mts +206 -0
- package/dist/adapters/filesystem.mjs +320 -0
- package/dist/adapters/http.d.mts +227 -0
- package/dist/adapters/http.mjs +119 -0
- package/dist/adapters/hyparquet.d.mts +107 -0
- package/dist/adapters/hyparquet.mjs +250 -0
- package/dist/adapters/inspection-sqlite-browser.d.mts +9 -0
- package/dist/adapters/inspection-sqlite-browser.mjs +42 -0
- package/dist/adapters/inspection-sqlite-node.d.mts +9 -0
- package/dist/adapters/inspection-sqlite-node.mjs +32 -0
- package/dist/adapters/node-harness.d.mts +334 -0
- package/dist/adapters/node-harness.mjs +1907 -0
- package/dist/adapters/r2-manifest.d.mts +227 -0
- package/dist/adapters/r2-manifest.mjs +355 -0
- package/dist/adapters/r2.d.mts +93 -0
- package/dist/adapters/r2.mjs +65 -0
- package/dist/arrow-utils.d.mts +14 -0
- package/dist/arrow-utils.mjs +8 -0
- package/dist/contracts.d.mts +436 -0
- package/dist/contracts.mjs +1 -0
- package/dist/entities.d.mts +238 -0
- package/dist/entities.mjs +359 -0
- package/dist/index.d.mts +1849 -0
- package/dist/index.mjs +1976 -0
- package/dist/ingest.d.mts +96 -0
- package/dist/ingest.mjs +187 -0
- package/dist/planner.d.mts +16 -0
- package/dist/planner.mjs +321 -0
- package/dist/resolver/index.d.mts +207 -0
- package/dist/resolver/index.mjs +869 -0
- package/dist/rollups.d.mts +207 -0
- package/dist/rollups.mjs +553 -0
- package/dist/schema.d.mts +1258 -0
- package/dist/schema.mjs +139 -0
- package/dist/scope.d.mts +38 -0
- package/dist/scope.mjs +28 -0
- package/dist/snapshot.d.mts +14 -0
- package/dist/snapshot.mjs +1 -0
- package/dist/sql-bind.d.mts +19 -0
- package/dist/sql-bind.mjs +92 -0
- package/dist/sql-fragments.d.mts +21 -0
- package/dist/sql-fragments.mjs +13 -0
- package/package.json +168 -0
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
import { TableName } from "gscdump/contracts";
|
|
2
|
+
import { SearchType } from "gscdump/query";
|
|
3
|
+
/**
 * Compaction tier recorded on a manifest entry. The tier decides which
 * compactor stage is allowed to consume the entry as input:
 * - `raw`: per-day file written by `writeDay`; eligible for the raw→d7 merge at 7d.
 * - `d7`: weekly compaction output; eligible for the d7→d30 merge at 30d.
 * - `d30`: monthly compaction output, the same shape as the legacy `monthly/`
 *   partition (pre-tier entries are read as `d30`); eligible for d30→d90 at 90d.
 * - `d90`: quarterly cold-tier output; terminal, never recompacted.
 *
 * Entries written before this field existed carry no tier. They default to
 * `raw` under `daily/` partitions and to `d30` under `monthly/` partitions,
 * so the tiered compactor selects the right inputs without a backfill rewrite.
 */
type CompactionTier =
  | 'raw'
  | 'd7'
  | 'd30'
  | 'd90';
|
|
17
|
+
/** One object (file) tracked by the manifest, together with its bookkeeping metadata. */
interface ManifestEntry {
  userId: string;
  siteId?: string;
  table: TableName;
  /** Partition label; legacy tier inference looks at its `daily/` / `monthly/` prefix. */
  partition: string;
  objectKey: string;
  rowCount: number;
  bytes: number;
  createdAt: number;
  /** Present once the entry has been retired; absent on live entries. */
  retiredAt?: number;
  /** Schema version of the table when the entry was written. Missing on pre-#27 entries — treat as 1. */
  schemaVersion?: number;
  /**
   * Compaction tier. Missing on entries that predate tiered compaction —
   * treat as `raw` for `daily/` partitions and `d30` for `monthly/`
   * partitions (see {@link inferLegacyTier}).
   */
  tier?: CompactionTier;
  /**
   * GSC search-type covered by this entry (web | discover | news |
   * googleNews | image | video). Missing on entries that predate per-type
   * partitioning — treat as `web` (see {@link inferSearchType}). Compaction
   * only merges entries that share the same searchType.
   */
  searchType?: SearchType;
}
|
|
43
|
+
/** Selection criteria for manifest listings; only `userId` is mandatory. */
interface ListLiveFilter {
  userId: string;
  siteId?: string;
  table?: TableName;
  partitions?: string[];
  /**
   * Restrict the result to one compaction tier. Tier-aware compaction
   * stages pass this so the store need not return — and the caller need not
   * scan — the whole manifest just to compact the raw cohort. Legacy entries
   * with no explicit `tier` are matched through {@link inferLegacyTier}.
   */
  tier?: CompactionTier;
}
|
|
56
|
+
/** Byte-level storage backend addressed by string keys. */
interface DataSource {
  /** Read an object, optionally limited to the byte window `range`. */
  read: (key: string, range?: { offset: number; length: number }, signal?: AbortSignal) => Promise<Uint8Array>;
  write: (key: string, bytes: Uint8Array) => Promise<void>;
  delete: (keys: string[]) => Promise<void>;
  /**
   * One-shot listing of the keys under a prefix. Implementations may cap
   * how many keys they return (typically 10k), so callers walking the full
   * tenant space should prefer `streamList` when it exists, or narrow the
   * prefix.
   */
  list: (prefix: string) => Promise<string[]>;
  /**
   * Per-key URI probe. Yields a URI string that DuckDB's `httpfs` (or an
   * equivalent engine doing its own I/O) can read directly, or `undefined`
   * when the key is not URI-resolvable on this backend and the caller must
   * fall back to `read(key)` for the bytes.
   *
   * Contracts:
   * - When defined, the returned URI MUST yield content byte-identical to
   *   `read(key)`. Callers rely on this for correctness.
   * - Backends with a native URI for every key (filesystem: absolute path,
   *   R2 via `httpfs`: signed URL) may always return a string.
   * - Backends with no native URI shape (in-memory) omit the method or
   *   return `undefined` per call.
   * - Mixed-per-query is allowed: within one query some keys may resolve to
   *   a URI while others do not; the executor branches per key.
   */
  uri?: (key: string) => string | undefined;
  /**
   * Optional — report the byte size of a key without reading it. The engine
   * uses it to fill in `WriteResult.bytes` when a codec reports 0 or unknown
   * although the file is non-trivial.
   */
  head?: (key: string) => Promise<{ bytes: number } | undefined>;
  /**
   * Optional streaming counterpart of `list`. Implementations that page
   * results from the backing store (R2, S3) should provide it and yield keys
   * lazily. `list` may stop at an adapter-defined cap (typically 10k keys);
   * callers walking the full tenant space must prefer `streamList` when it
   * exists, or chunk by narrower prefixes.
   */
  streamList?: (prefix: string) => AsyncIterable<string>;
}
|
|
103
|
+
/** Identifies the tenant (user, optional site) and table a watermark belongs to. */
interface WatermarkScope {
  userId: string;
  siteId?: string;
  table: TableName;
}
|
|
108
|
+
/** Sync progress record for one {@link WatermarkScope}. */
interface Watermark extends WatermarkScope {
  newestDateSynced: string;
  oldestDateSynced: string;
  lastSyncAt: number;
}
|
|
113
|
+
/** Selection criteria for watermark lookups; only `userId` is mandatory. */
interface WatermarkFilter {
  userId: string;
  siteId?: string;
  table?: TableName;
}
|
|
118
|
+
/** Status values a per-date sync state record can take. */
type SyncStateKind = 'pending' | 'inflight' | 'done' | 'failed';
|
|
119
|
+
/** Identifies the tenant, table and date (plus optional search type) of a sync state. */
interface SyncStateScope {
  userId: string;
  siteId?: string;
  table: TableName;
  date: string;
  /**
   * GSC search-type covered by this sync state. Omitted means `web` — the
   * legacy default, matching pre-#5 sync states stored before per-type sync
   * existed. Lookups must compare through {@link inferSearchType} so that a
   * missing field matches an explicit `'web'` and vice versa.
   */
  searchType?: SearchType;
}
|
|
132
|
+
/** A stored sync state: its scope plus status, timestamp, attempt count and optional error text. */
interface SyncState extends SyncStateScope {
  state: SyncStateKind;
  updatedAt: number;
  attempts: number;
  error?: string;
}
|
|
138
|
+
/** Selection criteria for sync-state lookups; only `userId` is mandatory. */
interface SyncStateFilter {
  userId: string;
  siteId?: string;
  table?: TableName;
  state?: SyncStateKind;
  searchType?: SearchType;
}
|
|
145
|
+
/** Optional extras accepted by `ManifestStore.setSyncState`. */
interface SyncStateDetail {
  at?: number;
  error?: string;
}
|
|
149
|
+
/** Lock granularity used by `ManifestStore.withLock`: tenant × table × partition. */
interface LockScope {
  userId: string;
  siteId?: string;
  table: TableName;
  partition: string;
}
|
|
155
|
+
/** Selects the tenant (user, optionally narrowed to one site) to purge. */
interface PurgeFilter {
  userId: string;
  siteId?: string;
}
|
|
159
|
+
/** Counts of the records removed by `ManifestStore.purgeTenant`. */
interface ManifestPurgeResult {
  entriesRemoved: number;
  watermarksRemoved: number;
  syncStatesRemoved: number;
}
|
|
164
|
+
/** Persistence contract for manifest entries, watermarks and sync states. */
interface ManifestStore {
  listLive: (filter: ListLiveFilter) => Promise<ManifestEntry[]>;
  listAll: (filter: ListLiveFilter) => Promise<ManifestEntry[]>;
  registerVersion: (entry: ManifestEntry, superseding?: ManifestEntry[]) => Promise<void>;
  registerVersions: (entries: ManifestEntry[], superseding?: ManifestEntry[]) => Promise<void>;
  listRetired: (olderThan: number) => Promise<ManifestEntry[]>;
  delete: (entries: ManifestEntry[]) => Promise<void>;
  getWatermarks: (filter: WatermarkFilter) => Promise<Watermark[]>;
  bumpWatermark: (scope: WatermarkScope, date: string, at?: number) => Promise<void>;
  getSyncStates: (filter: SyncStateFilter) => Promise<SyncState[]>;
  setSyncState: (scope: SyncStateScope, state: SyncStateKind, detail?: SyncStateDetail) => Promise<void>;
  /**
   * Serializes concurrent writers working on the same scope. The lock is
   * held across the write+register window so that GC (orphan sweep) will not
   * delete bytes that are midway between `dataSource.write` and
   * `manifestStore.registerVersion`.
   * Scope = tenant × table × partition.
   */
  withLock: <T>(scope: LockScope, fn: () => Promise<T>) => Promise<T>;
  /**
   * GDPR-grade tenant purge. Removes every manifest entry, watermark and
   * sync-state record that matches the filter. It does NOT touch the
   * underlying data-source bytes; callers (typically
   * {@link StorageEngine.purgeTenant}) must sweep the tenant prefix
   * separately before invoking this, so that a mid-flight failure cannot
   * leave orphan parquet with no manifest record.
   *
   * On stores with CAS-backed sharding (R2 manifest) this may issue one
   * mutation per shard. On read-only stores (HTTP) this throws.
   */
  purgeTenant: (filter: PurgeFilter) => Promise<ManifestPurgeResult>;
}
|
|
194
|
+
/** Configuration for {@link createHttpDataSource}. */
interface HttpDataSourceOptions {
  /**
   * Base URL that every object key is appended to. MUST NOT end with a
   * trailing slash.
   * Example: `https://pub-abcdef.r2.dev/gscdump-data`.
   */
  baseUrl: string;
  /**
   * Optional hook producing the final URL for a key. Supply it when keys
   * need signing (pre-signed URLs, per-request tokens, etc.). When omitted
   * the URL is `${baseUrl}/${encodeKey(key)}`, with the forward slashes of
   * the key preserved.
   */
  signUrl?: (key: string) => string;
  /**
   * Whether `uri(key)` should hand back the HTTPS URL so DuckDB's httpfs can
   * fetch directly. Defaults to true (browser, Workers, anywhere httpfs is
   * loaded). Set it to false where httpfs is not available — the executor
   * then falls back to `read(key)` and buffers the bytes itself.
   */
  useDuckDBHttpfs?: boolean;
}
|
|
215
|
+
/** Creates a {@link DataSource} backed by HTTP requests, configured by {@link HttpDataSourceOptions}. */
declare function createHttpDataSource(opts: HttpDataSourceOptions): DataSource;
|
|
216
|
+
/** Configuration for {@link createHttpManifestStore}. */
interface HttpManifestStoreOptions {
  /**
   * URL of a JSON manifest snapshot. The response body MUST have the shape
   *   { version: 1, entries: ManifestEntry[], watermarks?: Watermark[] }
   * (the same on-disk layout the filesystem adapter produces).
   */
  manifestUrl: string;
  /** Replacement for `fetch`, for tests or custom origins. */
  fetchImpl?: typeof fetch;
}
|
|
226
|
+
/** Creates a {@link ManifestStore} that serves the JSON manifest snapshot named by {@link HttpManifestStoreOptions.manifestUrl}. */
declare function createHttpManifestStore(opts: HttpManifestStoreOptions): ManifestStore;
|
|
227
|
+
export { HttpDataSourceOptions, HttpManifestStoreOptions, createHttpDataSource, createHttpManifestStore };
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
/**
 * Resolve the compaction tier of a manifest entry.
 *
 * An explicit `entry.tier` always wins. Otherwise the tier is derived from
 * the partition prefix: `daily/` → `"raw"`, `monthly/` → `"d30"`. Any other
 * prefix yields `undefined`.
 */
function inferLegacyTier(entry) {
  const { tier } = entry;
  if (tier !== undefined) {
    return tier;
  }
  const { partition } = entry;
  if (partition.startsWith("daily/")) {
    return "raw";
  }
  return partition.startsWith("monthly/") ? "d30" : undefined;
}
|
|
6
|
+
/**
 * Always throws: signals that operation `name` is unavailable because the
 * HTTP adapter cannot mutate anything.
 */
function readOnly(name) {
  const message = `http adapter is read-only: ${name} is not supported`;
  throw new Error(message);
}
|
|
9
|
+
/**
 * Percent-encode an object key for use in a URL path. Each `/`-separated
 * segment is encoded on its own, so the slashes themselves survive.
 */
function encodeKey(key) {
  const encoded = [];
  for (const segment of key.split("/")) {
    encoded.push(encodeURIComponent(segment));
  }
  return encoded.join("/");
}
|
|
12
|
+
/** Matches one `/` at the very end of a string. */
const TRAILING_SLASH = new RegExp("/$");
|
|
13
|
+
/**
 * Build a read-only data source that fetches objects over HTTP.
 *
 * URLs come from `opts.signUrl(key)` when supplied; otherwise they are
 * `${baseUrl}/${encodeKey(key)}` with one trailing slash stripped from
 * `baseUrl`. `write`, `delete` and `list` reject with the read-only error.
 *
 * @param opts `baseUrl`, optional `signUrl`, optional `useDuckDBHttpfs`
 *   (defaults to true; when false `uri` returns `undefined`).
 * @returns an object exposing `read`, `write`, `delete`, `list`, `head`, `uri`.
 */
function createHttpDataSource(opts) {
  const base = opts.baseUrl.replace(TRAILING_SLASH, "");
  const sign = opts.signUrl ?? ((key) => `${base}/${encodeKey(key)}`);
  const useHttpfs = opts.useDuckDBHttpfs ?? true;

  /**
   * Fetch the bytes of `key`, optionally limited to `range`.
   * Throws when the response status is not ok.
   */
  async function readBytes(key, range, signal) {
    // An empty window has no valid `Range` encoding (`bytes=o-(o-1)` is
    // malformed), and its result is known without a request.
    if (range && range.length <= 0) return new Uint8Array(0);
    const url = sign(key);
    const headers = range
      ? { Range: `bytes=${range.offset}-${range.offset + range.length - 1}` }
      : {};
    const res = await fetch(url, { headers, signal });
    if (!res.ok) throw new Error(`http read failed ${res.status} ${res.statusText} for ${url}`);
    const body = new Uint8Array(await res.arrayBuffer());
    // A server may ignore `Range` and answer 200 with the whole object.
    // Cut the requested window locally so callers always get the slice
    // they asked for.
    if (range && res.status === 200) {
      return body.slice(range.offset, range.offset + range.length);
    }
    return body;
  }

  return {
    read: readBytes,
    async write() {
      readOnly("write");
    },
    async delete() {
      readOnly("delete");
    },
    async list() {
      readOnly("list");
    },
    /**
     * Size probe via a HEAD request. Resolves to `undefined` when the
     * response is not ok or carries no usable `content-length`.
     */
    async head(key) {
      const res = await fetch(sign(key), { method: "HEAD" });
      if (!res.ok) return undefined;
      const len = res.headers.get("content-length");
      if (len == null) return undefined;
      const bytes = Number(len);
      // A malformed header must not surface as `{ bytes: NaN }`.
      return Number.isFinite(bytes) ? { bytes } : undefined;
    },
    /** URL for `key` when httpfs use is enabled, otherwise `undefined`. */
    uri(key) {
      return useHttpfs ? sign(key) : undefined;
    }
  };
}
|
|
50
|
+
/**
 * Decide whether a manifest entry satisfies a list filter.
 *
 * `userId` must always match. `siteId`, `table`, `partitions` and `tier`
 * only constrain the result when they are set on the filter; the tier is
 * compared against `inferLegacyTier(entry)`. Checks short-circuit in that
 * order.
 */
function matchesFilter(entry, filter) {
  return entry.userId === filter.userId
    && (filter.siteId === undefined || entry.siteId === filter.siteId)
    && (filter.table === undefined || entry.table === filter.table)
    && (!filter.partitions || filter.partitions.includes(entry.partition))
    && (filter.tier === undefined || inferLegacyTier(entry) === filter.tier);
}
|
|
58
|
+
/**
 * Decide whether watermark `w` satisfies `filter`: `userId` must match, and
 * `siteId` / `table` must match whenever the filter specifies them.
 */
function matchesWatermark(w, filter) {
  return w.userId === filter.userId
    && (filter.siteId === undefined || w.siteId === filter.siteId)
    && (filter.table === undefined || w.table === filter.table);
}
|
|
64
|
+
/**
 * Build a read-only manifest store backed by a JSON snapshot fetched from
 * `opts.manifestUrl` (through `opts.fetchImpl` when given, else `fetch`).
 *
 * The snapshot is fetched on first use and then reused for the lifetime of
 * the store. A failed load is NOT remembered: the next call retries, so a
 * transient network or parse error cannot permanently break the store.
 *
 * Mutating operations reject with the read-only error. `getSyncStates` and
 * `listRetired` always resolve to `[]`, and `withLock` just runs the callback.
 */
function createHttpManifestStore(opts) {
  const fetchImpl = opts.fetchImpl ?? fetch;
  let cache = null;

  /** Download and validate the manifest snapshot. */
  async function fetchManifest() {
    const res = await fetchImpl(opts.manifestUrl);
    if (!res.ok) throw new Error(`manifest fetch failed ${res.status} ${res.statusText} for ${opts.manifestUrl}`);
    const parsed = await res.json();
    // `parsed` may be null or a primitive; report that as an unsupported
    // version rather than failing with a TypeError.
    if (parsed?.version !== 1) throw new Error(`unsupported manifest version ${parsed?.version}`);
    if (!Array.isArray(parsed.entries)) {
      throw new Error(`malformed manifest at ${opts.manifestUrl}: entries must be an array`);
    }
    return parsed;
  }

  /** Return the memoised snapshot promise, starting a load when none is cached. */
  function load() {
    if (!cache) {
      const pending = fetchManifest();
      cache = pending;
      // Drop the memoised promise on failure so later calls retry. Callers
      // awaiting `pending` still observe the rejection.
      pending.catch(() => {
        if (cache === pending) cache = null;
      });
    }
    return cache;
  }

  return {
    /** Entries matching `filter` that have no `retiredAt`. */
    async listLive(filter) {
      const { entries } = await load();
      return entries.filter((e) => e.retiredAt === undefined && matchesFilter(e, filter));
    },
    /** Entries matching `filter`, retired ones included. */
    async listAll(filter) {
      const { entries } = await load();
      return entries.filter((e) => matchesFilter(e, filter));
    },
    /** Watermarks matching `filter`; a snapshot without `watermarks` yields `[]`. */
    async getWatermarks(filter) {
      const { watermarks = [] } = await load();
      return watermarks.filter((w) => matchesWatermark(w, filter));
    },
    async getSyncStates(_filter) {
      return [];
    },
    async listRetired() {
      return [];
    },
    async registerVersion() {
      readOnly("registerVersion");
    },
    async registerVersions() {
      readOnly("registerVersions");
    },
    async delete() {
      readOnly("delete");
    },
    async bumpWatermark() {
      readOnly("bumpWatermark");
    },
    async setSyncState() {
      readOnly("setSyncState");
    },
    async withLock(_, fn) {
      return fn();
    },
    async purgeTenant() {
      readOnly("purgeTenant");
    }
  };
}
|
|
119
|
+
export { createHttpDataSource, createHttpManifestStore };
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
import { ColumnDef, Row, Row as Row$1, TableName, TableName as TableName$1 } from "gscdump/contracts";
|
|
2
|
+
/** Byte-level storage backend addressed by string keys. */
interface DataSource {
  /** Read an object, optionally limited to the byte window `range`. */
  read: (key: string, range?: { offset: number; length: number }, signal?: AbortSignal) => Promise<Uint8Array>;
  write: (key: string, bytes: Uint8Array) => Promise<void>;
  delete: (keys: string[]) => Promise<void>;
  /**
   * One-shot listing of the keys under a prefix. Implementations may cap
   * how many keys they return (typically 10k), so callers walking the full
   * tenant space should prefer `streamList` when it exists, or narrow the
   * prefix.
   */
  list: (prefix: string) => Promise<string[]>;
  /**
   * Per-key URI probe. Yields a URI string that DuckDB's `httpfs` (or an
   * equivalent engine doing its own I/O) can read directly, or `undefined`
   * when the key is not URI-resolvable on this backend and the caller must
   * fall back to `read(key)` for the bytes.
   *
   * Contracts:
   * - When defined, the returned URI MUST yield content byte-identical to
   *   `read(key)`. Callers rely on this for correctness.
   * - Backends with a native URI for every key (filesystem: absolute path,
   *   R2 via `httpfs`: signed URL) may always return a string.
   * - Backends with no native URI shape (in-memory) omit the method or
   *   return `undefined` per call.
   * - Mixed-per-query is allowed: within one query some keys may resolve to
   *   a URI while others do not; the executor branches per key.
   */
  uri?: (key: string) => string | undefined;
  /**
   * Optional — report the byte size of a key without reading it. The engine
   * uses it to fill in `WriteResult.bytes` when a codec reports 0 or unknown
   * although the file is non-trivial.
   */
  head?: (key: string) => Promise<{ bytes: number } | undefined>;
  /**
   * Optional streaming counterpart of `list`. Implementations that page
   * results from the backing store (R2, S3) should provide it and yield keys
   * lazily. `list` may stop at an adapter-defined cap (typically 10k keys);
   * callers walking the full tenant space must prefer `streamList` when it
   * exists, or chunk by narrower prefixes.
   */
  streamList?: (prefix: string) => AsyncIterable<string>;
}
|
|
49
|
+
/** Outcome of a codec write: size of the written object and how many rows it holds. */
interface WriteResult {
  bytes: number;
  rowCount: number;
}
|
|
53
|
+
/** Per-call codec context; names the table whose rows are being handled. */
interface CodecCtx {
  table: TableName;
}
|
|
56
|
+
/**
 * Key-oriented codec. Every method performs its own I/O through `dataSource`:
 * - Node / browser codecs move bytes via `dataSource.read` / `.write`.
 * - Workers codecs let DuckDB's httpfs read/write remote URIs directly (via
 *   `dataSource.uri`) and never materialise bytes in JS.
 *
 * The engine itself never touches bytes; it only passes rows and keys to the
 * codec.
 *
 * Invariants that every implementation MUST uphold:
 * - `writeRows` called with an empty `rows` array MUST still write a file
 *   that carries the canonical column set for `ctx.table` — a schema-correct
 *   empty file. No placeholder-column shortcuts; readers depend on the schema
 *   being present for `union_by_name` merges.
 * - `WriteResult.bytes` MUST be the real byte size written to the data
 *   source (not 0, not an estimate) so the engine can enforce the payload
 *   ceiling without a second `head` round-trip.
 * - `WriteResult.rowCount` MUST equal `rows.length` (or, for `compactRows`,
 *   the sum of the input row counts).
 */
interface ParquetCodec {
  writeRows: (ctx: CodecCtx, rows: Row[], key: string, dataSource: DataSource) => Promise<WriteResult>;
  readRows: (ctx: CodecCtx, key: string, dataSource: DataSource) => Promise<Row[]>;
  compactRows: (ctx: CodecCtx, inputKeys: string[], outputKey: string, dataSource: DataSource) => Promise<WriteResult>;
}
|
|
80
|
+
/** Encodes `rows` as parquet bytes using the canonical schema of `table`. */
declare function encodeRowsToParquet(table: TableName$1, rows: readonly Row$1[]): Uint8Array;
|
|
81
|
+
/** Options accepted by {@link encodeRowsToParquetFlex}. */
interface EncodeFlexOptions {
  /** Columns that make up the output schema, in output order. */
  columns: readonly ColumnDef[];
  /** Names of the sort-key columns (a subset of `columns`). Empty = keep input order. */
  sortKey?: readonly string[];
  /** Row-group size; smaller groups = more prunable DuckDB stats. Default 25000. */
  rowGroupSize?: number;
}
|
|
89
|
+
/**
 * Schema-free encoder for rollups and auxiliary tables whose column set is
 * not part of `SCHEMAS`. The caller provides the column definitions; each
 * type must be one of `VARCHAR | DATE | BIGINT | INTEGER | DOUBLE`. These use
 * the same physical mappings as the canonical encoder, so DuckDB
 * `read_parquet(union_by_name = true)` merges cleanly with fact-table reads.
 */
declare function encodeRowsToParquetFlex(
  rows: readonly Row$1[],
  opts: EncodeFlexOptions,
): Uint8Array;
|
|
97
|
+
/** Decodes parquet bytes into row objects; zero-length input resolves to an empty array. */
declare function decodeParquetToRows(bytes: Uint8Array): Promise<Row$1[]>;
|
|
98
|
+
/** Options accepted by {@link createHyparquetCodec}. */
interface HyparquetCodecOptions {
  /**
   * Replacement for `readRows`. Handy when reads should go to a faster
   * engine (e.g. DuckDB-WASM via httpfs) while writes and compaction remain
   * on hyparquet to avoid WASM linear-memory growth. Defaults to hyparquet.
   */
  readRows?: (ctx: CodecCtx, key: string, dataSource: DataSource) => Promise<Row$1[]>;
}
|
|
106
|
+
/** Creates a {@link ParquetCodec} implemented with hyparquet; `options.readRows` overrides the read path. */
declare function createHyparquetCodec(options?: HyparquetCodecOptions): ParquetCodec;
|
|
107
|
+
export { EncodeFlexOptions, HyparquetCodecOptions, createHyparquetCodec, decodeParquetToRows, encodeRowsToParquet, encodeRowsToParquetFlex };
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
import { parquetReadObjects } from "hyparquet";
|
|
2
|
+
import { parquetWriteBuffer } from "hyparquet-writer";
|
|
3
|
+
import { date, doublePrecision, getTableConfig, integer, pgTable, varchar } from "drizzle-orm/pg-core";
|
|
4
|
+
/**
 * Column builders for the metric columns shared by every table: `clicks`
 * and `impressions` (integer) and `sum_position` (double precision), all
 * NOT NULL. Each call returns fresh builders.
 */
function metricCols() {
  const clicks = integer("clicks").notNull();
  const impressions = integer("impressions").notNull();
  const sum_position = doublePrecision("sum_position").notNull();
  return { clicks, impressions, sum_position };
}
|
|
11
|
+
/** Builds a fresh NOT NULL `date` column. */
const dateCol = function dateCol() {
  return date("date").notNull();
};
|
|
12
|
+
/**
 * Drizzle (pg-core) table definitions, one per fact table. Each table has
 * its dimension column(s), then a NOT NULL `date` column, then the shared
 * metric columns. `query_canonical` is the only nullable column.
 *
 * NOTE(review): the declaration order of the columns presumably fixes the
 * column order of the derived schema — confirm against drizzle's
 * `getTableConfig`.
 */
const drizzleSchema = {
  pages: pgTable("pages", {
    url: varchar("url").notNull(),
    date: date("date").notNull(),
    ...metricCols(),
  }),
  keywords: pgTable("keywords", {
    query: varchar("query").notNull(),
    query_canonical: varchar("query_canonical"),
    date: date("date").notNull(),
    ...metricCols(),
  }),
  countries: pgTable("countries", {
    country: varchar("country").notNull(),
    date: date("date").notNull(),
    ...metricCols(),
  }),
  devices: pgTable("devices", {
    device: varchar("device").notNull(),
    date: date("date").notNull(),
    ...metricCols(),
  }),
  page_keywords: pgTable("page_keywords", {
    url: varchar("url").notNull(),
    query: varchar("query").notNull(),
    query_canonical: varchar("query_canonical"),
    date: date("date").notNull(),
    ...metricCols(),
  }),
  search_appearance: pgTable("search_appearance", {
    searchAppearance: varchar("searchAppearance").notNull(),
    date: date("date").notNull(),
    ...metricCols(),
  }),
};
|
|
47
|
+
/**
 * Per-table metadata that the drizzle definitions do not carry: the
 * columns rows are sorted by before encoding (`sortKey`) and the table's
 * schema `version`.
 */
const TABLE_METADATA = {
  pages: { sortKey: ["date", "url"], version: 1 },
  keywords: { sortKey: ["date", "query"], version: 2 },
  countries: { sortKey: ["date", "country"], version: 1 },
  devices: { sortKey: ["date", "device"], version: 1 },
  page_keywords: { sortKey: ["date", "url", "query"], version: 2 },
  search_appearance: { sortKey: ["date", "searchAppearance"], version: 1 },
};
|
|
77
|
+
/**
 * Map a Postgres SQL type name (case-insensitive) to one of the engine's
 * column types: `VARCHAR`, `DATE`, `DOUBLE`, `BIGINT` or `INTEGER`.
 *
 * @throws Error when the type has no mapping.
 */
function pgSqlTypeToColumnType(sqlType) {
  const lowered = sqlType.toLowerCase();
  const hasPrefix = (...prefixes) => prefixes.some((prefix) => lowered.startsWith(prefix));
  const isExactly = (...names) => names.includes(lowered);

  if (hasPrefix("varchar", "char") || isExactly("text")) {
    return "VARCHAR";
  }
  // Timestamps are mapped to DATE as well.
  if (isExactly("date") || hasPrefix("timestamp")) {
    return "DATE";
  }
  if (isExactly("real") || hasPrefix("double", "numeric", "decimal")) {
    return "DOUBLE";
  }
  if (isExactly("bigint", "int8")) {
    return "BIGINT";
  }
  if (isExactly("integer", "int", "int4", "smallint", "int2")) {
    return "INTEGER";
  }
  throw new Error(`unmapped pg type '${sqlType}' — extend pgSqlTypeToColumnType in @gscdump/engine/schema`);
}
|
|
86
|
+
/**
 * Derive the engine schema of `tableName` from its drizzle definition and
 * `TABLE_METADATA`: the table name, one `{ name, type, nullable }` record
 * per column, plus the table's `sortKey` and `version`.
 */
function tableSchemaFrom(tableName) {
  const columns = [];
  for (const col of getTableConfig(drizzleSchema[tableName]).columns) {
    columns.push({
      name: col.name,
      type: pgSqlTypeToColumnType(col.getSQLType()),
      nullable: !col.notNull,
    });
  }
  const { sortKey, version } = TABLE_METADATA[tableName];
  return { name: tableName, columns, sortKey, version };
}
|
|
100
|
+
/** Engine schemas keyed by table name, built once at module load. */
const SCHEMAS = {};
for (const tableName of [
  "pages",
  "keywords",
  "countries",
  "devices",
  "page_keywords",
  "search_appearance",
]) {
  SCHEMAS[tableName] = tableSchemaFrom(tableName);
}
|
|
108
|
+
/** Default `rowGroupSize` passed to the parquet writer. */
const ROW_GROUP_SIZE = 25000;
|
|
109
|
+
/**
 * Translate an engine column type into the writer's basic type name.
 * `VARCHAR` and `DATE` both become `STRING`.
 *
 * @throws Error for any other column type.
 */
function basicTypeFor(colType) {
  switch (colType) {
    case "VARCHAR":
    case "DATE":
      return "STRING";
    case "BIGINT":
      return "INT64";
    case "INTEGER":
      return "INT32";
    case "DOUBLE":
      return "DOUBLE";
    default:
      throw new Error(`unsupported column type for parquet encoding: ${colType}`);
  }
}
|
|
116
|
+
/**
 * Coerce one cell value to the JS representation used for basic type `type`.
 *
 * - `null` / `undefined` → `null`
 * - `STRING` → the string itself, or `String(value)`
 * - `INT32`  → number truncated toward zero
 * - `INT64`  → bigint (bigints pass through; other values are truncated first)
 * - `DOUBLE` → number
 * - any other type → the value unchanged
 *
 * @throws Error when a numeric type receives a value that is not a finite number.
 */
function coerceValue(value, type) {
  if (value == null) return null;

  // Shared numeric conversion; only invoked by the numeric branches.
  const toFinite = () => {
    const num = typeof value === "number" ? value : Number(value);
    if (!Number.isFinite(num)) {
      throw new Error(`non-finite number for ${type}: ${String(value)}`);
    }
    return num;
  };

  switch (type) {
    case "STRING":
      return typeof value === "string" ? value : String(value);
    case "INT32":
      return Math.trunc(toFinite());
    case "INT64":
      return typeof value === "bigint" ? value : BigInt(Math.trunc(toFinite()));
    case "DOUBLE":
      return toFinite();
    default:
      return value;
  }
}
|
|
137
|
+
/**
 * Three-way comparator for sort-key cells.
 *
 * Ordering rules:
 * - `null` and `undefined` are equal to each other and sort before every
 *   other value.
 * - Numbers and bigints (in any mix) compare numerically. They used to fall
 *   through to string comparison, which placed `10n` before `9n`.
 * - Everything else compares by its `String(...)` form. Values whose string
 *   forms are equal compare as equal, so the result is antisymmetric.
 *
 * @returns a negative number, zero, or a positive number, as
 *   `Array.prototype.sort` expects.
 */
function compareValues(a, b) {
  if (a === b) return 0;
  const aMissing = a === null || a === undefined;
  const bMissing = b === null || b === undefined;
  if (aMissing || bMissing) {
    if (aMissing && bMissing) return 0;
    return aMissing ? -1 : 1;
  }
  const isNumeric = (v) => typeof v === "number" || typeof v === "bigint";
  if (isNumeric(a) && isNumeric(b)) {
    // Relational operators handle number/bigint mixes; NaN compares as equal.
    if (a < b) return -1;
    return a > b ? 1 : 0;
  }
  const left = String(a);
  const right = String(b);
  if (left < right) return -1;
  return left > right ? 1 : 0;
}
|
|
144
|
+
/**
 * Order `rows` by the sort key configured for `table`.
 *
 * The input array is never mutated. When there is nothing to sort (no sort
 * key, or at most one row) the same array is returned; otherwise a sorted
 * copy is returned.
 */
function sortRowsBySortKey(table, rows) {
  const keyColumns = TABLE_METADATA[table].sortKey;
  const needsSort = keyColumns.length !== 0 && rows.length > 1;
  if (!needsSort) {
    return rows;
  }
  // Lexicographic comparison over the key columns, left to right.
  const byKey = (left, right) => {
    for (const column of keyColumns) {
      const order = compareValues(left[column], right[column]);
      if (order !== 0) {
        return order;
      }
    }
    return 0;
  };
  return rows.slice().sort(byKey);
}
|
|
157
|
+
/**
 * Encode `rows` as parquet bytes using the canonical schema of `table`.
 *
 * Rows are sorted by the table's sort key first. Each schema column is then
 * emitted with its values coerced to the column's basic type, its
 * nullability from the schema, and `columnIndex` enabled. A row property
 * that is not a schema column is not written.
 *
 * @returns the encoded file as a `Uint8Array`.
 */
function encodeRowsToParquet(table, rows) {
  const { columns } = SCHEMAS[table];
  const ordered = sortRowsBySortKey(table, rows);
  const columnData = [];
  for (const column of columns) {
    const type = basicTypeFor(column.type);
    const data = ordered.map((row) => coerceValue(row[column.name], type));
    columnData.push({
      name: column.name,
      data,
      type,
      nullable: column.nullable,
      columnIndex: true,
    });
  }
  const encoded = parquetWriteBuffer({ columnData, rowGroupSize: ROW_GROUP_SIZE });
  return new Uint8Array(encoded);
}
|
|
176
|
+
/**
 * Encode `rows` as parquet bytes using caller-supplied column definitions.
 *
 * @param rows rows to encode; the array is not mutated.
 * @param opts `columns` (output schema and order), optional `sortKey`
 *   (default: none, input order kept) and optional `rowGroupSize`
 *   (default `ROW_GROUP_SIZE`).
 * @returns the encoded file as a `Uint8Array`.
 */
function encodeRowsToParquetFlex(rows, opts) {
  const { columns, sortKey = [], rowGroupSize = ROW_GROUP_SIZE } = opts;

  // Lexicographic comparison over the sort-key columns, left to right.
  const bySortKey = (left, right) => {
    for (const column of sortKey) {
      const order = compareValues(left[column], right[column]);
      if (order !== 0) {
        return order;
      }
    }
    return 0;
  };

  let ordered = rows;
  if (sortKey.length !== 0 && rows.length > 1) {
    ordered = [...rows].sort(bySortKey);
  }

  const columnData = [];
  for (const column of columns) {
    const type = basicTypeFor(column.type);
    const data = ordered.map((row) => coerceValue(row[column.name], type));
    columnData.push({
      name: column.name,
      data,
      type,
      nullable: column.nullable,
      columnIndex: true,
    });
  }
  const encoded = parquetWriteBuffer({ columnData, rowGroupSize });
  return new Uint8Array(encoded);
}
|
|
201
|
+
/**
 * Wrap a byte view in the `{ byteLength, slice }` shape handed to the
 * parquet reader. The bytes covered by the view are copied into a private
 * ArrayBuffer up front, and every `slice` call copies from that buffer.
 */
function asyncBufferFromBytes(bytes) {
  const { buffer, byteOffset, byteLength } = bytes;
  const owned = buffer.slice(byteOffset, byteOffset + byteLength);
  return {
    byteLength: owned.byteLength,
    slice: (start, end) => owned.slice(start, end),
  };
}
|
|
210
|
+
/**
 * Decode parquet bytes into an array of row objects.
 * A zero-length input resolves to `[]` without invoking the reader.
 */
async function decodeParquetToRows(bytes) {
  const isEmpty = bytes.byteLength === 0;
  if (isEmpty) {
    return [];
  }
  const file = asyncBufferFromBytes(bytes);
  const rows = await parquetReadObjects({ file });
  return rows;
}
|
|
214
|
+
/**
 * Build a parquet codec on top of hyparquet / hyparquet-writer.
 *
 * - `writeRows` encodes with the canonical schema of `ctx.table` and writes
 *   the bytes to `key`.
 * - `readRows` is `options.readRows` when supplied; otherwise it reads the
 *   whole object and decodes it.
 * - `compactRows` reads and decodes every input key in order, concatenates
 *   the rows, and writes one re-encoded file to `outputKey`. An empty
 *   `inputKeys` list still writes a schema-only file with `rowCount: 0`.
 *
 * Both write paths report the byte length of the encoded buffer and the
 * number of rows handed to the encoder.
 */
function createHyparquetCodec(options = {}) {
  /** Encode `rows` for `table`, store them under `key`, and report size and row count. */
  async function writeEncoded(table, rows, key, dataSource) {
    const bytes = encodeRowsToParquet(table, rows);
    await dataSource.write(key, bytes);
    return {
      bytes: bytes.byteLength,
      rowCount: rows.length
    };
  }

  return {
    async writeRows(ctx, rows, key, dataSource) {
      return writeEncoded(ctx.table, rows, key, dataSource);
    },
    readRows: options.readRows ?? (async (_ctx, key, dataSource) => {
      return decodeParquetToRows(await dataSource.read(key));
    }),
    async compactRows(ctx, inputKeys, outputKey, dataSource) {
      const allRows = [];
      for (const key of inputKeys) {
        const rows = await decodeParquetToRows(await dataSource.read(key));
        // Append row by row. `allRows.push(...rows)` passes every row as a
        // call argument and throws RangeError once a file is large enough
        // to exceed the engine's argument limit.
        for (const row of rows) allRows.push(row);
      }
      return writeEncoded(ctx.table, allRows, outputKey, dataSource);
    }
  };
}
|
|
250
|
+
export { createHyparquetCodec, decodeParquetToRows, encodeRowsToParquet, encodeRowsToParquetFlex };
|