@gscdump/engine 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +53 -0
- package/dist/adapters/duckdb-node.d.mts +19 -0
- package/dist/adapters/duckdb-node.mjs +78 -0
- package/dist/adapters/filesystem.d.mts +206 -0
- package/dist/adapters/filesystem.mjs +320 -0
- package/dist/adapters/http.d.mts +227 -0
- package/dist/adapters/http.mjs +119 -0
- package/dist/adapters/hyparquet.d.mts +107 -0
- package/dist/adapters/hyparquet.mjs +250 -0
- package/dist/adapters/inspection-sqlite-browser.d.mts +9 -0
- package/dist/adapters/inspection-sqlite-browser.mjs +42 -0
- package/dist/adapters/inspection-sqlite-node.d.mts +9 -0
- package/dist/adapters/inspection-sqlite-node.mjs +32 -0
- package/dist/adapters/node-harness.d.mts +334 -0
- package/dist/adapters/node-harness.mjs +1907 -0
- package/dist/adapters/r2-manifest.d.mts +227 -0
- package/dist/adapters/r2-manifest.mjs +355 -0
- package/dist/adapters/r2.d.mts +93 -0
- package/dist/adapters/r2.mjs +65 -0
- package/dist/arrow-utils.d.mts +14 -0
- package/dist/arrow-utils.mjs +8 -0
- package/dist/contracts.d.mts +436 -0
- package/dist/contracts.mjs +1 -0
- package/dist/entities.d.mts +238 -0
- package/dist/entities.mjs +359 -0
- package/dist/index.d.mts +1849 -0
- package/dist/index.mjs +1976 -0
- package/dist/ingest.d.mts +96 -0
- package/dist/ingest.mjs +187 -0
- package/dist/planner.d.mts +16 -0
- package/dist/planner.mjs +321 -0
- package/dist/resolver/index.d.mts +207 -0
- package/dist/resolver/index.mjs +869 -0
- package/dist/rollups.d.mts +207 -0
- package/dist/rollups.mjs +553 -0
- package/dist/schema.d.mts +1258 -0
- package/dist/schema.mjs +139 -0
- package/dist/scope.d.mts +38 -0
- package/dist/scope.mjs +28 -0
- package/dist/snapshot.d.mts +14 -0
- package/dist/snapshot.mjs +1 -0
- package/dist/sql-bind.d.mts +19 -0
- package/dist/sql-bind.mjs +92 -0
- package/dist/sql-fragments.d.mts +21 -0
- package/dist/sql-fragments.mjs +13 -0
- package/package.json +168 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Harlan Wilton
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
## @gscdump/engine
|
|
2
|
+
|
|
3
|
+
Append-only Parquet/DuckDB storage engine for the gscdump pipeline. Owns the storage runtime, planner, schema, and adapters that were previously bundled into `gscdump`.
|
|
4
|
+
|
|
5
|
+
Edge consumers stay on [`gscdump`](../gscdump). Anything that needs to read/write Parquet, run the DuckDB executor, or attach a snapshot lives here.
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
npm install @gscdump/engine
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
Optional peers (install only what your runtime needs):
|
|
14
|
+
|
|
15
|
+
- `@duckdb/duckdb-wasm` — DuckDB-WASM executor (browser + node-blocking shim).
|
|
16
|
+
- `hyparquet`, `hyparquet-writer` — pure-JS Parquet codec for environments without a native build.
|
|
17
|
+
|
|
18
|
+
## Subpath exports
|
|
19
|
+
|
|
20
|
+
| Subpath | Purpose |
|
|
21
|
+
|---|---|
|
|
22
|
+
| `@gscdump/engine` | Barrel: `createStorageEngine`, codec/executor factories, all storage contracts. |
|
|
23
|
+
| `@gscdump/engine/contracts` | `StorageEngine`, `Row`, `TableName`, `WriteCtx`, `ManifestEntry`, ... |
|
|
24
|
+
| `@gscdump/engine/schema` | `SCHEMAS`, `allTables`, `inferTable`, column metadata. |
|
|
25
|
+
| `@gscdump/engine/planner` | `resolveToSQL`, `enumeratePartitions`, partition planning. |
|
|
26
|
+
| `@gscdump/engine/snapshot` | `SnapshotIndex` contract for hot/cold snapshot files. |
|
|
27
|
+
| `@gscdump/engine/ingest` | GSC row → storage row helpers (`createRowAccumulator`, `transformGscRow`). |
|
|
28
|
+
| `@gscdump/engine/sql` | SQL literal binding helpers (`bindLiterals`, `formatLiteral`). |
|
|
29
|
+
| `@gscdump/engine/node` | Node-only DuckDB handle. |
|
|
30
|
+
| `@gscdump/engine/filesystem` | Node-only `DataSource` + `ManifestStore` adapters. |
|
|
31
|
+
| `@gscdump/engine/http` | Read-only HTTP `DataSource` (signed URLs, Range requests). |
|
|
32
|
+
| `@gscdump/engine/hyparquet` | Pure-JS `ParquetCodec`. |
|
|
33
|
+
| `@gscdump/engine/r2` | Cloudflare R2 `DataSource` (structurally typed against `R2Bucket`). |
|
|
34
|
+
|
|
35
|
+
## Stability
|
|
36
|
+
|
|
37
|
+
| Surface | Stability |
|
|
38
|
+
|---|---|
|
|
39
|
+
| `createStorageEngine` and storage contracts (`StorageEngine`, `Row`, `WriteCtx`, ...) | Public |
|
|
40
|
+
| Adapters (`/node`, `/filesystem`, `/http`, `/hyparquet`, `/r2`) | Public |
|
|
41
|
+
| Planner (`resolveToSQL`, `enumeratePartitions`) | Public |
|
|
42
|
+
| Schema (`SCHEMAS`, `allTables`, ...) | Public |
|
|
43
|
+
| Internals reached through `@gscdump/engine/<file>` paths not listed above | Private — may break between minors |
|
|
44
|
+
|
|
45
|
+
## Related
|
|
46
|
+
|
|
47
|
+
- [`gscdump`](../gscdump) — REST client + query builder (edge-safe peer dep).
|
|
48
|
+
- [`@gscdump/analysis`](../analysis) — analyzers; consumes `StorageEngine` via `createEngine` factories.
|
|
49
|
+
- [`@gscdump/cli`](../cli) — CLI wrapping engine + analysis.
|
|
50
|
+
|
|
51
|
+
## License
|
|
52
|
+
|
|
53
|
+
[MIT](../../LICENSE)
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import { Row } from "gscdump/contracts";
|
|
2
|
+
interface DuckDBHandle {
|
|
3
|
+
query: (sql: string, params?: unknown[]) => Promise<Row[]>;
|
|
4
|
+
registerFileBuffer: (name: string, bytes: Uint8Array) => Promise<void>;
|
|
5
|
+
copyFileToBuffer: (name: string) => Promise<Uint8Array>;
|
|
6
|
+
dropFiles: (names: string[]) => Promise<void>;
|
|
7
|
+
/**
|
|
8
|
+
* Returns a unique path suitable for `COPY TO '…'` + `copyFileToBuffer`.
|
|
9
|
+
* In Node this is an absolute path under `os.tmpdir()` so DuckDB doesn't
|
|
10
|
+
* litter the CWD; in browsers/Workers it's a plain virtual-FS name.
|
|
11
|
+
*/
|
|
12
|
+
makeTempPath: (ext: string) => string;
|
|
13
|
+
}
|
|
14
|
+
interface NodeDuckDBOptions {
|
|
15
|
+
verbose?: boolean;
|
|
16
|
+
}
|
|
17
|
+
declare function createNodeDuckDBHandle(opts?: NodeDuckDBOptions): DuckDBHandle;
|
|
18
|
+
declare function resetNodeDuckDB(): void;
|
|
19
|
+
export { NodeDuckDBOptions, createNodeDuckDBHandle, resetNodeDuckDB };
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import { createRequire } from "node:module";
|
|
2
|
+
import { unlinkSync } from "node:fs";
|
|
3
|
+
import { tmpdir } from "node:os";
|
|
4
|
+
import { join } from "node:path";
|
|
5
|
+
import process from "node:process";
|
|
6
|
+
import { fileURLToPath } from "node:url";
|
|
7
|
+
import { ConsoleLogger, NODE_RUNTIME, VoidLogger, createDuckDB } from "@duckdb/duckdb-wasm/dist/duckdb-node-blocking.cjs";
|
|
8
|
+
/**
 * Normalise a query result into an array of row values.
 *
 * Accepts an array, or any object exposing a `toArray()` method; every other
 * input yields an empty array. When the first row exposes `toJSON()`, every
 * row is converted through it; otherwise the rows are returned unchanged.
 *
 * @param {unknown} result - Raw value returned by the connection.
 * @returns {unknown[]} The rows, or `[]` when there is nothing to read.
 */
function arrowToRows(result) {
  let rows;
  if (Array.isArray(result)) {
    rows = result;
  } else if (typeof result?.toArray === "function") {
    rows = result.toArray();
  } else {
    rows = [];
  }
  if (!rows || rows.length === 0) {
    return [];
  }
  // Only the first row is probed; the remaining rows are assumed to match it.
  const needsConversion = typeof rows[0]?.toJSON === "function";
  return needsConversion ? rows.map((row) => row.toJSON()) : rows;
}
|
|
15
|
+
// CommonJS-style `require` used to resolve the packaged .wasm files. It is
// anchored to `__filename` when that global exists, otherwise to this module's
// own URL, and to the current working directory as the last resort.
const require_ = createRequire((() => {
  if (typeof __filename !== "undefined") return __filename;
  if (typeof import.meta !== "undefined") return fileURLToPath(import.meta.url);
  return process.cwd();
})());
// Shared DuckDB initialisation promise; `null` until a handle is first created.
let singleton = null;
|
|
17
|
+
/**
 * Describe the two DuckDB-WASM bundles (`mvp` and `eh`) handed to `createDuckDB`.
 *
 * Each bundle points at the resolved on-disk path of its .wasm module and
 * carries no worker script (`mainWorker` is always `null`).
 *
 * @returns {{ mvp: { mainModule: string, mainWorker: null }, eh: { mainModule: string, mainWorker: null } }}
 */
function bundles() {
  const describeBundle = (specifier) => ({
    mainModule: require_.resolve(specifier),
    mainWorker: null,
  });
  const mvp = describeBundle("@duckdb/duckdb-wasm/dist/duckdb-mvp.wasm");
  const eh = describeBundle("@duckdb/duckdb-wasm/dist/duckdb-eh.wasm");
  return { mvp, eh };
}
|
|
29
|
+
/**
 * Create and instantiate a DuckDB database, then open one connection on it.
 *
 * @param {{ verbose?: boolean }} opts - `verbose` selects `ConsoleLogger`
 *   instead of the silent `VoidLogger`.
 * @returns {Promise<{ db: object, conn: object }>} The database and its connection.
 */
async function initialize(opts) {
  let logger;
  if (opts.verbose) {
    logger = new ConsoleLogger();
  } else {
    logger = new VoidLogger();
  }
  const db = await createDuckDB(bundles(), logger, NODE_RUNTIME);
  await db.instantiate();
  const conn = db.connect();
  return { db, conn };
}
|
|
38
|
+
/**
 * Create a handle backed by the process-wide DuckDB instance.
 *
 * The instance is shared: the first handle triggers initialisation with its
 * `opts`, and later handles reuse that instance (their `opts` are ignored
 * while an instance exists). Initialisation is retried on the next call when
 * a previous attempt failed or after `resetNodeDuckDB()` cleared the cache,
 * so existing handles keep working instead of failing on a stale `null`.
 *
 * @param {{ verbose?: boolean }} [opts] - Options forwarded to `initialize`.
 * @returns {object} Handle exposing `query`, `registerFileBuffer`,
 *   `copyFileToBuffer`, `dropFiles` and `makeTempPath`.
 */
function createNodeDuckDBHandle(opts = {}) {
  /** Return the shared initialisation promise, starting it when absent. */
  function ready() {
    if (!singleton) {
      const pending = initialize(opts);
      // Never cache a failed start-up: clear it so the next call can retry.
      // This handler also keeps an eager failure from being reported as an
      // unhandled rejection; callers awaiting `pending` still see the error.
      pending.catch(() => {
        if (singleton === pending) singleton = null;
      });
      singleton = pending;
    }
    return singleton;
  }
  // Start initialising right away, as callers of the original API expect.
  ready();
  return {
    /** Run `sql`, using a prepared statement when `params` are supplied. */
    async query(sql, params) {
      const { conn } = await ready();
      if (!params || params.length === 0) return arrowToRows(conn.query(sql));
      const stmt = conn.prepare(sql);
      try {
        return arrowToRows(stmt.query(...params));
      } finally {
        stmt.close();
      }
    },
    /** Register `bytes` with the database under the file name `name`. */
    async registerFileBuffer(name, bytes) {
      const { db } = await ready();
      db.registerFileBuffer(name, bytes);
    },
    /** Read back the contents of the file registered or written as `name`. */
    async copyFileToBuffer(name) {
      const { db } = await ready();
      return db.copyFileToBuffer(name);
    },
    /**
     * Best-effort cleanup: each name is dropped from the database and also
     * unlinked from disk. Both steps are attempted independently and their
     * errors are deliberately ignored.
     */
    async dropFiles(names) {
      const { db } = await ready();
      for (const name of names) {
        try {
          db.dropFile(name);
        } catch {}
        try {
          unlinkSync(name);
        } catch {}
      }
    },
    /** Build a random `gscdump-*.<ext>` path inside the OS temp directory. */
    makeTempPath(ext) {
      return join(tmpdir(), `gscdump-${Math.random().toString(36).slice(2, 10)}.${ext}`);
    }
  };
}
|
|
75
|
+
/**
 * Forget the cached DuckDB initialisation promise.
 *
 * Only the module-level reference is cleared; nothing is closed or
 * terminated here.
 *
 * @returns {void}
 */
function resetNodeDuckDB() {
  singleton = null;
}
|
|
78
|
+
export { createNodeDuckDBHandle, resetNodeDuckDB };
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
import { TableName } from "gscdump/contracts";
|
|
2
|
+
import { SearchType } from "gscdump/query";
|
|
3
|
+
/**
|
|
4
|
+
* Compaction tier of a manifest entry. Determines which compactor stage may
|
|
5
|
+
* pick it up as input:
|
|
6
|
+
* - `raw`: per-day file produced by `writeDay`. Eligible for raw→d7 merge at 7d.
|
|
7
|
+
* - `d7`: weekly compaction output. Eligible for d7→d30 merge at 30d.
|
|
8
|
+
* - `d30`: monthly compaction output (matches the legacy `monthly/` partition
|
|
9
|
+
* shape — pre-tier entries are read as `d30`). Eligible for d30→d90 at 90d.
|
|
10
|
+
* - `d90`: quarterly cold-tier output. Terminal; never recompacted.
|
|
11
|
+
*
|
|
12
|
+
* Without an explicit tier, entries written before this field landed default
|
|
13
|
+
* to `raw` for `daily/` partitions and `d30` for `monthly/` partitions, so
|
|
14
|
+
* the tiered compactor picks the right inputs without a backfill rewrite.
|
|
15
|
+
*/
|
|
16
|
+
type CompactionTier = 'raw' | 'd7' | 'd30' | 'd90';
|
|
17
|
+
interface ManifestEntry {
|
|
18
|
+
userId: string;
|
|
19
|
+
siteId?: string;
|
|
20
|
+
table: TableName;
|
|
21
|
+
partition: string;
|
|
22
|
+
objectKey: string;
|
|
23
|
+
rowCount: number;
|
|
24
|
+
bytes: number;
|
|
25
|
+
createdAt: number;
|
|
26
|
+
retiredAt?: number;
|
|
27
|
+
/** Table schema version at write time. Omitted on pre-#27 entries — treat as 1. */
|
|
28
|
+
schemaVersion?: number;
|
|
29
|
+
/**
|
|
30
|
+
* Compaction tier. Omitted on entries written before tiered compaction —
|
|
31
|
+
* treat as `raw` for `daily/` partitions and `d30` for `monthly/` partitions
|
|
32
|
+
* (see {@link inferLegacyTier}).
|
|
33
|
+
*/
|
|
34
|
+
tier?: CompactionTier;
|
|
35
|
+
/**
|
|
36
|
+
* GSC search-type this entry covers (web | discover | news | googleNews |
|
|
37
|
+
* image | video). Omitted on entries written before per-type partitioning
|
|
38
|
+
* landed — treat as `web` (see {@link inferSearchType}). Compaction merges
|
|
39
|
+
* only entries with the same searchType.
|
|
40
|
+
*/
|
|
41
|
+
searchType?: SearchType;
|
|
42
|
+
}
|
|
43
|
+
interface ListLiveFilter {
|
|
44
|
+
userId: string;
|
|
45
|
+
siteId?: string;
|
|
46
|
+
table?: TableName;
|
|
47
|
+
partitions?: string[];
|
|
48
|
+
/**
|
|
49
|
+
* Narrow to a single compaction tier. Tier-aware compaction stages set this
|
|
50
|
+
* so the store doesn't have to return (and the caller doesn't have to scan)
|
|
51
|
+
* the entire manifest just to compact the raw cohort. Legacy entries without
|
|
52
|
+
* an explicit `tier` field match on {@link inferLegacyTier}.
|
|
53
|
+
*/
|
|
54
|
+
tier?: CompactionTier;
|
|
55
|
+
}
|
|
56
|
+
interface DataSource {
|
|
57
|
+
read: (key: string, range?: {
|
|
58
|
+
offset: number;
|
|
59
|
+
length: number;
|
|
60
|
+
}, signal?: AbortSignal) => Promise<Uint8Array>;
|
|
61
|
+
write: (key: string, bytes: Uint8Array) => Promise<void>;
|
|
62
|
+
delete: (keys: string[]) => Promise<void>;
|
|
63
|
+
/**
|
|
64
|
+
* One-shot listing under a prefix. Implementations may cap the number of
|
|
65
|
+
* returned keys (typically 10k) — callers iterating full tenant space
|
|
66
|
+
* should prefer `streamList` when available or narrow the prefix.
|
|
67
|
+
*/
|
|
68
|
+
list: (prefix: string) => Promise<string[]>;
|
|
69
|
+
/**
|
|
70
|
+
* Per-key URI probe. Returns a URI string DuckDB's `httpfs` (or an
|
|
71
|
+
* equivalent engine that fetches its own I/O) can read directly, or
|
|
72
|
+
* `undefined` if the key isn't URI-resolvable on this backend and the
|
|
73
|
+
* caller must fall back to `read(key)` for the bytes.
|
|
74
|
+
*
|
|
75
|
+
* Contracts:
|
|
76
|
+
* - When defined, the returned URI MUST yield byte-identical content to
|
|
77
|
+
* `read(key)`. Callers rely on this for correctness.
|
|
78
|
+
* - Backends with a native URI for every key (filesystem: absolute path,
|
|
79
|
+
* R2 via `httpfs`: signed URL) may always return a string.
|
|
80
|
+
* - Backends without a native URI shape (in-memory) omit the method or
|
|
81
|
+
* return `undefined` per call.
|
|
82
|
+
* - Mixed-per-query is allowed: some keys in one query may return a URI,
|
|
83
|
+
* others may not; the executor branches per key.
|
|
84
|
+
*/
|
|
85
|
+
uri?: (key: string) => string | undefined;
|
|
86
|
+
/**
|
|
87
|
+
* Optional — probe the byte size of a key without reading it. Used by
|
|
88
|
+
* the engine to fill in `WriteResult.bytes` when a codec reports 0 or
|
|
89
|
+
* unknown but the file is non-trivial.
|
|
90
|
+
*/
|
|
91
|
+
head?: (key: string) => Promise<{
|
|
92
|
+
bytes: number;
|
|
93
|
+
} | undefined>;
|
|
94
|
+
/**
|
|
95
|
+
* Optional streaming variant of `list`. Implementations that page
|
|
96
|
+
* backing-store results (R2, S3) should implement this and yield keys
|
|
97
|
+
* lazily. `list` may return up to an adapter-defined cap (typically
|
|
98
|
+
* 10k keys); callers iterating full tenant space must prefer
|
|
99
|
+
* `streamList` when available, or chunk by narrower prefixes.
|
|
100
|
+
*/
|
|
101
|
+
streamList?: (prefix: string) => AsyncIterable<string>;
|
|
102
|
+
}
|
|
103
|
+
interface WatermarkScope {
|
|
104
|
+
userId: string;
|
|
105
|
+
siteId?: string;
|
|
106
|
+
table: TableName;
|
|
107
|
+
}
|
|
108
|
+
interface Watermark extends WatermarkScope {
|
|
109
|
+
newestDateSynced: string;
|
|
110
|
+
oldestDateSynced: string;
|
|
111
|
+
lastSyncAt: number;
|
|
112
|
+
}
|
|
113
|
+
interface WatermarkFilter {
|
|
114
|
+
userId: string;
|
|
115
|
+
siteId?: string;
|
|
116
|
+
table?: TableName;
|
|
117
|
+
}
|
|
118
|
+
type SyncStateKind = 'pending' | 'inflight' | 'done' | 'failed';
|
|
119
|
+
interface SyncStateScope {
|
|
120
|
+
userId: string;
|
|
121
|
+
siteId?: string;
|
|
122
|
+
table: TableName;
|
|
123
|
+
date: string;
|
|
124
|
+
/**
|
|
125
|
+
* GSC search-type this sync state covers. Omitted = `web` (the legacy
|
|
126
|
+
* default; matches pre-#5 sync states stored before per-type sync landed).
|
|
127
|
+
* Lookups must compare via {@link inferSearchType} so a missing field
|
|
128
|
+
* matches an explicit `'web'` and vice versa.
|
|
129
|
+
*/
|
|
130
|
+
searchType?: SearchType;
|
|
131
|
+
}
|
|
132
|
+
interface SyncState extends SyncStateScope {
|
|
133
|
+
state: SyncStateKind;
|
|
134
|
+
updatedAt: number;
|
|
135
|
+
attempts: number;
|
|
136
|
+
error?: string;
|
|
137
|
+
}
|
|
138
|
+
interface SyncStateFilter {
|
|
139
|
+
userId: string;
|
|
140
|
+
siteId?: string;
|
|
141
|
+
table?: TableName;
|
|
142
|
+
state?: SyncStateKind;
|
|
143
|
+
searchType?: SearchType;
|
|
144
|
+
}
|
|
145
|
+
interface SyncStateDetail {
|
|
146
|
+
at?: number;
|
|
147
|
+
error?: string;
|
|
148
|
+
}
|
|
149
|
+
interface LockScope {
|
|
150
|
+
userId: string;
|
|
151
|
+
siteId?: string;
|
|
152
|
+
table: TableName;
|
|
153
|
+
partition: string;
|
|
154
|
+
}
|
|
155
|
+
interface PurgeFilter {
|
|
156
|
+
userId: string;
|
|
157
|
+
siteId?: string;
|
|
158
|
+
}
|
|
159
|
+
interface ManifestPurgeResult {
|
|
160
|
+
entriesRemoved: number;
|
|
161
|
+
watermarksRemoved: number;
|
|
162
|
+
syncStatesRemoved: number;
|
|
163
|
+
}
|
|
164
|
+
interface ManifestStore {
|
|
165
|
+
listLive: (filter: ListLiveFilter) => Promise<ManifestEntry[]>;
|
|
166
|
+
listAll: (filter: ListLiveFilter) => Promise<ManifestEntry[]>;
|
|
167
|
+
registerVersion: (entry: ManifestEntry, superseding?: ManifestEntry[]) => Promise<void>;
|
|
168
|
+
registerVersions: (entries: ManifestEntry[], superseding?: ManifestEntry[]) => Promise<void>;
|
|
169
|
+
listRetired: (olderThan: number) => Promise<ManifestEntry[]>;
|
|
170
|
+
delete: (entries: ManifestEntry[]) => Promise<void>;
|
|
171
|
+
getWatermarks: (filter: WatermarkFilter) => Promise<Watermark[]>;
|
|
172
|
+
bumpWatermark: (scope: WatermarkScope, date: string, at?: number) => Promise<void>;
|
|
173
|
+
getSyncStates: (filter: SyncStateFilter) => Promise<SyncState[]>;
|
|
174
|
+
setSyncState: (scope: SyncStateScope, state: SyncStateKind, detail?: SyncStateDetail) => Promise<void>;
|
|
175
|
+
/**
|
|
176
|
+
* Serialize concurrent writers against the same scope. Held across the
|
|
177
|
+
* write+register window so GC (orphan sweep) won't delete bytes that are
|
|
178
|
+
* midway between `dataSource.write` and `manifestStore.registerVersion`.
|
|
179
|
+
* Scope = tenant × table × partition.
|
|
180
|
+
*/
|
|
181
|
+
withLock: <T>(scope: LockScope, fn: () => Promise<T>) => Promise<T>;
|
|
182
|
+
/**
|
|
183
|
+
* GDPR-grade tenant purge. Removes every manifest entry, watermark, and
|
|
184
|
+
* sync-state record matching the filter. Does NOT touch the underlying
|
|
185
|
+
* data-source bytes; callers (typically {@link StorageEngine.purgeTenant})
|
|
186
|
+
* must sweep the tenant prefix separately before invoking this so that
|
|
187
|
+
* mid-flight failures can't leave orphan parquet with no manifest record.
|
|
188
|
+
*
|
|
189
|
+
* On stores with CAS-backed sharding (R2 manifest) this may issue one
|
|
190
|
+
* mutation per shard. On read-only stores (HTTP) this throws.
|
|
191
|
+
*/
|
|
192
|
+
purgeTenant: (filter: PurgeFilter) => Promise<ManifestPurgeResult>;
|
|
193
|
+
}
|
|
194
|
+
interface FilesystemDataSourceOptions {
|
|
195
|
+
rootDir: string;
|
|
196
|
+
}
|
|
197
|
+
declare function createFilesystemDataSource(opts: FilesystemDataSourceOptions): DataSource;
|
|
198
|
+
interface FilesystemManifestStoreOptions {
|
|
199
|
+
path: string;
|
|
200
|
+
}
|
|
201
|
+
declare function createFilesystemManifestStore(opts: FilesystemManifestStoreOptions): ManifestStore;
|
|
202
|
+
declare function filesystemStats(rootDir: string): Promise<{
|
|
203
|
+
files: number;
|
|
204
|
+
bytes: number;
|
|
205
|
+
}>;
|
|
206
|
+
export { FilesystemDataSourceOptions, FilesystemManifestStoreOptions, createFilesystemDataSource, createFilesystemManifestStore, filesystemStats };
|
|
@@ -0,0 +1,320 @@
|
|
|
1
|
+
import { Buffer } from "node:buffer";
|
|
2
|
+
import { randomBytes } from "node:crypto";
|
|
3
|
+
import { mkdir, readFile, readdir, rename, rm, stat, unlink, writeFile } from "node:fs/promises";
|
|
4
|
+
import { dirname, join, resolve } from "node:path";
|
|
5
|
+
import { lock } from "proper-lockfile";
|
|
6
|
+
/**
 * Resolve the search type of a record, defaulting to `"web"`.
 *
 * @param {{ searchType?: string | null }} entry - Record that may lack `searchType`.
 * @returns {string} `entry.searchType`, or `"web"` when it is null or undefined.
 */
function inferSearchType(entry) {
  const { searchType } = entry;
  if (searchType === null || searchType === void 0) {
    return "web";
  }
  return searchType;
}
|
|
9
|
+
/**
 * Resolve the compaction tier of a manifest entry.
 *
 * An explicit `tier` wins. Otherwise the tier is derived from the partition
 * prefix: `daily/` maps to `"raw"` and `monthly/` maps to `"d30"`. Any other
 * prefix yields `undefined`.
 *
 * @param {{ tier?: string, partition: string }} entry - Manifest entry.
 * @returns {string | undefined} The explicit or inferred tier.
 */
function inferLegacyTier(entry) {
  const { tier, partition } = entry;
  if (tier !== void 0) {
    return tier;
  }
  const legacyTiers = [
    ["daily/", "raw"],
    ["monthly/", "d30"],
  ];
  for (const [prefix, legacyTier] of legacyTiers) {
    if (partition.startsWith(prefix)) return legacyTier;
  }
  return void 0;
}
|
|
14
|
+
/**
 * Create a `DataSource` that stores every key as a file under `opts.rootDir`.
 *
 * Every key and every listing prefix is resolved against the root and
 * rejected with `path escapes root: <key>` when the result lies outside it.
 *
 * @param {{ rootDir: string }} opts - `rootDir` is resolved to an absolute path.
 * @returns {object} Data source exposing `read`, `write`, `delete`, `list`,
 *   `streamList`, `head` and `uri`.
 */
function createFilesystemDataSource(opts) {
  const root = resolve(opts.rootDir);
  // `join` normalises to the platform separator and keeps the trailing one, so
  // this is the root followed by exactly one separator on every platform
  // (and just the separator itself when the root is the filesystem root).
  const rootPrefix = join(root, "/");
  /** Resolve `key` under the root; throws when it would escape the root. */
  function pathFor(key) {
    const resolved = resolve(root, key);
    if (resolved !== root && !resolved.startsWith(rootPrefix)) throw new Error(`path escapes root: ${key}`);
    return resolved;
  }
  /** Turn an absolute path under the root back into a root-relative key. */
  function keyFor(absolutePath) {
    return absolutePath.slice(rootPrefix.length);
  }
  return {
    /**
     * Read a key, optionally narrowed to `range`. The whole file is loaded
     * and the range is sliced from it in memory.
     */
    async read(key, range, signal) {
      const bytes = await readFile(pathFor(key), { signal });
      if (!range) return new Uint8Array(bytes.buffer, bytes.byteOffset, bytes.byteLength);
      const sliced = bytes.subarray(range.offset, range.offset + range.length);
      return new Uint8Array(sliced.buffer, sliced.byteOffset, sliced.byteLength);
    },
    /** Write `bytes` to the key, creating parent directories as needed. */
    async write(key, bytes) {
      const path = pathFor(key);
      await mkdir(dirname(path), { recursive: true });
      await writeFile(path, Buffer.from(bytes));
    },
    /** Remove the given keys; keys that do not exist are ignored (`force`). */
    async delete(keys) {
      await Promise.all(keys.map(async (k) => {
        await rm(pathFor(k), { force: true });
      }));
    },
    /** List every file below the directory named by `prefix`, as keys. */
    async list(prefix) {
      // Listing goes through the same containment check as key access.
      const full = pathFor(prefix);
      const out = [];
      await walk(full, out);
      return out.map(keyFor);
    },
    /** Lazy variant of `list`; the containment check runs on first iteration. */
    async *streamList(prefix) {
      const full = pathFor(prefix);
      for await (const p of walkStream(full)) yield keyFor(p);
    },
    /** Report the size of a key, or `undefined` when the file does not exist. */
    async head(key) {
      return stat(pathFor(key)).then((s) => ({ bytes: s.size }), (err) => {
        if (err.code === "ENOENT") return void 0;
        throw err;
      });
    },
    /** The absolute filesystem path of the key. */
    uri(key) {
      return pathFor(key);
    }
  };
}
|
|
59
|
+
/**
 * Lazily yield the path of every non-directory entry below `dir`, recursing
 * into sub-directories. A directory that does not exist yields nothing.
 *
 * @param {string} dir - Directory to walk.
 * @returns {AsyncGenerator<string>} Paths built with `join(dir, name)`.
 */
async function* walkStream(dir) {
  let children;
  try {
    children = await readdir(dir, { withFileTypes: true });
  } catch (err) {
    if (err.code !== "ENOENT") throw err;
    children = [];
  }
  for (const child of children) {
    const childPath = join(dir, String(child.name));
    if (!child.isDirectory()) {
      yield childPath;
      continue;
    }
    yield* walkStream(childPath);
  }
}
|
|
70
|
+
/**
 * Recursively collect the path of every non-directory entry below `dir`.
 * A directory that does not exist contributes nothing.
 *
 * @param {string} dir - Directory to walk.
 * @param {string[]} out - Array the discovered paths are appended to.
 * @returns {Promise<void>}
 */
async function walk(dir, out) {
  let children;
  try {
    children = await readdir(dir, { withFileTypes: true });
  } catch (err) {
    if (err.code !== "ENOENT") throw err;
    children = [];
  }
  for (const child of children) {
    const childPath = join(dir, String(child.name));
    if (child.isDirectory()) {
      await walk(childPath, out);
    } else {
      out.push(childPath);
    }
  }
}
|
|
81
|
+
/**
 * Build the map key identifying a watermark: `userId|siteId|table`, with a
 * missing `siteId` rendered as an empty segment.
 *
 * @param {{ userId: string, siteId?: string, table: string }} w - Watermark or its scope.
 * @returns {string} The composite key.
 */
function watermarkKey(w) {
  const { userId, siteId, table } = w;
  const siteSegment = siteId ?? "";
  return `${userId}|${siteSegment}|${table}`;
}
|
|
84
|
+
/**
 * Test a watermark against a filter. `userId` must always match; `siteId`
 * and `table` are only compared when the filter defines them.
 *
 * @param {{ userId: string, siteId?: string, table: string }} w - Watermark.
 * @param {{ userId: string, siteId?: string, table?: string }} filter - Filter.
 * @returns {boolean} Whether the watermark satisfies the filter.
 */
function matchesWatermarkFilter(w, filter) {
  const sameUser = w.userId === filter.userId;
  const siteAccepted = filter.siteId === void 0 || w.siteId === filter.siteId;
  const tableAccepted = filter.table === void 0 || w.table === filter.table;
  return sameUser && siteAccepted && tableAccepted;
}
|
|
90
|
+
/**
 * Build the map key identifying a sync state:
 * `userId|siteId|table|date|searchType`. A missing `siteId` becomes an empty
 * segment and the search type is resolved through `inferSearchType`.
 *
 * @param {{ userId: string, siteId?: string, table: string, date: string, searchType?: string }} s
 *   Sync state or its scope.
 * @returns {string} The composite key.
 */
function syncStateKey(s) {
  const { userId, siteId, table, date } = s;
  const siteSegment = siteId ?? "";
  const searchType = inferSearchType(s);
  return `${userId}|${siteSegment}|${table}|${date}|${searchType}`;
}
|
|
93
|
+
/**
 * Test a sync state against a filter. `userId` must always match; `siteId`,
 * `table`, `state` and `searchType` are only compared when the filter defines
 * them. The search type is compared via `inferSearchType`.
 *
 * @param {object} s - Sync state record.
 * @param {object} filter - Sync state filter.
 * @returns {boolean} Whether the record satisfies the filter.
 */
function matchesSyncStateFilter(s, filter) {
  const { userId, siteId, table, state, searchType } = filter;
  return s.userId === userId
    && (siteId === void 0 || s.siteId === siteId)
    && (table === void 0 || s.table === table)
    && (state === void 0 || s.state === state)
    && (searchType === void 0 || inferSearchType(s) === searchType);
}
|
|
101
|
+
/**
 * Produce the next sync-state record for a transition to `state`.
 *
 * - `updatedAt` is `detail.at` when given, otherwise the current time.
 * - `attempts` grows by one only when the new state is `"inflight"`.
 * - Without an existing record, a fresh one is built from `scope`;
 *   `searchType` is copied only when the scope defines it.
 * - With an existing record, `error` is cleared on `"done"`; otherwise it is
 *   `detail.error`, falling back to the previous error.
 *
 * @param {object | undefined} existing - Current record, if any.
 * @param {object} scope - Identity of the sync state.
 * @param {string} state - New state.
 * @param {{ at?: number, error?: string }} [detail] - Optional timestamp/error.
 * @returns {object} The merged record (inputs are not mutated).
 */
function mergeSyncState(existing, scope, state, detail) {
  const updatedAt = detail?.at ?? Date.now();
  const attemptsBump = state === "inflight" ? 1 : 0;
  if (existing) {
    let error;
    if (state === "done") {
      error = void 0;
    } else {
      error = detail?.error ?? existing.error;
    }
    return {
      ...existing,
      state,
      updatedAt,
      attempts: existing.attempts + attemptsBump,
      error,
    };
  }
  const created = {
    userId: scope.userId,
    siteId: scope.siteId,
    table: scope.table,
    date: scope.date,
    state,
    updatedAt,
    attempts: attemptsBump,
    error: detail?.error,
  };
  if (scope.searchType !== void 0) {
    created.searchType = scope.searchType;
  }
  return created;
}
|
|
123
|
+
/**
 * Test a manifest entry against a listing filter. `userId` must always
 * match; `siteId`, `table` and `tier` are only compared when the filter
 * defines them, and `partitions` only when it is truthy. The tier is compared
 * via `inferLegacyTier`.
 *
 * @param {object} entry - Manifest entry.
 * @param {object} filter - Listing filter.
 * @returns {boolean} Whether the entry satisfies the filter.
 */
function matchesFilter(entry, filter) {
  const { userId, siteId, table, partitions, tier } = filter;
  return entry.userId === userId
    && (siteId === void 0 || entry.siteId === siteId)
    && (table === void 0 || entry.table === table)
    && (!partitions || partitions.includes(entry.partition))
    && (tier === void 0 || inferLegacyTier(entry) === tier);
}
|
|
131
|
+
/**
 * Compute the lock-file path for a lock scope.
 *
 * The scope is flattened to `userId|siteId|table|partition`, every character
 * other than word characters, `.` and `-` is replaced by `_`, and `.lock` is
 * appended.
 *
 * @param {string} locksDir - Directory holding the lock files.
 * @param {{ userId: string, siteId?: string, table: string, partition: string }} scope
 * @returns {string} Path of the lock file inside `locksDir`.
 */
function lockFileFor(locksDir, scope) {
  const rawId = `${scope.userId}|${scope.siteId ?? ""}|${scope.table}|${scope.partition}`;
  const safeId = rawId.replace(/[^\w.-]/g, "_");
  return join(locksDir, `${safeId}.lock`);
}
|
|
134
|
+
/**
 * Create a `ManifestStore` persisted as a single JSON document at `opts.path`.
 *
 * Every mutation is a load → modify → save cycle. Mutations issued through
 * one store instance run one at a time via an in-memory queue; that queue
 * cannot order writers in other processes or other store instances. Reads
 * (`list*`, `get*`) bypass the queue and read the file directly.
 *
 * @param {{ path: string }} opts - Location of the manifest JSON file.
 * @returns {object} The manifest store.
 */
function createFilesystemManifestStore(opts) {
  const manifestPath = resolve(opts.path);
  // Lock files live in a `locks` directory next to the manifest file.
  const locksDir = join(dirname(manifestPath), "locks");

  /**
   * Read and parse the manifest. A missing file is treated as an empty
   * version-1 manifest; any other version is rejected.
   */
  async function load() {
    const content = await readFile(manifestPath, "utf8").catch((err) => {
      if (err.code === "ENOENT") return null;
      throw err;
    });
    if (content === null) return {
      version: 1,
      entries: []
    };
    const parsed = JSON.parse(content);
    if (parsed.version !== 1) throw new Error(`unsupported manifest version ${parsed.version}`);
    return parsed;
  }

  /**
   * Persist the manifest by writing a uniquely named temp file and renaming
   * it over the manifest. The temp file is removed when either step fails.
   */
  async function save(data) {
    await mkdir(dirname(manifestPath), { recursive: true });
    const tmp = `${manifestPath}.${randomBytes(6).toString("hex")}.tmp`;
    try {
      await writeFile(tmp, JSON.stringify(data), "utf8");
      await rename(tmp, manifestPath);
    } catch (err) {
      // Best-effort cleanup; the original failure is what the caller needs.
      await unlink(tmp).catch(() => {});
      throw err;
    }
  }

  const queue = [];
  let running = false;

  /**
   * Run `fn` after every previously queued task has finished and settle the
   * returned promise with its outcome, including a synchronous throw.
   */
  function enqueue(fn) {
    return new Promise((resolvePromise, rejectPromise) => {
      queue.push(async () => {
        try {
          resolvePromise(await fn());
        } catch (err) {
          rejectPromise(err);
        }
      });
      drain();
    });
  }

  /** Process queued tasks one at a time; a no-op while a drain is active. */
  async function drain() {
    if (running) return;
    running = true;
    try {
      // Queued tasks report failures to their own caller and never reject.
      while (queue.length > 0) await queue.shift()();
    } finally {
      running = false;
    }
  }

  /** Entries are identified by their object key. */
  function entryKey(e) {
    return e.objectKey;
  }

  /**
   * Retire the live entries named in `superseding`, then insert or replace
   * `newEntries`. Retired entries are stamped with the first new entry's
   * `createdAt`, or the current time when there is none.
   */
  async function registerVersionsImpl(newEntries, superseding) {
    const data = await load();
    const supersededAt = newEntries[0]?.createdAt ?? Date.now();
    const byKey = new Map(data.entries.map((e) => [entryKey(e), e]));
    if (superseding) for (const s of superseding) {
      const existing = byKey.get(entryKey(s));
      // Already-retired entries keep their original retirement time.
      if (existing && existing.retiredAt === void 0) byKey.set(entryKey(s), {
        ...existing,
        retiredAt: supersededAt
      });
    }
    for (const e of newEntries) byKey.set(entryKey(e), e);
    data.entries = Array.from(byKey.values());
    await save(data);
  }

  return {
    /** Entries matching `filter` that have not been retired. */
    async listLive(filter) {
      return (await load()).entries.filter((e) => e.retiredAt === void 0 && matchesFilter(e, filter));
    },
    /** Entries matching `filter`, retired or not. */
    async listAll(filter) {
      return (await load()).entries.filter((e) => matchesFilter(e, filter));
    },
    /** Register one entry, optionally retiring the entries it supersedes. */
    async registerVersion(entry, superseding) {
      return enqueue(() => registerVersionsImpl([entry], superseding));
    },
    /** Register several entries in one save, optionally retiring others. */
    async registerVersions(entries, superseding) {
      return enqueue(() => registerVersionsImpl(entries, superseding));
    },
    /** Entries retired at or before `olderThan`, across all tenants. */
    async listRetired(olderThan) {
      return (await load()).entries.filter((e) => e.retiredAt !== void 0 && e.retiredAt <= olderThan);
    },
    /** Remove the given entries (matched by object key) from the manifest. */
    async delete(toDelete) {
      return enqueue(async () => {
        const data = await load();
        const toDeleteKeys = new Set(toDelete.map(entryKey));
        data.entries = data.entries.filter((e) => !toDeleteKeys.has(entryKey(e)));
        await save(data);
      });
    },
    /** Watermarks matching `filter`. */
    async getWatermarks(filter) {
      return ((await load()).watermarks ?? []).filter((w) => matchesWatermarkFilter(w, filter));
    },
    /** Sync states matching `filter`. */
    async getSyncStates(filter) {
      return ((await load()).syncStates ?? []).filter((s) => matchesSyncStateFilter(s, filter));
    },
    /** Create or update the sync state identified by `scope`. */
    async setSyncState(scope, state, detail) {
      return enqueue(async () => {
        const data = await load();
        const key = syncStateKey(scope);
        const byKey = new Map((data.syncStates ?? []).map((s) => [syncStateKey(s), s]));
        byKey.set(key, mergeSyncState(byKey.get(key), scope, state, detail));
        data.syncStates = Array.from(byKey.values());
        await save(data);
      });
    },
    /**
     * Run `fn` while holding the file lock for `scope`. The lock is released
     * whether `fn` resolves, rejects, or throws synchronously; a failure to
     * release is ignored.
     */
    async withLock(scope, fn) {
      await mkdir(locksDir, { recursive: true });
      const path = lockFileFor(locksDir, scope);
      // Append mode creates the file when missing without truncating it.
      await writeFile(path, "", { flag: "a" });
      const release = await lock(path, {
        realpath: false,
        stale: 3e4,
        retries: {
          retries: 20,
          minTimeout: 50,
          maxTimeout: 500,
          factor: 1.5
        }
      });
      try {
        return await fn();
      } finally {
        await release().catch(() => {});
      }
    },
    /**
     * Remove every entry, watermark and sync state of the tenant named by
     * `filter` (narrowed to one site when `siteId` is given) and report how
     * many records of each kind were removed.
     */
    async purgeTenant(filter) {
      return enqueue(async () => {
        const data = await load();
        const matches = (r) => r.userId === filter.userId && (filter.siteId === void 0 || r.siteId === filter.siteId);
        const before = {
          entries: data.entries.length,
          watermarks: (data.watermarks ?? []).length,
          syncStates: (data.syncStates ?? []).length
        };
        data.entries = data.entries.filter((e) => !matches(e));
        data.watermarks = (data.watermarks ?? []).filter((w) => !matches(w));
        data.syncStates = (data.syncStates ?? []).filter((s) => !matches(s));
        await save(data);
        return {
          entriesRemoved: before.entries - data.entries.length,
          watermarksRemoved: before.watermarks - data.watermarks.length,
          syncStatesRemoved: before.syncStates - data.syncStates.length
        };
      });
    },
    /**
     * Widen the watermark for `scope` so it covers `date`. `lastSyncAt` only
     * moves forward; `at` defaults to the current time.
     */
    async bumpWatermark(scope, date, at) {
      return enqueue(async () => {
        const data = await load();
        const key = watermarkKey(scope);
        const byKey = new Map((data.watermarks ?? []).map((w) => [watermarkKey(w), w]));
        const existing = byKey.get(key);
        const nowMs = at ?? Date.now();
        // Dates are compared as strings.
        const next = existing ? {
          ...existing,
          newestDateSynced: date > existing.newestDateSynced ? date : existing.newestDateSynced,
          oldestDateSynced: date < existing.oldestDateSynced ? date : existing.oldestDateSynced,
          lastSyncAt: nowMs > existing.lastSyncAt ? nowMs : existing.lastSyncAt
        } : {
          userId: scope.userId,
          siteId: scope.siteId,
          table: scope.table,
          newestDateSynced: date,
          oldestDateSynced: date,
          lastSyncAt: nowMs
        };
        byKey.set(key, next);
        data.watermarks = Array.from(byKey.values());
        await save(data);
      });
    }
  };
}
|
|
296
|
+
/**
 * Count the files below `rootDir` and sum their sizes.
 *
 * Paths that can no longer be stat-ed with ENOENT (for example a dangling
 * symlink, or a file removed after the walk listed it) are skipped instead of
 * failing the whole call; any other error is rethrown.
 *
 * @param {string} rootDir - Directory to measure; resolved to an absolute path.
 * @returns {Promise<{ files: number, bytes: number }>} File count and total size.
 */
async function filesystemStats(rootDir) {
  const paths = [];
  await walkForStats(resolve(rootDir), paths);
  let files = 0;
  let bytes = 0;
  for (const p of paths) {
    let info;
    try {
      info = await stat(p);
    } catch (err) {
      if (err.code === "ENOENT") continue;
      throw err;
    }
    files += 1;
    bytes += info.size;
  }
  return {
    files,
    bytes
  };
}
|
|
309
|
+
/**
 * Recursively append the path of every non-directory entry below `dir` to
 * `out`. A directory that does not exist contributes nothing.
 *
 * @param {string} dir - Directory to walk.
 * @param {string[]} out - Array the discovered paths are appended to.
 * @returns {Promise<void>}
 */
async function walkForStats(dir, out) {
  let listing;
  try {
    listing = await readdir(dir, { withFileTypes: true });
  } catch (err) {
    if (err.code !== "ENOENT") throw err;
    listing = [];
  }
  for (const item of listing) {
    const itemPath = join(dir, String(item.name));
    if (item.isDirectory()) {
      await walkForStats(itemPath, out);
    } else {
      out.push(itemPath);
    }
  }
}
|
|
320
|
+
export { createFilesystemDataSource, createFilesystemManifestStore, filesystemStats };
|