@gscdump/engine 0.6.1 → 0.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/compiler.mjs +288 -0
- package/dist/_chunks/duckdb.d.mts +26 -0
- package/dist/_chunks/engine.mjs +578 -0
- package/dist/_chunks/pg-adapter.mjs +676 -0
- package/dist/_chunks/planner.d.mts +15 -0
- package/dist/_chunks/schema.d.mts +1258 -0
- package/dist/_chunks/schema.mjs +139 -0
- package/dist/_chunks/storage.d.mts +476 -0
- package/dist/_chunks/storage.mjs +39 -0
- package/dist/_chunks/types.d.mts +53 -0
- package/dist/adapters/duckdb-node.d.mts +1 -13
- package/dist/adapters/duckdb-node.mjs +1 -7
- package/dist/adapters/filesystem.d.mts +1 -193
- package/dist/adapters/filesystem.mjs +2 -9
- package/dist/adapters/http.d.mts +1 -193
- package/dist/adapters/http.mjs +1 -5
- package/dist/adapters/hyparquet.d.mts +6 -83
- package/dist/adapters/hyparquet.mjs +1 -105
- package/dist/adapters/inspection-sqlite-browser.d.mts +1 -7
- package/dist/adapters/inspection-sqlite-node.d.mts +1 -7
- package/dist/adapters/inspection-sqlite-node.mjs +1 -1
- package/dist/adapters/node-harness.d.mts +3 -306
- package/dist/adapters/node-harness.mjs +4 -1866
- package/dist/adapters/r2-manifest.d.mts +4 -149
- package/dist/adapters/r2-manifest.mjs +1 -8
- package/dist/adapters/r2.d.mts +1 -47
- package/dist/contracts.d.mts +1 -435
- package/dist/entities.d.mts +1 -47
- package/dist/index.d.mts +8 -1844
- package/dist/index.mjs +8 -1962
- package/dist/ingest.d.mts +1 -1
- package/dist/planner.d.mts +3 -16
- package/dist/planner.mjs +1 -320
- package/dist/resolver/index.d.mts +3 -51
- package/dist/resolver/index.mjs +2 -780
- package/dist/rollups.d.mts +6 -51
- package/dist/rollups.mjs +2 -209
- package/dist/schema.d.mts +2 -1258
- package/dist/schema.mjs +1 -138
- package/package.json +2 -2
|
@@ -1,149 +1,4 @@
|
|
|
1
|
-
import { TableName,
|
|
2
|
-
import { SearchType } from "gscdump/query";
|
|
3
|
-
/**
|
|
4
|
-
* Compaction tier of a manifest entry. Determines which compactor stage may
|
|
5
|
-
* pick it up as input:
|
|
6
|
-
* - `raw`: per-day file produced by `writeDay`. Eligible for raw→d7 merge at 7d.
|
|
7
|
-
* - `d7`: weekly compaction output. Eligible for d7→d30 merge at 30d.
|
|
8
|
-
* - `d30`: monthly compaction output (matches the legacy `monthly/` partition
|
|
9
|
-
* shape — pre-tier entries are read as `d30`). Eligible for d30→d90 at 90d.
|
|
10
|
-
* - `d90`: quarterly cold-tier output. Terminal; never recompacted.
|
|
11
|
-
*
|
|
12
|
-
* Without an explicit tier, entries written before this field landed default
|
|
13
|
-
* to `raw` for `daily/` partitions and `d30` for `monthly/` partitions, so
|
|
14
|
-
* the tiered compactor picks the right inputs without a backfill rewrite.
|
|
15
|
-
*/
|
|
16
|
-
type CompactionTier = 'raw' | 'd7' | 'd30' | 'd90';
|
|
17
|
-
interface ManifestEntry {
|
|
18
|
-
userId: string;
|
|
19
|
-
siteId?: string;
|
|
20
|
-
table: TableName;
|
|
21
|
-
partition: string;
|
|
22
|
-
objectKey: string;
|
|
23
|
-
rowCount: number;
|
|
24
|
-
bytes: number;
|
|
25
|
-
createdAt: number;
|
|
26
|
-
retiredAt?: number;
|
|
27
|
-
/** Table schema version at write time. Omitted on pre-#27 entries — treat as 1. */
|
|
28
|
-
schemaVersion?: number;
|
|
29
|
-
/**
|
|
30
|
-
* Compaction tier. Omitted on entries written before tiered compaction —
|
|
31
|
-
* treat as `raw` for `daily/` partitions and `d30` for `monthly/` partitions
|
|
32
|
-
* (see {@link inferLegacyTier}).
|
|
33
|
-
*/
|
|
34
|
-
tier?: CompactionTier;
|
|
35
|
-
/**
|
|
36
|
-
* GSC search-type this entry covers (web | discover | news | googleNews |
|
|
37
|
-
* image | video). Omitted on entries written before per-type partitioning
|
|
38
|
-
* landed — treat as `web` (see {@link inferSearchType}). Compaction merges
|
|
39
|
-
* only entries with the same searchType.
|
|
40
|
-
*/
|
|
41
|
-
searchType?: SearchType;
|
|
42
|
-
}
|
|
43
|
-
interface ListLiveFilter {
|
|
44
|
-
userId: string;
|
|
45
|
-
siteId?: string;
|
|
46
|
-
table?: TableName;
|
|
47
|
-
partitions?: string[];
|
|
48
|
-
/**
|
|
49
|
-
* Narrow to a single compaction tier. Tier-aware compaction stages set this
|
|
50
|
-
* so the store doesn't have to return (and the caller doesn't have to scan)
|
|
51
|
-
* the entire manifest just to compact the raw cohort. Legacy entries without
|
|
52
|
-
* an explicit `tier` field match on {@link inferLegacyTier}.
|
|
53
|
-
*/
|
|
54
|
-
tier?: CompactionTier;
|
|
55
|
-
}
|
|
56
|
-
interface WatermarkScope {
|
|
57
|
-
userId: string;
|
|
58
|
-
siteId?: string;
|
|
59
|
-
table: TableName;
|
|
60
|
-
}
|
|
61
|
-
interface Watermark extends WatermarkScope {
|
|
62
|
-
newestDateSynced: string;
|
|
63
|
-
oldestDateSynced: string;
|
|
64
|
-
lastSyncAt: number;
|
|
65
|
-
}
|
|
66
|
-
interface WatermarkFilter {
|
|
67
|
-
userId: string;
|
|
68
|
-
siteId?: string;
|
|
69
|
-
table?: TableName;
|
|
70
|
-
}
|
|
71
|
-
type SyncStateKind = 'pending' | 'inflight' | 'done' | 'failed';
|
|
72
|
-
interface SyncStateScope {
|
|
73
|
-
userId: string;
|
|
74
|
-
siteId?: string;
|
|
75
|
-
table: TableName;
|
|
76
|
-
date: string;
|
|
77
|
-
/**
|
|
78
|
-
* GSC search-type this sync state covers. Omitted = `web` (the legacy
|
|
79
|
-
* default; matches pre-#5 sync states stored before per-type sync landed).
|
|
80
|
-
* Lookups must compare via {@link inferSearchType} so a missing field
|
|
81
|
-
* matches an explicit `'web'` and vice versa.
|
|
82
|
-
*/
|
|
83
|
-
searchType?: SearchType;
|
|
84
|
-
}
|
|
85
|
-
interface SyncState extends SyncStateScope {
|
|
86
|
-
state: SyncStateKind;
|
|
87
|
-
updatedAt: number;
|
|
88
|
-
attempts: number;
|
|
89
|
-
error?: string;
|
|
90
|
-
}
|
|
91
|
-
interface SyncStateFilter {
|
|
92
|
-
userId: string;
|
|
93
|
-
siteId?: string;
|
|
94
|
-
table?: TableName;
|
|
95
|
-
state?: SyncStateKind;
|
|
96
|
-
searchType?: SearchType;
|
|
97
|
-
}
|
|
98
|
-
interface SyncStateDetail {
|
|
99
|
-
at?: number;
|
|
100
|
-
error?: string;
|
|
101
|
-
}
|
|
102
|
-
interface LockScope {
|
|
103
|
-
userId: string;
|
|
104
|
-
siteId?: string;
|
|
105
|
-
table: TableName;
|
|
106
|
-
partition: string;
|
|
107
|
-
}
|
|
108
|
-
interface PurgeFilter {
|
|
109
|
-
userId: string;
|
|
110
|
-
siteId?: string;
|
|
111
|
-
}
|
|
112
|
-
interface ManifestPurgeResult {
|
|
113
|
-
entriesRemoved: number;
|
|
114
|
-
watermarksRemoved: number;
|
|
115
|
-
syncStatesRemoved: number;
|
|
116
|
-
}
|
|
117
|
-
interface ManifestStore {
|
|
118
|
-
listLive: (filter: ListLiveFilter) => Promise<ManifestEntry[]>;
|
|
119
|
-
listAll: (filter: ListLiveFilter) => Promise<ManifestEntry[]>;
|
|
120
|
-
registerVersion: (entry: ManifestEntry, superseding?: ManifestEntry[]) => Promise<void>;
|
|
121
|
-
registerVersions: (entries: ManifestEntry[], superseding?: ManifestEntry[]) => Promise<void>;
|
|
122
|
-
listRetired: (olderThan: number) => Promise<ManifestEntry[]>;
|
|
123
|
-
delete: (entries: ManifestEntry[]) => Promise<void>;
|
|
124
|
-
getWatermarks: (filter: WatermarkFilter) => Promise<Watermark[]>;
|
|
125
|
-
bumpWatermark: (scope: WatermarkScope, date: string, at?: number) => Promise<void>;
|
|
126
|
-
getSyncStates: (filter: SyncStateFilter) => Promise<SyncState[]>;
|
|
127
|
-
setSyncState: (scope: SyncStateScope, state: SyncStateKind, detail?: SyncStateDetail) => Promise<void>;
|
|
128
|
-
/**
|
|
129
|
-
* Serialize concurrent writers against the same scope. Held across the
|
|
130
|
-
* write+register window so GC (orphan sweep) won't delete bytes that are
|
|
131
|
-
* midway between `dataSource.write` and `manifestStore.registerVersion`.
|
|
132
|
-
* Scope = tenant × table × partition.
|
|
133
|
-
*/
|
|
134
|
-
withLock: <T>(scope: LockScope, fn: () => Promise<T>) => Promise<T>;
|
|
135
|
-
/**
|
|
136
|
-
* GDPR-grade tenant purge. Removes every manifest entry, watermark, and
|
|
137
|
-
* sync-state record matching the filter. Does NOT touch the underlying
|
|
138
|
-
* data-source bytes; callers (typically {@link StorageEngine.purgeTenant})
|
|
139
|
-
* must sweep the tenant prefix separately before invoking this so that
|
|
140
|
-
* mid-flight failures can't leave orphan parquet with no manifest record.
|
|
141
|
-
*
|
|
142
|
-
* On stores with CAS-backed sharding (R2 manifest) this may issue one
|
|
143
|
-
* mutation per shard. On read-only stores (HTTP) this throws.
|
|
144
|
-
*/
|
|
145
|
-
purgeTenant: (filter: PurgeFilter) => Promise<ManifestPurgeResult>;
|
|
146
|
-
}
|
|
1
|
+
import { N as TableName, m as ManifestStore } from "../_chunks/storage.mjs";
|
|
147
2
|
interface R2ObjectMetadata {
|
|
148
3
|
etag: string;
|
|
149
4
|
}
|
|
@@ -194,17 +49,17 @@ interface R2ManifestBucketLike {
|
|
|
194
49
|
type R2ManifestEvent = {
|
|
195
50
|
kind: 'cas-attempt';
|
|
196
51
|
siteId: string;
|
|
197
|
-
table: TableName
|
|
52
|
+
table: TableName;
|
|
198
53
|
attempt: number;
|
|
199
54
|
} | {
|
|
200
55
|
kind: 'cas-rejected';
|
|
201
56
|
siteId: string;
|
|
202
|
-
table: TableName
|
|
57
|
+
table: TableName;
|
|
203
58
|
attempt: number;
|
|
204
59
|
} | {
|
|
205
60
|
kind: 'cas-committed';
|
|
206
61
|
siteId: string;
|
|
207
|
-
table: TableName
|
|
62
|
+
table: TableName;
|
|
208
63
|
attempts: number;
|
|
209
64
|
};
|
|
210
65
|
interface CreateR2ManifestStoreOptions {
|
|
@@ -1,11 +1,4 @@
|
|
|
1
|
-
function inferSearchType(entry) {
|
|
2
|
-
return entry.searchType ?? "web";
|
|
3
|
-
}
|
|
4
|
-
function inferLegacyTier(entry) {
|
|
5
|
-
if (entry.tier !== void 0) return entry.tier;
|
|
6
|
-
if (entry.partition.startsWith("daily/")) return "raw";
|
|
7
|
-
if (entry.partition.startsWith("monthly/")) return "d30";
|
|
8
|
-
}
|
|
1
|
+
import { i as inferSearchType, r as inferLegacyTier } from "../_chunks/storage.mjs";
|
|
9
2
|
const SHARD_RE = /^u_[^/]+\/manifest\/(?<siteId>[^/]+)\/(?<table>[^/]+)\/HEAD$/;
|
|
10
3
|
function defaultSnapshotId() {
|
|
11
4
|
return `${Date.now()}-${Math.random().toString(36).slice(2, 10)}`;
|
package/dist/adapters/r2.d.mts
CHANGED
|
@@ -1,50 +1,4 @@
|
|
|
1
|
-
interface DataSource {
|
|
2
|
-
read: (key: string, range?: {
|
|
3
|
-
offset: number;
|
|
4
|
-
length: number;
|
|
5
|
-
}, signal?: AbortSignal) => Promise<Uint8Array>;
|
|
6
|
-
write: (key: string, bytes: Uint8Array) => Promise<void>;
|
|
7
|
-
delete: (keys: string[]) => Promise<void>;
|
|
8
|
-
/**
|
|
9
|
-
* One-shot listing under a prefix. Implementations may cap the number of
|
|
10
|
-
* returned keys (typically 10k) — callers iterating full tenant space
|
|
11
|
-
* should prefer `streamList` when available or narrow the prefix.
|
|
12
|
-
*/
|
|
13
|
-
list: (prefix: string) => Promise<string[]>;
|
|
14
|
-
/**
|
|
15
|
-
* Per-key URI probe. Returns a URI string DuckDB's `httpfs` (or an
|
|
16
|
-
* equivalent engine that fetches its own I/O) can read directly, or
|
|
17
|
-
* `undefined` if the key isn't URI-resolvable on this backend and the
|
|
18
|
-
* caller must fall back to `read(key)` for the bytes.
|
|
19
|
-
*
|
|
20
|
-
* Contracts:
|
|
21
|
-
* - When defined, the returned URI MUST yield byte-identical content to
|
|
22
|
-
* `read(key)`. Callers rely on this for correctness.
|
|
23
|
-
* - Backends with a native URI for every key (filesystem: absolute path,
|
|
24
|
-
* R2 via `httpfs`: signed URL) may always return a string.
|
|
25
|
-
* - Backends without a native URI shape (in-memory) omit the method or
|
|
26
|
-
* return `undefined` per call.
|
|
27
|
-
* - Mixed-per-query is allowed: some keys in one query may return a URI,
|
|
28
|
-
* others may not; the executor branches per key.
|
|
29
|
-
*/
|
|
30
|
-
uri?: (key: string) => string | undefined;
|
|
31
|
-
/**
|
|
32
|
-
* Optional — probe the byte size of a key without reading it. Used by
|
|
33
|
-
* the engine to fill in `WriteResult.bytes` when a codec reports 0 or
|
|
34
|
-
* unknown but the file is non-trivial.
|
|
35
|
-
*/
|
|
36
|
-
head?: (key: string) => Promise<{
|
|
37
|
-
bytes: number;
|
|
38
|
-
} | undefined>;
|
|
39
|
-
/**
|
|
40
|
-
* Optional streaming variant of `list`. Implementations that page
|
|
41
|
-
* backing-store results (R2, S3) should implement this and yield keys
|
|
42
|
-
* lazily. `list` may return up to an adapter-defined cap (typically
|
|
43
|
-
* 10k keys); callers iterating full tenant space must prefer
|
|
44
|
-
* `streamList` when available, or chunk by narrower prefixes.
|
|
45
|
-
*/
|
|
46
|
-
streamList?: (prefix: string) => AsyncIterable<string>;
|
|
47
|
-
}
|
|
1
|
+
import { a as DataSource } from "../_chunks/storage.mjs";
|
|
48
2
|
interface R2GetOptions {
|
|
49
3
|
range?: {
|
|
50
4
|
offset: number;
|
package/dist/contracts.d.mts
CHANGED
|
@@ -1,436 +1,2 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { BuilderState, SearchType, SearchType as SearchType$1 } from "gscdump/query";
|
|
3
|
-
/**
|
|
4
|
-
* Per-tier age threshold in days. Default ladder collapses on these gates:
|
|
5
|
-
* - raw → d7 once a daily file is older than `raw` days (default 7).
|
|
6
|
-
* - d7 → d30 once the entire weekly bucket sits behind `d7` days (default 30).
|
|
7
|
-
* - d30 → d90 once the entire monthly bucket sits behind `d30` days (default 90).
|
|
8
|
-
*/
|
|
9
|
-
interface CompactionThresholds {
|
|
10
|
-
raw?: number;
|
|
11
|
-
d7?: number;
|
|
12
|
-
d30?: number;
|
|
13
|
-
}
|
|
14
|
-
type ComparisonFilter = 'new' | 'lost' | 'improving' | 'declining';
|
|
15
|
-
interface WriteCtx extends TenantCtx$1 {
|
|
16
|
-
table: TableName$1;
|
|
17
|
-
date?: string;
|
|
18
|
-
now?: () => number;
|
|
19
|
-
/**
|
|
20
|
-
* GSC search-type partition this write belongs to. Defaults to `'web'`.
|
|
21
|
-
* Non-web values (`discover`, `news`, `googleNews`, `image`, `video`)
|
|
22
|
-
* cause the writer to insert the type into the object key path so files
|
|
23
|
-
* for different search types coexist without colliding.
|
|
24
|
-
*/
|
|
25
|
-
searchType?: SearchType$1;
|
|
26
|
-
}
|
|
27
|
-
interface QueryCtx extends TenantCtx$1 {
|
|
28
|
-
table?: TableName$1;
|
|
29
|
-
signal?: AbortSignal;
|
|
30
|
-
}
|
|
31
|
-
interface GcCtx {
|
|
32
|
-
now?: () => number;
|
|
33
|
-
userId?: string;
|
|
34
|
-
siteId?: string;
|
|
35
|
-
}
|
|
36
|
-
/**
|
|
37
|
-
* Compaction tier of a manifest entry. Determines which compactor stage may
|
|
38
|
-
* pick it up as input:
|
|
39
|
-
* - `raw`: per-day file produced by `writeDay`. Eligible for raw→d7 merge at 7d.
|
|
40
|
-
* - `d7`: weekly compaction output. Eligible for d7→d30 merge at 30d.
|
|
41
|
-
* - `d30`: monthly compaction output (matches the legacy `monthly/` partition
|
|
42
|
-
* shape — pre-tier entries are read as `d30`). Eligible for d30→d90 at 90d.
|
|
43
|
-
* - `d90`: quarterly cold-tier output. Terminal; never recompacted.
|
|
44
|
-
*
|
|
45
|
-
* Without an explicit tier, entries written before this field landed default
|
|
46
|
-
* to `raw` for `daily/` partitions and `d30` for `monthly/` partitions, so
|
|
47
|
-
* the tiered compactor picks the right inputs without a backfill rewrite.
|
|
48
|
-
*/
|
|
49
|
-
type CompactionTier = 'raw' | 'd7' | 'd30' | 'd90';
|
|
50
|
-
interface ManifestEntry {
|
|
51
|
-
userId: string;
|
|
52
|
-
siteId?: string;
|
|
53
|
-
table: TableName$1;
|
|
54
|
-
partition: string;
|
|
55
|
-
objectKey: string;
|
|
56
|
-
rowCount: number;
|
|
57
|
-
bytes: number;
|
|
58
|
-
createdAt: number;
|
|
59
|
-
retiredAt?: number;
|
|
60
|
-
/** Table schema version at write time. Omitted on pre-#27 entries — treat as 1. */
|
|
61
|
-
schemaVersion?: number;
|
|
62
|
-
/**
|
|
63
|
-
* Compaction tier. Omitted on entries written before tiered compaction —
|
|
64
|
-
* treat as `raw` for `daily/` partitions and `d30` for `monthly/` partitions
|
|
65
|
-
* (see {@link inferLegacyTier}).
|
|
66
|
-
*/
|
|
67
|
-
tier?: CompactionTier;
|
|
68
|
-
/**
|
|
69
|
-
* GSC search-type this entry covers (web | discover | news | googleNews |
|
|
70
|
-
* image | video). Omitted on entries written before per-type partitioning
|
|
71
|
-
* landed — treat as `web` (see {@link inferSearchType}). Compaction merges
|
|
72
|
-
* only entries with the same searchType.
|
|
73
|
-
*/
|
|
74
|
-
searchType?: SearchType$1;
|
|
75
|
-
}
|
|
76
|
-
interface ListLiveFilter {
|
|
77
|
-
userId: string;
|
|
78
|
-
siteId?: string;
|
|
79
|
-
table?: TableName$1;
|
|
80
|
-
partitions?: string[];
|
|
81
|
-
/**
|
|
82
|
-
* Narrow to a single compaction tier. Tier-aware compaction stages set this
|
|
83
|
-
* so the store doesn't have to return (and the caller doesn't have to scan)
|
|
84
|
-
* the entire manifest just to compact the raw cohort. Legacy entries without
|
|
85
|
-
* an explicit `tier` field match on {@link inferLegacyTier}.
|
|
86
|
-
*/
|
|
87
|
-
tier?: CompactionTier;
|
|
88
|
-
}
|
|
89
|
-
interface DataSource {
|
|
90
|
-
read: (key: string, range?: {
|
|
91
|
-
offset: number;
|
|
92
|
-
length: number;
|
|
93
|
-
}, signal?: AbortSignal) => Promise<Uint8Array>;
|
|
94
|
-
write: (key: string, bytes: Uint8Array) => Promise<void>;
|
|
95
|
-
delete: (keys: string[]) => Promise<void>;
|
|
96
|
-
/**
|
|
97
|
-
* One-shot listing under a prefix. Implementations may cap the number of
|
|
98
|
-
* returned keys (typically 10k) — callers iterating full tenant space
|
|
99
|
-
* should prefer `streamList` when available or narrow the prefix.
|
|
100
|
-
*/
|
|
101
|
-
list: (prefix: string) => Promise<string[]>;
|
|
102
|
-
/**
|
|
103
|
-
* Per-key URI probe. Returns a URI string DuckDB's `httpfs` (or an
|
|
104
|
-
* equivalent engine that fetches its own I/O) can read directly, or
|
|
105
|
-
* `undefined` if the key isn't URI-resolvable on this backend and the
|
|
106
|
-
* caller must fall back to `read(key)` for the bytes.
|
|
107
|
-
*
|
|
108
|
-
* Contracts:
|
|
109
|
-
* - When defined, the returned URI MUST yield byte-identical content to
|
|
110
|
-
* `read(key)`. Callers rely on this for correctness.
|
|
111
|
-
* - Backends with a native URI for every key (filesystem: absolute path,
|
|
112
|
-
* R2 via `httpfs`: signed URL) may always return a string.
|
|
113
|
-
* - Backends without a native URI shape (in-memory) omit the method or
|
|
114
|
-
* return `undefined` per call.
|
|
115
|
-
* - Mixed-per-query is allowed: some keys in one query may return a URI,
|
|
116
|
-
* others may not; the executor branches per key.
|
|
117
|
-
*/
|
|
118
|
-
uri?: (key: string) => string | undefined;
|
|
119
|
-
/**
|
|
120
|
-
* Optional — probe the byte size of a key without reading it. Used by
|
|
121
|
-
* the engine to fill in `WriteResult.bytes` when a codec reports 0 or
|
|
122
|
-
* unknown but the file is non-trivial.
|
|
123
|
-
*/
|
|
124
|
-
head?: (key: string) => Promise<{
|
|
125
|
-
bytes: number;
|
|
126
|
-
} | undefined>;
|
|
127
|
-
/**
|
|
128
|
-
* Optional streaming variant of `list`. Implementations that page
|
|
129
|
-
* backing-store results (R2, S3) should implement this and yield keys
|
|
130
|
-
* lazily. `list` may return up to an adapter-defined cap (typically
|
|
131
|
-
* 10k keys); callers iterating full tenant space must prefer
|
|
132
|
-
* `streamList` when available, or chunk by narrower prefixes.
|
|
133
|
-
*/
|
|
134
|
-
streamList?: (prefix: string) => AsyncIterable<string>;
|
|
135
|
-
}
|
|
136
|
-
interface WatermarkScope {
|
|
137
|
-
userId: string;
|
|
138
|
-
siteId?: string;
|
|
139
|
-
table: TableName$1;
|
|
140
|
-
}
|
|
141
|
-
interface Watermark extends WatermarkScope {
|
|
142
|
-
newestDateSynced: string;
|
|
143
|
-
oldestDateSynced: string;
|
|
144
|
-
lastSyncAt: number;
|
|
145
|
-
}
|
|
146
|
-
interface WatermarkFilter {
|
|
147
|
-
userId: string;
|
|
148
|
-
siteId?: string;
|
|
149
|
-
table?: TableName$1;
|
|
150
|
-
}
|
|
151
|
-
type SyncStateKind = 'pending' | 'inflight' | 'done' | 'failed';
|
|
152
|
-
interface SyncStateScope {
|
|
153
|
-
userId: string;
|
|
154
|
-
siteId?: string;
|
|
155
|
-
table: TableName$1;
|
|
156
|
-
date: string;
|
|
157
|
-
/**
|
|
158
|
-
* GSC search-type this sync state covers. Omitted = `web` (the legacy
|
|
159
|
-
* default; matches pre-#5 sync states stored before per-type sync landed).
|
|
160
|
-
* Lookups must compare via {@link inferSearchType} so a missing field
|
|
161
|
-
* matches an explicit `'web'` and vice versa.
|
|
162
|
-
*/
|
|
163
|
-
searchType?: SearchType$1;
|
|
164
|
-
}
|
|
165
|
-
interface SyncState extends SyncStateScope {
|
|
166
|
-
state: SyncStateKind;
|
|
167
|
-
updatedAt: number;
|
|
168
|
-
attempts: number;
|
|
169
|
-
error?: string;
|
|
170
|
-
}
|
|
171
|
-
interface SyncStateFilter {
|
|
172
|
-
userId: string;
|
|
173
|
-
siteId?: string;
|
|
174
|
-
table?: TableName$1;
|
|
175
|
-
state?: SyncStateKind;
|
|
176
|
-
searchType?: SearchType$1;
|
|
177
|
-
}
|
|
178
|
-
interface SyncStateDetail {
|
|
179
|
-
at?: number;
|
|
180
|
-
error?: string;
|
|
181
|
-
}
|
|
182
|
-
interface LockScope {
|
|
183
|
-
userId: string;
|
|
184
|
-
siteId?: string;
|
|
185
|
-
table: TableName$1;
|
|
186
|
-
partition: string;
|
|
187
|
-
}
|
|
188
|
-
interface PurgeFilter {
|
|
189
|
-
userId: string;
|
|
190
|
-
siteId?: string;
|
|
191
|
-
}
|
|
192
|
-
interface ManifestPurgeResult {
|
|
193
|
-
entriesRemoved: number;
|
|
194
|
-
watermarksRemoved: number;
|
|
195
|
-
syncStatesRemoved: number;
|
|
196
|
-
}
|
|
197
|
-
interface PurgeResult {
|
|
198
|
-
userId: string;
|
|
199
|
-
siteId?: string;
|
|
200
|
-
prefix: string;
|
|
201
|
-
objectsDeleted: number;
|
|
202
|
-
entriesRemoved: number;
|
|
203
|
-
watermarksRemoved: number;
|
|
204
|
-
syncStatesRemoved: number;
|
|
205
|
-
at: number;
|
|
206
|
-
}
|
|
207
|
-
interface PurgeUrlsResult {
|
|
208
|
-
userId: string;
|
|
209
|
-
siteId?: string;
|
|
210
|
-
urlsRequested: number;
|
|
211
|
-
entriesRewritten: number;
|
|
212
|
-
rowsRemoved: number;
|
|
213
|
-
bytesAfter: number;
|
|
214
|
-
at: number;
|
|
215
|
-
}
|
|
216
|
-
interface ManifestStore {
|
|
217
|
-
listLive: (filter: ListLiveFilter) => Promise<ManifestEntry[]>;
|
|
218
|
-
listAll: (filter: ListLiveFilter) => Promise<ManifestEntry[]>;
|
|
219
|
-
registerVersion: (entry: ManifestEntry, superseding?: ManifestEntry[]) => Promise<void>;
|
|
220
|
-
registerVersions: (entries: ManifestEntry[], superseding?: ManifestEntry[]) => Promise<void>;
|
|
221
|
-
listRetired: (olderThan: number) => Promise<ManifestEntry[]>;
|
|
222
|
-
delete: (entries: ManifestEntry[]) => Promise<void>;
|
|
223
|
-
getWatermarks: (filter: WatermarkFilter) => Promise<Watermark[]>;
|
|
224
|
-
bumpWatermark: (scope: WatermarkScope, date: string, at?: number) => Promise<void>;
|
|
225
|
-
getSyncStates: (filter: SyncStateFilter) => Promise<SyncState[]>;
|
|
226
|
-
setSyncState: (scope: SyncStateScope, state: SyncStateKind, detail?: SyncStateDetail) => Promise<void>;
|
|
227
|
-
/**
|
|
228
|
-
* Serialize concurrent writers against the same scope. Held across the
|
|
229
|
-
* write+register window so GC (orphan sweep) won't delete bytes that are
|
|
230
|
-
* midway between `dataSource.write` and `manifestStore.registerVersion`.
|
|
231
|
-
* Scope = tenant × table × partition.
|
|
232
|
-
*/
|
|
233
|
-
withLock: <T>(scope: LockScope, fn: () => Promise<T>) => Promise<T>;
|
|
234
|
-
/**
|
|
235
|
-
* GDPR-grade tenant purge. Removes every manifest entry, watermark, and
|
|
236
|
-
* sync-state record matching the filter. Does NOT touch the underlying
|
|
237
|
-
* data-source bytes; callers (typically {@link StorageEngine.purgeTenant})
|
|
238
|
-
* must sweep the tenant prefix separately before invoking this so that
|
|
239
|
-
* mid-flight failures can't leave orphan parquet with no manifest record.
|
|
240
|
-
*
|
|
241
|
-
* On stores with CAS-backed sharding (R2 manifest) this may issue one
|
|
242
|
-
* mutation per shard. On read-only stores (HTTP) this throws.
|
|
243
|
-
*/
|
|
244
|
-
purgeTenant: (filter: PurgeFilter) => Promise<ManifestPurgeResult>;
|
|
245
|
-
}
|
|
246
|
-
interface WriteResult {
|
|
247
|
-
bytes: number;
|
|
248
|
-
rowCount: number;
|
|
249
|
-
}
|
|
250
|
-
interface CodecCtx {
|
|
251
|
-
table: TableName$1;
|
|
252
|
-
}
|
|
253
|
-
/**
|
|
254
|
-
* Key-oriented codec. Each method owns its I/O through `dataSource`:
|
|
255
|
-
* - Node / browser codecs read/write bytes via `dataSource.read` / `.write`.
|
|
256
|
-
* - Workers codecs let DuckDB's httpfs read/write remote URIs directly (via
|
|
257
|
-
* `dataSource.uri`) and never materialise bytes in JS.
|
|
258
|
-
*
|
|
259
|
-
* The engine never touches bytes; it just hands rows + keys to the codec.
|
|
260
|
-
*
|
|
261
|
-
* Invariants every implementation MUST uphold:
|
|
262
|
-
* - `writeRows` with an empty `rows` array MUST still write a file
|
|
263
|
-
* carrying the canonical column set for `ctx.table` — a schema-correct
|
|
264
|
-
* empty file. No placeholder-column shortcuts; readers depend on the
|
|
265
|
-
* schema being present for `union_by_name` merges.
|
|
266
|
-
* - `WriteResult.bytes` MUST be the real byte size written to the
|
|
267
|
-
* data source (not 0, not an estimate) so the engine can enforce the
|
|
268
|
-
* payload ceiling without a second `head` round-trip.
|
|
269
|
-
* - `WriteResult.rowCount` MUST equal `rows.length` (or, for
|
|
270
|
-
* `compactRows`, the sum of input row counts).
|
|
271
|
-
*/
|
|
272
|
-
interface ParquetCodec {
|
|
273
|
-
writeRows: (ctx: CodecCtx, rows: Row$1[], key: string, dataSource: DataSource) => Promise<WriteResult>;
|
|
274
|
-
readRows: (ctx: CodecCtx, key: string, dataSource: DataSource) => Promise<Row$1[]>;
|
|
275
|
-
compactRows: (ctx: CodecCtx, inputKeys: string[], outputKey: string, dataSource: DataSource) => Promise<WriteResult>;
|
|
276
|
-
}
|
|
277
|
-
interface QueryResult {
|
|
278
|
-
rows: Row$1[];
|
|
279
|
-
sql: string;
|
|
280
|
-
objectKeys: string[];
|
|
281
|
-
}
|
|
282
|
-
interface ComparisonResult {
|
|
283
|
-
rows: Row$1[];
|
|
284
|
-
totalCount: number;
|
|
285
|
-
totals: Record<string, unknown>;
|
|
286
|
-
}
|
|
287
|
-
interface ExtraResult {
|
|
288
|
-
key: string;
|
|
289
|
-
rows: Row$1[];
|
|
290
|
-
}
|
|
291
|
-
interface QueryExecuteOptions {
|
|
292
|
-
sql: string;
|
|
293
|
-
params: unknown[];
|
|
294
|
-
/**
|
|
295
|
-
* Named placeholder → object keys. The executor substitutes `{{NAME}}`
|
|
296
|
-
* occurrences in the SQL with the matching `read_parquet([...])` list,
|
|
297
|
-
* choosing between virtual-FS names or native URIs based on whether
|
|
298
|
-
* `dataSource.uri` is available.
|
|
299
|
-
*/
|
|
300
|
-
fileKeys: Record<string, string[]>;
|
|
301
|
-
dataSource: DataSource;
|
|
302
|
-
table: TableName$1;
|
|
303
|
-
signal?: AbortSignal;
|
|
304
|
-
/**
|
|
305
|
-
* Optional callback invoked by the executor when it detects the DuckDB
|
|
306
|
-
* process is approaching a memory ceiling (e.g. ingesting rows after
|
|
307
|
-
* httpfs decode, or materialising a large temp relation). Callers can
|
|
308
|
-
* shed work, warm a spillover path, or warn the user. Advisory only —
|
|
309
|
-
* not all executors implement it.
|
|
310
|
-
*/
|
|
311
|
-
onMemoryPressure?: (info: {
|
|
312
|
-
bytes?: number;
|
|
313
|
-
reason: string;
|
|
314
|
-
}) => void;
|
|
315
|
-
}
|
|
316
|
-
interface QueryExecuteResult {
|
|
317
|
-
rows: Row$1[];
|
|
318
|
-
/** The final SQL actually run (after placeholder substitution). */
|
|
319
|
-
sql: string;
|
|
320
|
-
/**
|
|
321
|
-
* Optional diagnostics the executor may emit for observability + capacity
|
|
322
|
-
* planning. Undefined on executors that don't instrument their runtime.
|
|
323
|
-
*
|
|
324
|
-
* - `peakBytes`: highest resident memory the engine reported during the
|
|
325
|
-
* query. Callers may use this to decide whether to drop / compact state
|
|
326
|
-
* before the next call.
|
|
327
|
-
* - `resetRecommended`: executor thinks the underlying connection should
|
|
328
|
-
* be recycled (fragmented, near ceiling). Caller-owned decision —
|
|
329
|
-
* honored by `BrowserAnalysisRuntime` consumers but not enforced.
|
|
330
|
-
*/
|
|
331
|
-
diagnostics?: {
|
|
332
|
-
peakBytes?: number;
|
|
333
|
-
resetRecommended?: boolean;
|
|
334
|
-
};
|
|
335
|
-
}
|
|
336
|
-
interface QueryExecutor {
|
|
337
|
-
execute: (opts: QueryExecuteOptions) => Promise<QueryExecuteResult>;
|
|
338
|
-
}
|
|
339
|
-
interface FileSetRef {
|
|
340
|
-
table: TableName$1;
|
|
341
|
-
partitions?: string[];
|
|
342
|
-
}
|
|
343
|
-
interface RunSQLOptions {
|
|
344
|
-
ctx: TenantCtx$1;
|
|
345
|
-
/**
|
|
346
|
-
* Named partition references. Each name becomes a `{{NAME}}` placeholder
|
|
347
|
-
* substituted into the SQL with the matching list of object keys. The
|
|
348
|
-
* canonical name is `FILES`; analyzers also use `FILES_PREV` for a prior
|
|
349
|
-
* window. Providing zero fileSets runs the SQL against no files.
|
|
350
|
-
*/
|
|
351
|
-
fileSets: Record<string, FileSetRef>;
|
|
352
|
-
/** Schema-bearing table; defaults to the first fileSet's table. */
|
|
353
|
-
table?: TableName$1;
|
|
354
|
-
sql: string;
|
|
355
|
-
params?: unknown[];
|
|
356
|
-
signal?: AbortSignal;
|
|
357
|
-
}
|
|
358
|
-
interface StorageEngine {
|
|
359
|
-
writeDay: (ctx: WriteCtx, rows: Row$1[]) => Promise<void>;
|
|
360
|
-
query: (ctx: QueryCtx, state: BuilderState) => Promise<QueryResult>;
|
|
361
|
-
/**
|
|
362
|
-
* Two-window comparison query (resolver-compiled). Joins a `current` and
|
|
363
|
-
* `previous` window CTE on dimensions, applies an optional row filter
|
|
364
|
-
* (`new`/`lost`/`improving`/`declining`), and returns the merged rows plus
|
|
365
|
-
* total count and unfiltered totals.
|
|
366
|
-
*
|
|
367
|
-
* Tenant scoping comes from `ctx.userId`/`ctx.siteId` (manifest lookup) —
|
|
368
|
-
* the SQL itself is single-tenant against the parquet adapter, which has
|
|
369
|
-
* `includeSiteId: false`.
|
|
370
|
-
*
|
|
371
|
-
* Throws if `current` and `previous` resolve to different tables.
|
|
372
|
-
*/
|
|
373
|
-
queryComparison: (ctx: QueryCtx, current: BuilderState, previous: BuilderState, filter?: ComparisonFilter) => Promise<ComparisonResult>;
|
|
374
|
-
/**
|
|
375
|
-
* Canonical-variant enrichment queries. Returns one result per extra
|
|
376
|
-
* surface; today only `queryCanonical` triggers an extra. Empty array
|
|
377
|
-
* when the state has no extras-eligible dimensions.
|
|
378
|
-
*/
|
|
379
|
-
queryExtras: (ctx: QueryCtx, state: BuilderState) => Promise<ExtraResult[]>;
|
|
380
|
-
/**
|
|
381
|
-
* Run arbitrary SQL resolved against named partition sets. Composes
|
|
382
|
-
* manifest lookup + object reads + placeholder substitution + execution
|
|
383
|
-
* so callers don't need to reach into `ManifestStore`/`DataSource`
|
|
384
|
-
* directly.
|
|
385
|
-
*/
|
|
386
|
-
runSQL: (opts: RunSQLOptions) => Promise<QueryResult>;
|
|
387
|
-
compactTiered: (ctx: WriteCtx, thresholds?: CompactionThresholds) => Promise<void>;
|
|
388
|
-
gcOrphans: (ctx: GcCtx, graceMs: number) => Promise<{
|
|
389
|
-
deleted: number;
|
|
390
|
-
}>;
|
|
391
|
-
/**
|
|
392
|
-
* GDPR-grade tenant purge. Deletes every object under the tenant prefix
|
|
393
|
-
* (parquet, rollups, entity stores), then removes manifest/watermark/
|
|
394
|
-
* sync-state records via {@link ManifestStore.purgeTenant}.
|
|
395
|
-
*
|
|
396
|
-
* Order matters: bytes are deleted before manifest entries, so a
|
|
397
|
-
* crash mid-purge leaves orphan manifest records (detectable via the
|
|
398
|
-
* normal orphan sweep) rather than orphan bytes with no record.
|
|
399
|
-
*
|
|
400
|
-
* Returns counters suitable for an audit log. Caller is responsible
|
|
401
|
-
* for persisting the audit entry.
|
|
402
|
-
*/
|
|
403
|
-
purgeTenant: (ctx: TenantCtx$1) => Promise<PurgeResult>;
|
|
404
|
-
/**
|
|
405
|
-
* GDPR URL-matcher purge. Deletes rows whose `url` column matches one of
|
|
406
|
-
* `urls` across every live parquet entry for the tenant in tables that
|
|
407
|
-
* carry a `url` column (`pages`, `page_keywords`). Tables without a `url`
|
|
408
|
-
* column (`keywords`, `countries`, `devices`, `search_appearance`) are
|
|
409
|
-
* untouched — they never store per-URL data.
|
|
410
|
-
*
|
|
411
|
-
* For each affected entry the engine reads the file, filters the matching
|
|
412
|
-
* rows out, writes a replacement parquet at a new object key, and registers
|
|
413
|
-
* the new entry as a supersede of the old. Entries with no matches are
|
|
414
|
-
* left untouched. Entries with all rows matching are replaced by a
|
|
415
|
-
* schema-bearing empty-rows file.
|
|
416
|
-
*
|
|
417
|
-
* Narrower counterpart to {@link purgeTenant}: use this for a per-URL
|
|
418
|
-
* takedown request; use `purgeTenant` for full-account deletion.
|
|
419
|
-
*/
|
|
420
|
-
purgeUrls: (ctx: TenantCtx$1, urls: readonly string[]) => Promise<PurgeUrlsResult>;
|
|
421
|
-
listLive: (filter: ListLiveFilter) => Promise<ManifestEntry[]>;
|
|
422
|
-
listAll: (filter: ListLiveFilter) => Promise<ManifestEntry[]>;
|
|
423
|
-
getWatermarks: (filter: WatermarkFilter) => Promise<Watermark[]>;
|
|
424
|
-
getSyncStates: (filter: SyncStateFilter) => Promise<SyncState[]>;
|
|
425
|
-
setSyncState: (scope: SyncStateScope, state: SyncStateKind, detail?: SyncStateDetail) => Promise<void>;
|
|
426
|
-
/** Read the raw bytes of a single object. Rarely needed outside the `dump` CLI. */
|
|
427
|
-
readObject: (key: string) => Promise<Uint8Array>;
|
|
428
|
-
}
|
|
429
|
-
interface EngineOptions {
|
|
430
|
-
dataSource: DataSource;
|
|
431
|
-
manifestStore: ManifestStore;
|
|
432
|
-
codec: ParquetCodec;
|
|
433
|
-
executor: QueryExecutor;
|
|
434
|
-
now?: () => number;
|
|
435
|
-
}
|
|
1
|
+
import { A as SyncStateFilter, C as QueryResult, D as StorageEngine, E as SearchType, F as Watermark, I as WatermarkFilter, L as WatermarkScope, M as SyncStateScope, N as TableName, O as SyncState, P as TenantCtx, R as WriteCtx, S as QueryExecutor, T as RunSQLOptions, a as DataSource, b as QueryExecuteOptions, c as FileSetRef, d as LockScope, f as ManifestEntry, h as ParquetCodec, j as SyncStateKind, k as SyncStateDetail, l as GcCtx, m as ManifestStore, n as CompactionTier, o as EngineOptions, t as CodecCtx, u as ListLiveFilter, w as Row, x as QueryExecuteResult, y as QueryCtx, z as WriteResult } from "./_chunks/storage.mjs";
|
|
436
2
|
export { type CodecCtx, type CompactionTier, type DataSource, type EngineOptions, type FileSetRef, type GcCtx, type ListLiveFilter, type LockScope, type ManifestEntry, type ManifestStore, type ParquetCodec, type QueryCtx, type QueryExecuteOptions, type QueryExecuteResult, type QueryExecutor, type QueryResult, type Row, type RunSQLOptions, type SearchType, type StorageEngine, type SyncState, type SyncStateDetail, type SyncStateFilter, type SyncStateKind, type SyncStateScope, type TableName, type TenantCtx, type Watermark, type WatermarkFilter, type WatermarkScope, type WriteCtx, type WriteResult };
|