@gscdump/engine 0.6.1 → 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/dist/_chunks/compiler.mjs +288 -0
  2. package/dist/_chunks/duckdb.d.mts +26 -0
  3. package/dist/_chunks/engine.mjs +578 -0
  4. package/dist/_chunks/pg-adapter.mjs +676 -0
  5. package/dist/_chunks/planner.d.mts +15 -0
  6. package/dist/_chunks/schema.d.mts +1258 -0
  7. package/dist/_chunks/schema.mjs +139 -0
  8. package/dist/_chunks/storage.d.mts +476 -0
  9. package/dist/_chunks/storage.mjs +39 -0
  10. package/dist/_chunks/types.d.mts +53 -0
  11. package/dist/adapters/duckdb-node.d.mts +1 -13
  12. package/dist/adapters/duckdb-node.mjs +1 -7
  13. package/dist/adapters/filesystem.d.mts +1 -193
  14. package/dist/adapters/filesystem.mjs +2 -9
  15. package/dist/adapters/http.d.mts +1 -193
  16. package/dist/adapters/http.mjs +1 -5
  17. package/dist/adapters/hyparquet.d.mts +6 -83
  18. package/dist/adapters/hyparquet.mjs +1 -105
  19. package/dist/adapters/inspection-sqlite-browser.d.mts +1 -7
  20. package/dist/adapters/inspection-sqlite-node.d.mts +1 -7
  21. package/dist/adapters/inspection-sqlite-node.mjs +1 -1
  22. package/dist/adapters/node-harness.d.mts +3 -306
  23. package/dist/adapters/node-harness.mjs +4 -1866
  24. package/dist/adapters/r2-manifest.d.mts +4 -149
  25. package/dist/adapters/r2-manifest.mjs +1 -8
  26. package/dist/adapters/r2.d.mts +1 -47
  27. package/dist/contracts.d.mts +1 -435
  28. package/dist/entities.d.mts +1 -47
  29. package/dist/index.d.mts +8 -1844
  30. package/dist/index.mjs +8 -1962
  31. package/dist/ingest.d.mts +1 -1
  32. package/dist/planner.d.mts +3 -16
  33. package/dist/planner.mjs +1 -320
  34. package/dist/resolver/index.d.mts +3 -51
  35. package/dist/resolver/index.mjs +2 -780
  36. package/dist/rollups.d.mts +6 -51
  37. package/dist/rollups.mjs +2 -209
  38. package/dist/schema.d.mts +2 -1258
  39. package/dist/schema.mjs +1 -138
  40. package/package.json +2 -2
@@ -0,0 +1,53 @@
1
+ import { LogicalDataset, PlannerCapabilities } from "gscdump/query/plan";
2
+ import { SQL } from "drizzle-orm";
3
+ import { Dimension, InternalFilter, Metric } from "gscdump/query";
4
+ interface ResolverAdapter<TableKey extends string = string> {
5
+ readonly METRIC_NAMES: readonly Metric[];
6
+ readonly capabilities: PlannerCapabilities;
7
+ readonly schema: Record<TableKey, unknown>;
8
+ tableKeyForDataset: (dataset: LogicalDataset) => TableKey;
9
+ inferTable: (dimensions: Dimension[], filterDims?: Dimension[]) => TableKey;
10
+ dimColumn: (dim: Dimension, tableKey: TableKey) => string;
11
+ isMetricDimension: (dim: string) => dim is Metric;
12
+ tableRef: (tableKey: TableKey) => SQL;
13
+ dateColRef: (tableKey: TableKey) => SQL;
14
+ urlToPathExpr: (col: string) => string;
15
+ siteIdColRef?: (tableKey: TableKey) => SQL;
16
+ dimExprSql: (dim: Dimension, tableKey: TableKey) => SQL;
17
+ metricSql: (metric: Metric, tableKey: TableKey) => SQL;
18
+ dimensionPredicates: (filters: InternalFilter[], tableKey: TableKey) => SQL[];
19
+ havingPredicates: (filters: InternalFilter[], tableKey: TableKey) => SQL[];
20
+ topLevelPredicate: (filters: InternalFilter[], tableKey: TableKey) => SQL | undefined;
21
+ compile: (query: SQL) => {
22
+ sql: string;
23
+ params: unknown[];
24
+ };
25
+ }
26
+ type ComparisonFilter = 'new' | 'lost' | 'improving' | 'declining';
27
+ interface ResolverOptions<TableKey extends string = string> {
28
+ adapter: ResolverAdapter<TableKey>;
29
+ /** Optional site scope. Required for multi-tenant D1; omitted for parquet. */
30
+ siteId?: string | number;
31
+ }
32
+ interface ResolvedSQL {
33
+ sql: string;
34
+ params: unknown[];
35
+ countSql: string;
36
+ countParams: unknown[];
37
+ }
38
+ interface ResolvedSQLOptimized {
39
+ sql: string;
40
+ params: unknown[];
41
+ }
42
+ interface ResolvedComparisonSQL {
43
+ sql: string;
44
+ params: unknown[];
45
+ countSql: string;
46
+ countParams: unknown[];
47
+ }
48
+ interface ExtraQuery {
49
+ key: string;
50
+ sql: string;
51
+ params: unknown[];
52
+ }
53
+ export { ResolvedSQLOptimized as a, ResolvedSQL as i, ExtraQuery as n, ResolverAdapter as o, ResolvedComparisonSQL as r, ResolverOptions as s, ComparisonFilter as t };
@@ -1,16 +1,4 @@
1
- import { Row } from "gscdump/contracts";
2
- interface DuckDBHandle {
3
- query: (sql: string, params?: unknown[]) => Promise<Row[]>;
4
- registerFileBuffer: (name: string, bytes: Uint8Array) => Promise<void>;
5
- copyFileToBuffer: (name: string) => Promise<Uint8Array>;
6
- dropFiles: (names: string[]) => Promise<void>;
7
- /**
8
- * Returns a unique path suitable for `COPY TO '…'` + `copyFileToBuffer`.
9
- * In Node this is an absolute path under `os.tmpdir()` so DuckDB doesn't
10
- * litter the CWD; in browsers/Workers it's a plain virtual-FS name.
11
- */
12
- makeTempPath: (ext: string) => string;
13
- }
1
+ import { n as DuckDBHandle } from "../_chunks/duckdb.mjs";
14
2
  interface NodeDuckDBOptions {
15
3
  verbose?: boolean;
16
4
  }
@@ -1,3 +1,4 @@
1
+ import { arrowToRows } from "../arrow-utils.mjs";
1
2
  import { createRequire } from "node:module";
2
3
  import { unlinkSync } from "node:fs";
3
4
  import { tmpdir } from "node:os";
@@ -5,13 +6,6 @@ import { join } from "node:path";
5
6
  import process from "node:process";
6
7
  import { fileURLToPath } from "node:url";
7
8
  import { ConsoleLogger, NODE_RUNTIME, VoidLogger, createDuckDB } from "@duckdb/duckdb-wasm/dist/duckdb-node-blocking.cjs";
8
- function arrowToRows(result) {
9
- const r = result;
10
- const arr = Array.isArray(r) ? r : typeof r?.toArray === "function" ? r.toArray() : [];
11
- if (!arr || arr.length === 0) return [];
12
- if (typeof arr[0]?.toJSON === "function") return arr.map((r) => r.toJSON());
13
- return arr;
14
- }
15
9
  const require_ = createRequire(typeof __filename !== "undefined" ? __filename : typeof import.meta !== "undefined" ? fileURLToPath(import.meta.url) : process.cwd());
16
10
  let singleton = null;
17
11
  function bundles() {
@@ -1,196 +1,4 @@
1
- import { TableName } from "gscdump/contracts";
2
- import { SearchType } from "gscdump/query";
3
- /**
4
- * Compaction tier of a manifest entry. Determines which compactor stage may
5
- * pick it up as input:
6
- * - `raw`: per-day file produced by `writeDay`. Eligible for raw→d7 merge at 7d.
7
- * - `d7`: weekly compaction output. Eligible for d7→d30 merge at 30d.
8
- * - `d30`: monthly compaction output (matches the legacy `monthly/` partition
9
- * shape — pre-tier entries are read as `d30`). Eligible for d30→d90 at 90d.
10
- * - `d90`: quarterly cold-tier output. Terminal; never recompacted.
11
- *
12
- * Without an explicit tier, entries written before this field landed default
13
- * to `raw` for `daily/` partitions and `d30` for `monthly/` partitions, so
14
- * the tiered compactor picks the right inputs without a backfill rewrite.
15
- */
16
- type CompactionTier = 'raw' | 'd7' | 'd30' | 'd90';
17
- interface ManifestEntry {
18
- userId: string;
19
- siteId?: string;
20
- table: TableName;
21
- partition: string;
22
- objectKey: string;
23
- rowCount: number;
24
- bytes: number;
25
- createdAt: number;
26
- retiredAt?: number;
27
- /** Table schema version at write time. Omitted on pre-#27 entries — treat as 1. */
28
- schemaVersion?: number;
29
- /**
30
- * Compaction tier. Omitted on entries written before tiered compaction —
31
- * treat as `raw` for `daily/` partitions and `d30` for `monthly/` partitions
32
- * (see {@link inferLegacyTier}).
33
- */
34
- tier?: CompactionTier;
35
- /**
36
- * GSC search-type this entry covers (web | discover | news | googleNews |
37
- * image | video). Omitted on entries written before per-type partitioning
38
- * landed — treat as `web` (see {@link inferSearchType}). Compaction merges
39
- * only entries with the same searchType.
40
- */
41
- searchType?: SearchType;
42
- }
43
- interface ListLiveFilter {
44
- userId: string;
45
- siteId?: string;
46
- table?: TableName;
47
- partitions?: string[];
48
- /**
49
- * Narrow to a single compaction tier. Tier-aware compaction stages set this
50
- * so the store doesn't have to return (and the caller doesn't have to scan)
51
- * the entire manifest just to compact the raw cohort. Legacy entries without
52
- * an explicit `tier` field match on {@link inferLegacyTier}.
53
- */
54
- tier?: CompactionTier;
55
- }
56
- interface DataSource {
57
- read: (key: string, range?: {
58
- offset: number;
59
- length: number;
60
- }, signal?: AbortSignal) => Promise<Uint8Array>;
61
- write: (key: string, bytes: Uint8Array) => Promise<void>;
62
- delete: (keys: string[]) => Promise<void>;
63
- /**
64
- * One-shot listing under a prefix. Implementations may cap the number of
65
- * returned keys (typically 10k) — callers iterating full tenant space
66
- * should prefer `streamList` when available or narrow the prefix.
67
- */
68
- list: (prefix: string) => Promise<string[]>;
69
- /**
70
- * Per-key URI probe. Returns a URI string DuckDB's `httpfs` (or an
71
- * equivalent engine that fetches its own I/O) can read directly, or
72
- * `undefined` if the key isn't URI-resolvable on this backend and the
73
- * caller must fall back to `read(key)` for the bytes.
74
- *
75
- * Contracts:
76
- * - When defined, the returned URI MUST yield byte-identical content to
77
- * `read(key)`. Callers rely on this for correctness.
78
- * - Backends with a native URI for every key (filesystem: absolute path,
79
- * R2 via `httpfs`: signed URL) may always return a string.
80
- * - Backends without a native URI shape (in-memory) omit the method or
81
- * return `undefined` per call.
82
- * - Mixed-per-query is allowed: some keys in one query may return a URI,
83
- * others may not; the executor branches per key.
84
- */
85
- uri?: (key: string) => string | undefined;
86
- /**
87
- * Optional — probe the byte size of a key without reading it. Used by
88
- * the engine to fill in `WriteResult.bytes` when a codec reports 0 or
89
- * unknown but the file is non-trivial.
90
- */
91
- head?: (key: string) => Promise<{
92
- bytes: number;
93
- } | undefined>;
94
- /**
95
- * Optional streaming variant of `list`. Implementations that page
96
- * backing-store results (R2, S3) should implement this and yield keys
97
- * lazily. `list` may return up to an adapter-defined cap (typically
98
- * 10k keys); callers iterating full tenant space must prefer
99
- * `streamList` when available, or chunk by narrower prefixes.
100
- */
101
- streamList?: (prefix: string) => AsyncIterable<string>;
102
- }
103
- interface WatermarkScope {
104
- userId: string;
105
- siteId?: string;
106
- table: TableName;
107
- }
108
- interface Watermark extends WatermarkScope {
109
- newestDateSynced: string;
110
- oldestDateSynced: string;
111
- lastSyncAt: number;
112
- }
113
- interface WatermarkFilter {
114
- userId: string;
115
- siteId?: string;
116
- table?: TableName;
117
- }
118
- type SyncStateKind = 'pending' | 'inflight' | 'done' | 'failed';
119
- interface SyncStateScope {
120
- userId: string;
121
- siteId?: string;
122
- table: TableName;
123
- date: string;
124
- /**
125
- * GSC search-type this sync state covers. Omitted = `web` (the legacy
126
- * default; matches pre-#5 sync states stored before per-type sync landed).
127
- * Lookups must compare via {@link inferSearchType} so a missing field
128
- * matches an explicit `'web'` and vice versa.
129
- */
130
- searchType?: SearchType;
131
- }
132
- interface SyncState extends SyncStateScope {
133
- state: SyncStateKind;
134
- updatedAt: number;
135
- attempts: number;
136
- error?: string;
137
- }
138
- interface SyncStateFilter {
139
- userId: string;
140
- siteId?: string;
141
- table?: TableName;
142
- state?: SyncStateKind;
143
- searchType?: SearchType;
144
- }
145
- interface SyncStateDetail {
146
- at?: number;
147
- error?: string;
148
- }
149
- interface LockScope {
150
- userId: string;
151
- siteId?: string;
152
- table: TableName;
153
- partition: string;
154
- }
155
- interface PurgeFilter {
156
- userId: string;
157
- siteId?: string;
158
- }
159
- interface ManifestPurgeResult {
160
- entriesRemoved: number;
161
- watermarksRemoved: number;
162
- syncStatesRemoved: number;
163
- }
164
- interface ManifestStore {
165
- listLive: (filter: ListLiveFilter) => Promise<ManifestEntry[]>;
166
- listAll: (filter: ListLiveFilter) => Promise<ManifestEntry[]>;
167
- registerVersion: (entry: ManifestEntry, superseding?: ManifestEntry[]) => Promise<void>;
168
- registerVersions: (entries: ManifestEntry[], superseding?: ManifestEntry[]) => Promise<void>;
169
- listRetired: (olderThan: number) => Promise<ManifestEntry[]>;
170
- delete: (entries: ManifestEntry[]) => Promise<void>;
171
- getWatermarks: (filter: WatermarkFilter) => Promise<Watermark[]>;
172
- bumpWatermark: (scope: WatermarkScope, date: string, at?: number) => Promise<void>;
173
- getSyncStates: (filter: SyncStateFilter) => Promise<SyncState[]>;
174
- setSyncState: (scope: SyncStateScope, state: SyncStateKind, detail?: SyncStateDetail) => Promise<void>;
175
- /**
176
- * Serialize concurrent writers against the same scope. Held across the
177
- * write+register window so GC (orphan sweep) won't delete bytes that are
178
- * midway between `dataSource.write` and `manifestStore.registerVersion`.
179
- * Scope = tenant × table × partition.
180
- */
181
- withLock: <T>(scope: LockScope, fn: () => Promise<T>) => Promise<T>;
182
- /**
183
- * GDPR-grade tenant purge. Removes every manifest entry, watermark, and
184
- * sync-state record matching the filter. Does NOT touch the underlying
185
- * data-source bytes; callers (typically {@link StorageEngine.purgeTenant})
186
- * must sweep the tenant prefix separately before invoking this so that
187
- * mid-flight failures can't leave orphan parquet with no manifest record.
188
- *
189
- * On stores with CAS-backed sharding (R2 manifest) this may issue one
190
- * mutation per shard. On read-only stores (HTTP) this throws.
191
- */
192
- purgeTenant: (filter: PurgeFilter) => Promise<ManifestPurgeResult>;
193
- }
1
+ import { a as DataSource, m as ManifestStore } from "../_chunks/storage.mjs";
194
2
  interface FilesystemDataSourceOptions {
195
3
  rootDir: string;
196
4
  }
@@ -1,16 +1,9 @@
1
+ import { i as inferSearchType, r as inferLegacyTier } from "../_chunks/storage.mjs";
2
+ import { dirname, join, resolve } from "node:path";
1
3
  import { Buffer } from "node:buffer";
2
4
  import { randomBytes } from "node:crypto";
3
5
  import { mkdir, readFile, readdir, rename, rm, stat, unlink, writeFile } from "node:fs/promises";
4
- import { dirname, join, resolve } from "node:path";
5
6
  import { lock } from "proper-lockfile";
6
- function inferSearchType(entry) {
7
- return entry.searchType ?? "web";
8
- }
9
- function inferLegacyTier(entry) {
10
- if (entry.tier !== void 0) return entry.tier;
11
- if (entry.partition.startsWith("daily/")) return "raw";
12
- if (entry.partition.startsWith("monthly/")) return "d30";
13
- }
14
7
  function createFilesystemDataSource(opts) {
15
8
  const root = resolve(opts.rootDir);
16
9
  function pathFor(key) {
@@ -1,196 +1,4 @@
1
- import { TableName } from "gscdump/contracts";
2
- import { SearchType } from "gscdump/query";
3
- /**
4
- * Compaction tier of a manifest entry. Determines which compactor stage may
5
- * pick it up as input:
6
- * - `raw`: per-day file produced by `writeDay`. Eligible for raw→d7 merge at 7d.
7
- * - `d7`: weekly compaction output. Eligible for d7→d30 merge at 30d.
8
- * - `d30`: monthly compaction output (matches the legacy `monthly/` partition
9
- * shape — pre-tier entries are read as `d30`). Eligible for d30→d90 at 90d.
10
- * - `d90`: quarterly cold-tier output. Terminal; never recompacted.
11
- *
12
- * Without an explicit tier, entries written before this field landed default
13
- * to `raw` for `daily/` partitions and `d30` for `monthly/` partitions, so
14
- * the tiered compactor picks the right inputs without a backfill rewrite.
15
- */
16
- type CompactionTier = 'raw' | 'd7' | 'd30' | 'd90';
17
- interface ManifestEntry {
18
- userId: string;
19
- siteId?: string;
20
- table: TableName;
21
- partition: string;
22
- objectKey: string;
23
- rowCount: number;
24
- bytes: number;
25
- createdAt: number;
26
- retiredAt?: number;
27
- /** Table schema version at write time. Omitted on pre-#27 entries — treat as 1. */
28
- schemaVersion?: number;
29
- /**
30
- * Compaction tier. Omitted on entries written before tiered compaction —
31
- * treat as `raw` for `daily/` partitions and `d30` for `monthly/` partitions
32
- * (see {@link inferLegacyTier}).
33
- */
34
- tier?: CompactionTier;
35
- /**
36
- * GSC search-type this entry covers (web | discover | news | googleNews |
37
- * image | video). Omitted on entries written before per-type partitioning
38
- * landed — treat as `web` (see {@link inferSearchType}). Compaction merges
39
- * only entries with the same searchType.
40
- */
41
- searchType?: SearchType;
42
- }
43
- interface ListLiveFilter {
44
- userId: string;
45
- siteId?: string;
46
- table?: TableName;
47
- partitions?: string[];
48
- /**
49
- * Narrow to a single compaction tier. Tier-aware compaction stages set this
50
- * so the store doesn't have to return (and the caller doesn't have to scan)
51
- * the entire manifest just to compact the raw cohort. Legacy entries without
52
- * an explicit `tier` field match on {@link inferLegacyTier}.
53
- */
54
- tier?: CompactionTier;
55
- }
56
- interface DataSource {
57
- read: (key: string, range?: {
58
- offset: number;
59
- length: number;
60
- }, signal?: AbortSignal) => Promise<Uint8Array>;
61
- write: (key: string, bytes: Uint8Array) => Promise<void>;
62
- delete: (keys: string[]) => Promise<void>;
63
- /**
64
- * One-shot listing under a prefix. Implementations may cap the number of
65
- * returned keys (typically 10k) — callers iterating full tenant space
66
- * should prefer `streamList` when available or narrow the prefix.
67
- */
68
- list: (prefix: string) => Promise<string[]>;
69
- /**
70
- * Per-key URI probe. Returns a URI string DuckDB's `httpfs` (or an
71
- * equivalent engine that fetches its own I/O) can read directly, or
72
- * `undefined` if the key isn't URI-resolvable on this backend and the
73
- * caller must fall back to `read(key)` for the bytes.
74
- *
75
- * Contracts:
76
- * - When defined, the returned URI MUST yield byte-identical content to
77
- * `read(key)`. Callers rely on this for correctness.
78
- * - Backends with a native URI for every key (filesystem: absolute path,
79
- * R2 via `httpfs`: signed URL) may always return a string.
80
- * - Backends without a native URI shape (in-memory) omit the method or
81
- * return `undefined` per call.
82
- * - Mixed-per-query is allowed: some keys in one query may return a URI,
83
- * others may not; the executor branches per key.
84
- */
85
- uri?: (key: string) => string | undefined;
86
- /**
87
- * Optional — probe the byte size of a key without reading it. Used by
88
- * the engine to fill in `WriteResult.bytes` when a codec reports 0 or
89
- * unknown but the file is non-trivial.
90
- */
91
- head?: (key: string) => Promise<{
92
- bytes: number;
93
- } | undefined>;
94
- /**
95
- * Optional streaming variant of `list`. Implementations that page
96
- * backing-store results (R2, S3) should implement this and yield keys
97
- * lazily. `list` may return up to an adapter-defined cap (typically
98
- * 10k keys); callers iterating full tenant space must prefer
99
- * `streamList` when available, or chunk by narrower prefixes.
100
- */
101
- streamList?: (prefix: string) => AsyncIterable<string>;
102
- }
103
- interface WatermarkScope {
104
- userId: string;
105
- siteId?: string;
106
- table: TableName;
107
- }
108
- interface Watermark extends WatermarkScope {
109
- newestDateSynced: string;
110
- oldestDateSynced: string;
111
- lastSyncAt: number;
112
- }
113
- interface WatermarkFilter {
114
- userId: string;
115
- siteId?: string;
116
- table?: TableName;
117
- }
118
- type SyncStateKind = 'pending' | 'inflight' | 'done' | 'failed';
119
- interface SyncStateScope {
120
- userId: string;
121
- siteId?: string;
122
- table: TableName;
123
- date: string;
124
- /**
125
- * GSC search-type this sync state covers. Omitted = `web` (the legacy
126
- * default; matches pre-#5 sync states stored before per-type sync landed).
127
- * Lookups must compare via {@link inferSearchType} so a missing field
128
- * matches an explicit `'web'` and vice versa.
129
- */
130
- searchType?: SearchType;
131
- }
132
- interface SyncState extends SyncStateScope {
133
- state: SyncStateKind;
134
- updatedAt: number;
135
- attempts: number;
136
- error?: string;
137
- }
138
- interface SyncStateFilter {
139
- userId: string;
140
- siteId?: string;
141
- table?: TableName;
142
- state?: SyncStateKind;
143
- searchType?: SearchType;
144
- }
145
- interface SyncStateDetail {
146
- at?: number;
147
- error?: string;
148
- }
149
- interface LockScope {
150
- userId: string;
151
- siteId?: string;
152
- table: TableName;
153
- partition: string;
154
- }
155
- interface PurgeFilter {
156
- userId: string;
157
- siteId?: string;
158
- }
159
- interface ManifestPurgeResult {
160
- entriesRemoved: number;
161
- watermarksRemoved: number;
162
- syncStatesRemoved: number;
163
- }
164
- interface ManifestStore {
165
- listLive: (filter: ListLiveFilter) => Promise<ManifestEntry[]>;
166
- listAll: (filter: ListLiveFilter) => Promise<ManifestEntry[]>;
167
- registerVersion: (entry: ManifestEntry, superseding?: ManifestEntry[]) => Promise<void>;
168
- registerVersions: (entries: ManifestEntry[], superseding?: ManifestEntry[]) => Promise<void>;
169
- listRetired: (olderThan: number) => Promise<ManifestEntry[]>;
170
- delete: (entries: ManifestEntry[]) => Promise<void>;
171
- getWatermarks: (filter: WatermarkFilter) => Promise<Watermark[]>;
172
- bumpWatermark: (scope: WatermarkScope, date: string, at?: number) => Promise<void>;
173
- getSyncStates: (filter: SyncStateFilter) => Promise<SyncState[]>;
174
- setSyncState: (scope: SyncStateScope, state: SyncStateKind, detail?: SyncStateDetail) => Promise<void>;
175
- /**
176
- * Serialize concurrent writers against the same scope. Held across the
177
- * write+register window so GC (orphan sweep) won't delete bytes that are
178
- * midway between `dataSource.write` and `manifestStore.registerVersion`.
179
- * Scope = tenant × table × partition.
180
- */
181
- withLock: <T>(scope: LockScope, fn: () => Promise<T>) => Promise<T>;
182
- /**
183
- * GDPR-grade tenant purge. Removes every manifest entry, watermark, and
184
- * sync-state record matching the filter. Does NOT touch the underlying
185
- * data-source bytes; callers (typically {@link StorageEngine.purgeTenant})
186
- * must sweep the tenant prefix separately before invoking this so that
187
- * mid-flight failures can't leave orphan parquet with no manifest record.
188
- *
189
- * On stores with CAS-backed sharding (R2 manifest) this may issue one
190
- * mutation per shard. On read-only stores (HTTP) this throws.
191
- */
192
- purgeTenant: (filter: PurgeFilter) => Promise<ManifestPurgeResult>;
193
- }
1
+ import { a as DataSource, m as ManifestStore } from "../_chunks/storage.mjs";
194
2
  interface HttpDataSourceOptions {
195
3
  /**
196
4
  * Base URL to prefix each object key with. MUST NOT have a trailing slash.
@@ -1,8 +1,4 @@
1
- function inferLegacyTier(entry) {
2
- if (entry.tier !== void 0) return entry.tier;
3
- if (entry.partition.startsWith("daily/")) return "raw";
4
- if (entry.partition.startsWith("monthly/")) return "d30";
5
- }
1
+ import { r as inferLegacyTier } from "../_chunks/storage.mjs";
6
2
  function readOnly(name) {
7
3
  throw new Error(`http adapter is read-only: ${name} is not supported`);
8
4
  }
@@ -1,83 +1,6 @@
1
- import { ColumnDef, Row, Row as Row$1, TableName, TableName as TableName$1 } from "gscdump/contracts";
2
- interface DataSource {
3
- read: (key: string, range?: {
4
- offset: number;
5
- length: number;
6
- }, signal?: AbortSignal) => Promise<Uint8Array>;
7
- write: (key: string, bytes: Uint8Array) => Promise<void>;
8
- delete: (keys: string[]) => Promise<void>;
9
- /**
10
- * One-shot listing under a prefix. Implementations may cap the number of
11
- * returned keys (typically 10k) — callers iterating full tenant space
12
- * should prefer `streamList` when available or narrow the prefix.
13
- */
14
- list: (prefix: string) => Promise<string[]>;
15
- /**
16
- * Per-key URI probe. Returns a URI string DuckDB's `httpfs` (or an
17
- * equivalent engine that fetches its own I/O) can read directly, or
18
- * `undefined` if the key isn't URI-resolvable on this backend and the
19
- * caller must fall back to `read(key)` for the bytes.
20
- *
21
- * Contracts:
22
- * - When defined, the returned URI MUST yield byte-identical content to
23
- * `read(key)`. Callers rely on this for correctness.
24
- * - Backends with a native URI for every key (filesystem: absolute path,
25
- * R2 via `httpfs`: signed URL) may always return a string.
26
- * - Backends without a native URI shape (in-memory) omit the method or
27
- * return `undefined` per call.
28
- * - Mixed-per-query is allowed: some keys in one query may return a URI,
29
- * others may not; the executor branches per key.
30
- */
31
- uri?: (key: string) => string | undefined;
32
- /**
33
- * Optional — probe the byte size of a key without reading it. Used by
34
- * the engine to fill in `WriteResult.bytes` when a codec reports 0 or
35
- * unknown but the file is non-trivial.
36
- */
37
- head?: (key: string) => Promise<{
38
- bytes: number;
39
- } | undefined>;
40
- /**
41
- * Optional streaming variant of `list`. Implementations that page
42
- * backing-store results (R2, S3) should implement this and yield keys
43
- * lazily. `list` may return up to an adapter-defined cap (typically
44
- * 10k keys); callers iterating full tenant space must prefer
45
- * `streamList` when available, or chunk by narrower prefixes.
46
- */
47
- streamList?: (prefix: string) => AsyncIterable<string>;
48
- }
49
- interface WriteResult {
50
- bytes: number;
51
- rowCount: number;
52
- }
53
- interface CodecCtx {
54
- table: TableName;
55
- }
56
- /**
57
- * Key-oriented codec. Each method owns its I/O through `dataSource`:
58
- * - Node / browser codecs read/write bytes via `dataSource.read` / `.write`.
59
- * - Workers codecs let DuckDB's httpfs read/write remote URIs directly (via
60
- * `dataSource.uri`) and never materialise bytes in JS.
61
- *
62
- * The engine never touches bytes; it just hands rows + keys to the codec.
63
- *
64
- * Invariants every implementation MUST uphold:
65
- * - `writeRows` with an empty `rows` array MUST still write a file
66
- * carrying the canonical column set for `ctx.table` — a schema-correct
67
- * empty file. No placeholder-column shortcuts; readers depend on the
68
- * schema being present for `union_by_name` merges.
69
- * - `WriteResult.bytes` MUST be the real byte size written to the
70
- * data source (not 0, not an estimate) so the engine can enforce the
71
- * payload ceiling without a second `head` round-trip.
72
- * - `WriteResult.rowCount` MUST equal `rows.length` (or, for
73
- * `compactRows`, the sum of input row counts).
74
- */
75
- interface ParquetCodec {
76
- writeRows: (ctx: CodecCtx, rows: Row[], key: string, dataSource: DataSource) => Promise<WriteResult>;
77
- readRows: (ctx: CodecCtx, key: string, dataSource: DataSource) => Promise<Row[]>;
78
- compactRows: (ctx: CodecCtx, inputKeys: string[], outputKey: string, dataSource: DataSource) => Promise<WriteResult>;
79
- }
80
- declare function encodeRowsToParquet(table: TableName$1, rows: readonly Row$1[]): Uint8Array;
1
+ import { N as TableName, a as DataSource, h as ParquetCodec, t as CodecCtx, w as Row } from "../_chunks/storage.mjs";
2
+ import { t as ColumnDef } from "../_chunks/schema.mjs";
3
+ declare function encodeRowsToParquet(table: TableName, rows: readonly Row[]): Uint8Array;
81
4
  interface EncodeFlexOptions {
82
5
  /** Columns defining the output schema + order. */
83
6
  columns: readonly ColumnDef[];
@@ -93,15 +16,15 @@ interface EncodeFlexOptions {
93
16
  * the canonical encoder so DuckDB `read_parquet(union_by_name = true)`
94
17
  * merges cleanly with fact-table reads.
95
18
  */
96
- declare function encodeRowsToParquetFlex(rows: readonly Row$1[], opts: EncodeFlexOptions): Uint8Array;
97
- declare function decodeParquetToRows(bytes: Uint8Array): Promise<Row$1[]>;
19
+ declare function encodeRowsToParquetFlex(rows: readonly Row[], opts: EncodeFlexOptions): Uint8Array;
20
+ declare function decodeParquetToRows(bytes: Uint8Array): Promise<Row[]>;
98
21
  interface HyparquetCodecOptions {
99
22
  /**
100
23
  * Override `readRows`. Useful when reads should be delegated to a faster
101
24
  * engine (e.g. DuckDB-WASM via httpfs) while writes + compaction stay on
102
25
  * hyparquet to avoid WASM linear-memory growth. Defaults to hyparquet.
103
26
  */
104
- readRows?: (ctx: CodecCtx, key: string, dataSource: DataSource) => Promise<Row$1[]>;
27
+ readRows?: (ctx: CodecCtx, key: string, dataSource: DataSource) => Promise<Row[]>;
105
28
  }
106
29
  declare function createHyparquetCodec(options?: HyparquetCodecOptions): ParquetCodec;
107
30
  export { EncodeFlexOptions, HyparquetCodecOptions, createHyparquetCodec, decodeParquetToRows, encodeRowsToParquet, encodeRowsToParquetFlex };