@gscdump/engine 0.6.1 → 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/dist/_chunks/compiler.mjs +288 -0
  2. package/dist/_chunks/duckdb.d.mts +26 -0
  3. package/dist/_chunks/engine.mjs +578 -0
  4. package/dist/_chunks/pg-adapter.mjs +676 -0
  5. package/dist/_chunks/planner.d.mts +15 -0
  6. package/dist/_chunks/schema.d.mts +1258 -0
  7. package/dist/_chunks/schema.mjs +139 -0
  8. package/dist/_chunks/storage.d.mts +476 -0
  9. package/dist/_chunks/storage.mjs +39 -0
  10. package/dist/_chunks/types.d.mts +53 -0
  11. package/dist/adapters/duckdb-node.d.mts +1 -13
  12. package/dist/adapters/duckdb-node.mjs +1 -7
  13. package/dist/adapters/filesystem.d.mts +1 -193
  14. package/dist/adapters/filesystem.mjs +2 -9
  15. package/dist/adapters/http.d.mts +1 -193
  16. package/dist/adapters/http.mjs +1 -5
  17. package/dist/adapters/hyparquet.d.mts +6 -83
  18. package/dist/adapters/hyparquet.mjs +1 -105
  19. package/dist/adapters/inspection-sqlite-browser.d.mts +1 -7
  20. package/dist/adapters/inspection-sqlite-node.d.mts +1 -7
  21. package/dist/adapters/inspection-sqlite-node.mjs +1 -1
  22. package/dist/adapters/node-harness.d.mts +3 -306
  23. package/dist/adapters/node-harness.mjs +4 -1866
  24. package/dist/adapters/r2-manifest.d.mts +4 -149
  25. package/dist/adapters/r2-manifest.mjs +1 -8
  26. package/dist/adapters/r2.d.mts +1 -47
  27. package/dist/contracts.d.mts +1 -435
  28. package/dist/entities.d.mts +1 -47
  29. package/dist/index.d.mts +8 -1844
  30. package/dist/index.mjs +8 -1962
  31. package/dist/ingest.d.mts +1 -1
  32. package/dist/planner.d.mts +3 -16
  33. package/dist/planner.mjs +1 -320
  34. package/dist/resolver/index.d.mts +3 -51
  35. package/dist/resolver/index.mjs +2 -780
  36. package/dist/rollups.d.mts +6 -51
  37. package/dist/rollups.mjs +2 -209
  38. package/dist/schema.d.mts +2 -1258
  39. package/dist/schema.mjs +1 -138
  40. package/package.json +2 -2
@@ -0,0 +1,139 @@
1
+ import { date, doublePrecision, getTableConfig, integer, pgTable, varchar } from "drizzle-orm/pg-core";
2
/**
 * Builds a fresh set of the metric column builders shared by every table
 * defined in this module.
 *
 * A new object is returned on each call so that each `pgTable` definition
 * receives its own column builder instances.
 *
 * @returns {{clicks: object, impressions: object, sum_position: object}}
 *   Non-null `clicks` / `impressions` integer columns and a non-null
 *   `sum_position` double-precision column.
 */
function metricCols() {
  const clicks = integer("clicks").notNull();
  const impressions = integer("impressions").notNull();
  const sum_position = doublePrecision("sum_position").notNull();
  return { clicks, impressions, sum_position };
}
9
/**
 * Creates a new non-null `date` column builder named `date`.
 *
 * @returns {object} A drizzle column builder for the shared `date` column.
 */
const dateCol = () => {
  const column = date("date");
  return column.notNull();
};
10
/**
 * Drizzle table definition for `pages`: a non-null `url`, the shared `date`
 * column, and the shared metric columns.
 */
const pages = pgTable("pages", {
  url: varchar("url").notNull(),
  date: dateCol(),
  ...metricCols(),
});
15
/**
 * Drizzle table definition for `keywords`: a non-null `query`, a nullable
 * `query_canonical`, the shared `date` column, and the shared metric columns.
 */
const keywords = pgTable("keywords", {
  query: varchar("query").notNull(),
  query_canonical: varchar("query_canonical"),
  date: dateCol(),
  ...metricCols(),
});
21
/**
 * Drizzle table definition for `countries`: a non-null `country`, the shared
 * `date` column, and the shared metric columns.
 */
const countries = pgTable("countries", {
  country: varchar("country").notNull(),
  date: dateCol(),
  ...metricCols(),
});
26
/**
 * Drizzle table definition for `devices`: a non-null `device`, the shared
 * `date` column, and the shared metric columns.
 */
const devices = pgTable("devices", {
  device: varchar("device").notNull(),
  date: dateCol(),
  ...metricCols(),
});
31
/**
 * Drizzle table definition for `page_keywords`: non-null `url` and `query`,
 * a nullable `query_canonical`, the shared `date` column, and the shared
 * metric columns.
 */
const page_keywords = pgTable("page_keywords", {
  url: varchar("url").notNull(),
  query: varchar("query").notNull(),
  query_canonical: varchar("query_canonical"),
  date: dateCol(),
  ...metricCols(),
});
38
/**
 * Drizzle table definition for `search_appearance`: a non-null
 * `searchAppearance` column (camelCase column name, unlike the other tables),
 * the shared `date` column, and the shared metric columns.
 */
const search_appearance = pgTable("search_appearance", {
  searchAppearance: varchar("searchAppearance").notNull(),
  date: dateCol(),
  ...metricCols(),
});
43
/**
 * All drizzle table definitions of this module keyed by table name.
 * `tableSchemaFrom` looks tables up here by name.
 */
const drizzleSchema = {
  pages,
  keywords,
  countries,
  devices,
  page_keywords,
  search_appearance,
};
51
/**
 * Per-table metadata that is not expressed in the drizzle definitions:
 * the ordered sort-key columns and the schema version of each table.
 */
const TABLE_METADATA = {
  pages: { sortKey: ["date", "url"], version: 1 },
  keywords: { sortKey: ["date", "query"], version: 2 },
  countries: { sortKey: ["date", "country"], version: 1 },
  devices: { sortKey: ["date", "device"], version: 1 },
  page_keywords: { sortKey: ["date", "url", "query"], version: 2 },
  search_appearance: { sortKey: ["date", "searchAppearance"], version: 1 },
};
81
/**
 * Maps a Postgres SQL type string to the engine's column type name.
 *
 * Matching is case-insensitive. Some types match exactly (e.g. `text`,
 * `int4`); others match by prefix so that parameterised forms such as
 * `varchar(255)` or `timestamp with time zone` are recognised.
 *
 * @param {string} sqlType - Postgres type as reported by the column.
 * @returns {"VARCHAR"|"DATE"|"DOUBLE"|"BIGINT"|"INTEGER"} Engine column type.
 * @throws {Error} If the type has no mapping.
 */
function pgSqlTypeToColumnType(sqlType) {
  const normalized = sqlType.toLowerCase();
  // Rules are checked in order; exact names and prefixes never overlap across rules.
  const rules = [
    { column: "VARCHAR", exact: ["text"], prefixes: ["varchar", "char"] },
    { column: "DATE", exact: ["date"], prefixes: ["timestamp"] },
    { column: "DOUBLE", exact: ["real"], prefixes: ["double", "numeric", "decimal"] },
    { column: "BIGINT", exact: ["bigint", "int8"], prefixes: [] },
    { column: "INTEGER", exact: ["integer", "int", "int4", "smallint", "int2"], prefixes: [] },
  ];
  for (const { column, exact, prefixes } of rules) {
    const matchesExact = exact.includes(normalized);
    if (matchesExact || prefixes.some((prefix) => normalized.startsWith(prefix))) {
      return column;
    }
  }
  throw new Error(`unmapped pg type '${sqlType}' — extend pgSqlTypeToColumnType in @gscdump/engine/schema`);
}
90
/**
 * Derives the engine schema descriptor for one table from its drizzle
 * definition plus its `TABLE_METADATA` entry.
 *
 * @param {string} tableName - Key into `drizzleSchema` and `TABLE_METADATA`.
 * @returns {{name: string, columns: Array<{name: string, type: string, nullable: boolean}>, sortKey: string[], version: number}}
 *   The table name, its columns (name, mapped type, and `nullable` as the
 *   negation of the column's `notNull` flag), its sort key, and its version.
 * @throws {Error} If a column's SQL type has no mapping in `pgSqlTypeToColumnType`.
 */
function tableSchemaFrom(tableName) {
  const tableConfig = getTableConfig(drizzleSchema[tableName]);
  const columns = tableConfig.columns.map((col) => {
    const name = col.name;
    const type = pgSqlTypeToColumnType(col.getSQLType());
    const nullable = !col.notNull;
    return { name, type, nullable };
  });
  const { sortKey, version } = TABLE_METADATA[tableName];
  return { name: tableName, columns, sortKey, version };
}
104
/** Names of every metric table, in the order used to build `SCHEMAS`. */
const METRIC_TABLES = ["pages", "keywords", "countries", "devices", "page_keywords", "search_appearance"];
112
/**
 * Table name → schema descriptor, computed once when the module is evaluated
 * by running `tableSchemaFrom` over every entry of `METRIC_TABLES`.
 */
const SCHEMAS = Object.fromEntries(
  METRIC_TABLES.map((tableName) => {
    const schema = tableSchemaFrom(tableName);
    return [tableName, schema];
  }),
);
113
/**
 * Returns the schema version recorded for a table in `SCHEMAS`.
 *
 * @param {string} table - Table name; must be a key of `SCHEMAS`.
 * @returns {number} The table's schema version.
 * @throws {TypeError} If `table` is not a key of `SCHEMAS`.
 */
function currentSchemaVersion(table) {
  const { version } = SCHEMAS[table];
  return version;
}
116
/**
 * Looks up the schema descriptor for a table.
 *
 * @param {string} table - Table name.
 * @returns {object|undefined} The entry from `SCHEMAS`, or `undefined` when
 *   the name is not a known table.
 */
function schemaFor(table) {
  const schema = SCHEMAS[table];
  return schema;
}
119
/**
 * Lists every metric table name.
 *
 * @returns {string[]} The module's `METRIC_TABLES` array itself (not a copy),
 *   so callers should treat it as read-only.
 */
function allTables() {
  const tables = METRIC_TABLES;
  return tables;
}
122
/**
 * Picks the table that serves a query with the given dimensions.
 *
 * Precedence: `page` + `query` → `page_keywords`; `query` → `keywords`;
 * `page` → `pages`; then `country`, `device`, `searchAppearance` in that
 * order. With none of those present the result is `keywords`.
 *
 * @param {Iterable<string>} [dimensions] - Requested dimension names.
 * @returns {string} The table name to query.
 */
function inferTable(dimensions) {
  const requested = new Set(dimensions);
  const wantsPage = requested.has("page");
  const wantsQuery = requested.has("query");
  if (wantsQuery) {
    return wantsPage ? "page_keywords" : "keywords";
  }
  if (wantsPage) {
    return "pages";
  }
  // Remaining single-dimension tables, checked in priority order.
  const singleDimensionTables = [
    ["country", "countries"],
    ["device", "devices"],
    ["searchAppearance", "search_appearance"],
  ];
  for (const [dimension, table] of singleDimensionTables) {
    if (requested.has(dimension)) {
      return table;
    }
  }
  return "keywords";
}
134
/**
 * Translates a query dimension name to its storage column name.
 *
 * Only `page` (→ `url`) and `queryCanonical` (→ `query_canonical`) are
 * renamed; every other dimension is returned unchanged.
 *
 * @param {string} dim - Dimension name.
 * @param {string} [_table] - Accepted for signature compatibility; unused.
 * @returns {string} Column name.
 */
function dimensionToColumn(dim, _table) {
  switch (dim) {
    case "page":
      return "url";
    case "queryCanonical":
      return "query_canonical";
    default:
      return dim;
  }
}
139
+ export { inferTable as a, countries as c, keywords as d, page_keywords as f, dimensionToColumn as i, devices as l, search_appearance as m, allTables as n, schemaFor as o, pages as p, currentSchemaVersion as r, TABLE_METADATA as s, SCHEMAS as t, drizzleSchema as u };
@@ -0,0 +1,476 @@
1
+ import { t as ComparisonFilter } from "./types.mjs";
2
+ import { Row, Row as Row$1, TableName, TableName as TableName$1, TenantCtx, TenantCtx as TenantCtx$1 } from "gscdump/contracts";
3
+ import { BuilderState, SearchType, SearchType as SearchType$1 } from "gscdump/query";
4
+ /**
5
+ * Per-tier age threshold in days. Default ladder collapses on these gates:
6
+ * - raw → d7 once a daily file is older than `raw` days (default 7).
7
+ * - d7 → d30 once the entire weekly bucket sits behind `d7` days (default 30).
8
+ * - d30 → d90 once the entire monthly bucket sits behind `d30` days (default 90).
9
+ */
10
+ interface CompactionThresholds {
11
+ raw?: number;
12
+ d7?: number;
13
+ d30?: number;
14
+ }
15
/**
 * NOTE(review): the implementation is not part of this chunk. From the
 * signature this presumably returns the partition names covering the range
 * `startDate`..`endDate` — confirm the date format, whether both ends are
 * inclusive, and which partition shapes (daily/weekly/monthly/quarterly)
 * are produced.
 */
+ declare function enumeratePartitions(startDate: string, endDate: string): string[];
16
+ /**
17
+ * Default `searchType` for entries written before the field landed and for
18
+ * sync paths that don't request a specific type. GSC's own default; the
19
+ * vast majority of stored data is web-search.
20
+ */
21
+ declare const DEFAULT_SEARCH_TYPE: SearchType;
22
+ interface WriteCtx extends TenantCtx {
23
+ table: TableName;
24
+ date?: string;
25
+ now?: () => number;
26
+ /**
27
+ * GSC search-type partition this write belongs to. Defaults to `'web'`.
28
+ * Non-web values (`discover`, `news`, `googleNews`, `image`, `video`)
29
+ * cause the writer to insert the type into the object key path so files
30
+ * for different search types coexist without colliding.
31
+ */
32
+ searchType?: SearchType;
33
+ }
34
+ interface QueryCtx extends TenantCtx {
35
+ table?: TableName;
36
+ signal?: AbortSignal;
37
+ }
38
+ interface GcCtx {
39
+ now?: () => number;
40
+ userId?: string;
41
+ siteId?: string;
42
+ }
43
+ /**
44
+ * Compaction tier of a manifest entry. Determines which compactor stage may
45
+ * pick it up as input:
46
+ * - `raw`: per-day file produced by `writeDay`. Eligible for raw→d7 merge at 7d.
47
+ * - `d7`: weekly compaction output. Eligible for d7→d30 merge at 30d.
48
+ * - `d30`: monthly compaction output (matches the legacy `monthly/` partition
49
+ * shape — pre-tier entries are read as `d30`). Eligible for d30→d90 at 90d.
50
+ * - `d90`: quarterly cold-tier output. Terminal; never recompacted.
51
+ *
52
+ * Without an explicit tier, entries written before this field landed default
53
+ * to `raw` for `daily/` partitions and `d30` for `monthly/` partitions, so
54
+ * the tiered compactor picks the right inputs without a backfill rewrite.
55
+ */
56
+ type CompactionTier = 'raw' | 'd7' | 'd30' | 'd90';
57
+ interface ManifestEntry {
58
+ userId: string;
59
+ siteId?: string;
60
+ table: TableName;
61
+ partition: string;
62
+ objectKey: string;
63
+ rowCount: number;
64
+ bytes: number;
65
+ createdAt: number;
66
+ retiredAt?: number;
67
+ /** Table schema version at write time. Omitted on pre-#27 entries — treat as 1. */
68
+ schemaVersion?: number;
69
+ /**
70
+ * Compaction tier. Omitted on entries written before tiered compaction —
71
+ * treat as `raw` for `daily/` partitions and `d30` for `monthly/` partitions
72
+ * (see {@link inferLegacyTier}).
73
+ */
74
+ tier?: CompactionTier;
75
+ /**
76
+ * GSC search-type this entry covers (web | discover | news | googleNews |
77
+ * image | video). Omitted on entries written before per-type partitioning
78
+ * landed — treat as `web` (see {@link inferSearchType}). Compaction merges
79
+ * only entries with the same searchType.
80
+ */
81
+ searchType?: SearchType;
82
+ }
83
+ /**
84
+ * Resolve the search type for an entry, defaulting legacy entries to `web`.
85
+ * Use this anywhere code needs to bucket entries by searchType.
86
+ */
87
+ declare function inferSearchType(entry: Pick<ManifestEntry, 'searchType'>): SearchType;
88
+ /**
89
+ * Infer the tier for an entry that pre-dates the `tier` field. Daily files
90
+ * are `raw`; monthly files are `d30`. Anything else (already migrated, or
91
+ * a partition shape we haven't seen) returns undefined and the caller must
92
+ * decide how to handle it.
93
+ */
94
+ declare function inferLegacyTier(entry: Pick<ManifestEntry, 'partition' | 'tier'>): CompactionTier | undefined;
95
+ interface ListLiveFilter {
96
+ userId: string;
97
+ siteId?: string;
98
+ table?: TableName;
99
+ partitions?: string[];
100
+ /**
101
+ * Narrow to a single compaction tier. Tier-aware compaction stages set this
102
+ * so the store doesn't have to return (and the caller doesn't have to scan)
103
+ * the entire manifest just to compact the raw cohort. Legacy entries without
104
+ * an explicit `tier` field match on {@link inferLegacyTier}.
105
+ */
106
+ tier?: CompactionTier;
107
+ }
108
+ interface DataSource {
109
+ read: (key: string, range?: {
110
+ offset: number;
111
+ length: number;
112
+ }, signal?: AbortSignal) => Promise<Uint8Array>;
113
+ write: (key: string, bytes: Uint8Array) => Promise<void>;
114
+ delete: (keys: string[]) => Promise<void>;
115
+ /**
116
+ * One-shot listing under a prefix. Implementations may cap the number of
117
+ * returned keys (typically 10k) — callers iterating full tenant space
118
+ * should prefer `streamList` when available or narrow the prefix.
119
+ */
120
+ list: (prefix: string) => Promise<string[]>;
121
+ /**
122
+ * Per-key URI probe. Returns a URI string DuckDB's `httpfs` (or an
123
+ * equivalent engine that fetches its own I/O) can read directly, or
124
+ * `undefined` if the key isn't URI-resolvable on this backend and the
125
+ * caller must fall back to `read(key)` for the bytes.
126
+ *
127
+ * Contracts:
128
+ * - When defined, the returned URI MUST yield byte-identical content to
129
+ * `read(key)`. Callers rely on this for correctness.
130
+ * - Backends with a native URI for every key (filesystem: absolute path,
131
+ * R2 via `httpfs`: signed URL) may always return a string.
132
+ * - Backends without a native URI shape (in-memory) omit the method or
133
+ * return `undefined` per call.
134
+ * - Mixed-per-query is allowed: some keys in one query may return a URI,
135
+ * others may not; the executor branches per key.
136
+ */
137
+ uri?: (key: string) => string | undefined;
138
+ /**
139
+ * Optional — probe the byte size of a key without reading it. Used by
140
+ * the engine to fill in `WriteResult.bytes` when a codec reports 0 or
141
+ * unknown but the file is non-trivial.
142
+ */
143
+ head?: (key: string) => Promise<{
144
+ bytes: number;
145
+ } | undefined>;
146
+ /**
147
+ * Optional streaming variant of `list`. Implementations that page
148
+ * backing-store results (R2, S3) should implement this and yield keys
149
+ * lazily. `list` may return up to an adapter-defined cap (typically
150
+ * 10k keys); callers iterating full tenant space must prefer
151
+ * `streamList` when available, or chunk by narrower prefixes.
152
+ */
153
+ streamList?: (prefix: string) => AsyncIterable<string>;
154
+ }
155
+ interface WatermarkScope {
156
+ userId: string;
157
+ siteId?: string;
158
+ table: TableName;
159
+ }
160
+ interface Watermark extends WatermarkScope {
161
+ newestDateSynced: string;
162
+ oldestDateSynced: string;
163
+ lastSyncAt: number;
164
+ }
165
+ interface WatermarkFilter {
166
+ userId: string;
167
+ siteId?: string;
168
+ table?: TableName;
169
+ }
170
+ type SyncStateKind = 'pending' | 'inflight' | 'done' | 'failed';
171
+ interface SyncStateScope {
172
+ userId: string;
173
+ siteId?: string;
174
+ table: TableName;
175
+ date: string;
176
+ /**
177
+ * GSC search-type this sync state covers. Omitted = `web` (the legacy
178
+ * default; matches pre-#5 sync states stored before per-type sync landed).
179
+ * Lookups must compare via {@link inferSearchType} so a missing field
180
+ * matches an explicit `'web'` and vice versa.
181
+ */
182
+ searchType?: SearchType;
183
+ }
184
+ interface SyncState extends SyncStateScope {
185
+ state: SyncStateKind;
186
+ updatedAt: number;
187
+ attempts: number;
188
+ error?: string;
189
+ }
190
+ interface SyncStateFilter {
191
+ userId: string;
192
+ siteId?: string;
193
+ table?: TableName;
194
+ state?: SyncStateKind;
195
+ searchType?: SearchType;
196
+ }
197
+ interface SyncStateDetail {
198
+ at?: number;
199
+ error?: string;
200
+ }
201
+ interface LockScope {
202
+ userId: string;
203
+ siteId?: string;
204
+ table: TableName;
205
+ partition: string;
206
+ }
207
+ interface PurgeFilter {
208
+ userId: string;
209
+ siteId?: string;
210
+ }
211
+ interface ManifestPurgeResult {
212
+ entriesRemoved: number;
213
+ watermarksRemoved: number;
214
+ syncStatesRemoved: number;
215
+ }
216
+ interface PurgeResult {
217
+ userId: string;
218
+ siteId?: string;
219
+ prefix: string;
220
+ objectsDeleted: number;
221
+ entriesRemoved: number;
222
+ watermarksRemoved: number;
223
+ syncStatesRemoved: number;
224
+ at: number;
225
+ }
226
+ interface PurgeUrlsResult {
227
+ userId: string;
228
+ siteId?: string;
229
+ urlsRequested: number;
230
+ entriesRewritten: number;
231
+ rowsRemoved: number;
232
+ bytesAfter: number;
233
+ at: number;
234
+ }
235
+ interface ManifestStore {
236
+ listLive: (filter: ListLiveFilter) => Promise<ManifestEntry[]>;
237
+ listAll: (filter: ListLiveFilter) => Promise<ManifestEntry[]>;
238
+ registerVersion: (entry: ManifestEntry, superseding?: ManifestEntry[]) => Promise<void>;
239
+ registerVersions: (entries: ManifestEntry[], superseding?: ManifestEntry[]) => Promise<void>;
240
+ listRetired: (olderThan: number) => Promise<ManifestEntry[]>;
241
+ delete: (entries: ManifestEntry[]) => Promise<void>;
242
+ getWatermarks: (filter: WatermarkFilter) => Promise<Watermark[]>;
243
+ bumpWatermark: (scope: WatermarkScope, date: string, at?: number) => Promise<void>;
244
+ getSyncStates: (filter: SyncStateFilter) => Promise<SyncState[]>;
245
+ setSyncState: (scope: SyncStateScope, state: SyncStateKind, detail?: SyncStateDetail) => Promise<void>;
246
+ /**
247
+ * Serialize concurrent writers against the same scope. Held across the
248
+ * write+register window so GC (orphan sweep) won't delete bytes that are
249
+ * midway between `dataSource.write` and `manifestStore.registerVersion`.
250
+ * Scope = tenant × table × partition.
251
+ */
252
+ withLock: <T>(scope: LockScope, fn: () => Promise<T>) => Promise<T>;
253
+ /**
254
+ * GDPR-grade tenant purge. Removes every manifest entry, watermark, and
255
+ * sync-state record matching the filter. Does NOT touch the underlying
256
+ * data-source bytes; callers (typically {@link StorageEngine.purgeTenant})
257
+ * must sweep the tenant prefix separately before invoking this so that
258
+ * mid-flight failures can't leave orphan parquet with no manifest record.
259
+ *
260
+ * On stores with CAS-backed sharding (R2 manifest) this may issue one
261
+ * mutation per shard. On read-only stores (HTTP) this throws.
262
+ */
263
+ purgeTenant: (filter: PurgeFilter) => Promise<ManifestPurgeResult>;
264
+ }
265
+ interface WriteResult {
266
+ bytes: number;
267
+ rowCount: number;
268
+ }
269
+ interface CodecCtx {
270
+ table: TableName;
271
+ }
272
+ /**
273
+ * Key-oriented codec. Each method owns its I/O through `dataSource`:
274
+ * - Node / browser codecs read/write bytes via `dataSource.read` / `.write`.
275
+ * - Workers codecs let DuckDB's httpfs read/write remote URIs directly (via
276
+ * `dataSource.uri`) and never materialise bytes in JS.
277
+ *
278
+ * The engine never touches bytes; it just hands rows + keys to the codec.
279
+ *
280
+ * Invariants every implementation MUST uphold:
281
+ * - `writeRows` with an empty `rows` array MUST still write a file
282
+ * carrying the canonical column set for `ctx.table` — a schema-correct
283
+ * empty file. No placeholder-column shortcuts; readers depend on the
284
+ * schema being present for `union_by_name` merges.
285
+ * - `WriteResult.bytes` MUST be the real byte size written to the
286
+ * data source (not 0, not an estimate) so the engine can enforce the
287
+ * payload ceiling without a second `head` round-trip.
288
+ * - `WriteResult.rowCount` MUST equal `rows.length` (or, for
289
+ * `compactRows`, the sum of input row counts).
290
+ */
291
+ interface ParquetCodec {
292
+ writeRows: (ctx: CodecCtx, rows: Row[], key: string, dataSource: DataSource) => Promise<WriteResult>;
293
+ readRows: (ctx: CodecCtx, key: string, dataSource: DataSource) => Promise<Row[]>;
294
+ compactRows: (ctx: CodecCtx, inputKeys: string[], outputKey: string, dataSource: DataSource) => Promise<WriteResult>;
295
+ }
296
+ interface QueryResult {
297
+ rows: Row[];
298
+ sql: string;
299
+ objectKeys: string[];
300
+ }
301
+ interface ComparisonResult {
302
+ rows: Row[];
303
+ totalCount: number;
304
+ totals: Record<string, unknown>;
305
+ }
306
+ interface ExtraResult {
307
+ key: string;
308
+ rows: Row[];
309
+ }
310
+ interface QueryExecuteOptions {
311
+ sql: string;
312
+ params: unknown[];
313
+ /**
314
+ * Named placeholder → object keys. The executor substitutes `{{NAME}}`
315
+ * occurrences in the SQL with the matching `read_parquet([...])` list,
316
+ * choosing between virtual-FS names or native URIs based on whether
317
+ * `dataSource.uri` is available.
318
+ */
319
+ fileKeys: Record<string, string[]>;
320
+ dataSource: DataSource;
321
+ table: TableName;
322
+ signal?: AbortSignal;
323
+ /**
324
+ * Optional callback invoked by the executor when it detects the DuckDB
325
+ * process is approaching a memory ceiling (e.g. ingesting rows after
326
+ * httpfs decode, or materialising a large temp relation). Callers can
327
+ * shed work, warm a spillover path, or warn the user. Advisory only —
328
+ * not all executors implement it.
329
+ */
330
+ onMemoryPressure?: (info: {
331
+ bytes?: number;
332
+ reason: string;
333
+ }) => void;
334
+ }
335
+ interface QueryExecuteResult {
336
+ rows: Row[];
337
+ /** The final SQL actually run (after placeholder substitution). */
338
+ sql: string;
339
+ /**
340
+ * Optional diagnostics the executor may emit for observability + capacity
341
+ * planning. Undefined on executors that don't instrument their runtime.
342
+ *
343
+ * - `peakBytes`: highest resident memory the engine reported during the
344
+ * query. Callers may use this to decide whether to drop / compact state
345
+ * before the next call.
346
+ * - `resetRecommended`: executor thinks the underlying connection should
347
+ * be recycled (fragmented, near ceiling). Caller-owned decision —
348
+ * honored by `BrowserAnalysisRuntime` consumers but not enforced.
349
+ */
350
+ diagnostics?: {
351
+ peakBytes?: number;
352
+ resetRecommended?: boolean;
353
+ };
354
+ }
355
+ interface QueryExecutor {
356
+ execute: (opts: QueryExecuteOptions) => Promise<QueryExecuteResult>;
357
+ }
358
+ interface FileSetRef {
359
+ table: TableName;
360
+ partitions?: string[];
361
+ }
362
+ interface RunSQLOptions {
363
+ ctx: TenantCtx;
364
+ /**
365
+ * Named partition references. Each name becomes a `{{NAME}}` placeholder
366
+ * substituted into the SQL with the matching list of object keys. The
367
+ * canonical name is `FILES`; analyzers also use `FILES_PREV` for a prior
368
+ * window. Providing zero fileSets runs the SQL against no files.
369
+ */
370
+ fileSets: Record<string, FileSetRef>;
371
+ /** Schema-bearing table; defaults to the first fileSet's table. */
372
+ table?: TableName;
373
+ sql: string;
374
+ params?: unknown[];
375
+ signal?: AbortSignal;
376
+ }
377
+ interface StorageEngine {
378
+ writeDay: (ctx: WriteCtx, rows: Row[]) => Promise<void>;
379
+ query: (ctx: QueryCtx, state: BuilderState) => Promise<QueryResult>;
380
+ /**
381
+ * Two-window comparison query (resolver-compiled). Joins a `current` and
382
+ * `previous` window CTE on dimensions, applies an optional row filter
383
+ * (`new`/`lost`/`improving`/`declining`), and returns the merged rows plus
384
+ * total count and unfiltered totals.
385
+ *
386
+ * Tenant scoping comes from `ctx.userId`/`ctx.siteId` (manifest lookup) —
387
+ * the SQL itself is single-tenant against the parquet adapter, which has
388
+ * `includeSiteId: false`.
389
+ *
390
+ * Throws if `current` and `previous` resolve to different tables.
391
+ */
392
+ queryComparison: (ctx: QueryCtx, current: BuilderState, previous: BuilderState, filter?: ComparisonFilter) => Promise<ComparisonResult>;
393
+ /**
394
+ * Canonical-variant enrichment queries. Returns one result per extra
395
+ * surface; today only `queryCanonical` triggers an extra. Empty array
396
+ * when the state has no extras-eligible dimensions.
397
+ */
398
+ queryExtras: (ctx: QueryCtx, state: BuilderState) => Promise<ExtraResult[]>;
399
+ /**
400
+ * Run arbitrary SQL resolved against named partition sets. Composes
401
+ * manifest lookup + object reads + placeholder substitution + execution
402
+ * so callers don't need to reach into `ManifestStore`/`DataSource`
403
+ * directly.
404
+ */
405
+ runSQL: (opts: RunSQLOptions) => Promise<QueryResult>;
406
+ compactTiered: (ctx: WriteCtx, thresholds?: CompactionThresholds) => Promise<void>;
407
+ gcOrphans: (ctx: GcCtx, graceMs: number) => Promise<{
408
+ deleted: number;
409
+ }>;
410
+ /**
411
+ * GDPR-grade tenant purge. Deletes every object under the tenant prefix
412
+ * (parquet, rollups, entity stores), then removes manifest/watermark/
413
+ * sync-state records via {@link ManifestStore.purgeTenant}.
414
+ *
415
+ * Order matters: bytes are deleted before manifest entries, so a
416
+ * crash mid-purge leaves orphan manifest records (detectable via the
417
+ * normal orphan sweep) rather than orphan bytes with no record.
418
+ *
419
+ * Returns counters suitable for an audit log. Caller is responsible
420
+ * for persisting the audit entry.
421
+ */
422
+ purgeTenant: (ctx: TenantCtx) => Promise<PurgeResult>;
423
+ /**
424
+ * GDPR URL-matcher purge. Deletes rows whose `url` column matches one of
425
+ * `urls` across every live parquet entry for the tenant in tables that
426
+ * carry a `url` column (`pages`, `page_keywords`). Tables without a `url`
427
+ * column (`keywords`, `countries`, `devices`, `search_appearance`) are
428
+ * untouched — they never store per-URL data.
429
+ *
430
+ * For each affected entry the engine reads the file, filters the matching
431
+ * rows out, writes a replacement parquet at a new object key, and registers
432
+ * the new entry as a supersede of the old. Entries with no matches are
433
+ * left untouched. Entries with all rows matching are replaced by a
434
+ * schema-bearing empty-rows file.
435
+ *
436
+ * Narrower counterpart to {@link purgeTenant}: use this for a per-URL
437
+ * takedown request; use `purgeTenant` for full-account deletion.
438
+ */
439
+ purgeUrls: (ctx: TenantCtx, urls: readonly string[]) => Promise<PurgeUrlsResult>;
440
+ listLive: (filter: ListLiveFilter) => Promise<ManifestEntry[]>;
441
+ listAll: (filter: ListLiveFilter) => Promise<ManifestEntry[]>;
442
+ getWatermarks: (filter: WatermarkFilter) => Promise<Watermark[]>;
443
+ getSyncStates: (filter: SyncStateFilter) => Promise<SyncState[]>;
444
+ setSyncState: (scope: SyncStateScope, state: SyncStateKind, detail?: SyncStateDetail) => Promise<void>;
445
+ /** Read the raw bytes of a single object. Rarely needed outside the `dump` CLI. */
446
+ readObject: (key: string) => Promise<Uint8Array>;
447
+ }
448
+ interface EngineOptions {
449
+ dataSource: DataSource;
450
+ manifestStore: ManifestStore;
451
+ codec: ParquetCodec;
452
+ executor: QueryExecutor;
453
+ now?: () => number;
454
+ }
455
/** Daily partition name for `date`: the string `daily/<date>`. The input is not validated. */
+ declare function dayPartition(date: string): string;
456
/** Monthly partition name for `month`: the string `monthly/<month>`. The input is not validated. */
+ declare function monthPartition(month: string): string;
457
+ /**
458
+ * Weekly partition keyed by the Monday-of-week ISO date (e.g. `weekly/2026-04-20`
459
+ * for the ISO week containing 2026-04-22). Names are stable + sortable; the
460
+ * dashboard never parses them, only reads via the manifest.
461
+ */
462
+ declare function weekPartition(mondayIsoDate: string): string;
463
+ /**
464
+ * Quarterly partition (e.g. `quarterly/2026-Q2` for Apr-Jun 2026). Used as the
465
+ * cold-tier shape for `d90` compaction outputs.
466
+ */
467
+ declare function quarterPartition(quarter: string): string;
468
+ /**
469
+ * Monday-of-week as a YYYY-MM-DD string for the ISO week containing `isoDate`.
470
+ * Used by tiered compaction to bucket raw daily files into weekly groups.
471
+ */
472
+ declare function mondayOfWeek(isoDate: string): string;
473
+ /** YYYY-Qq for the quarter containing the given YYYY-MM month string. */
474
+ declare function quarterOfMonth(month: string): string;
475
/**
 * Builds the object key for one parquet file:
 * `u_<userId>/[<siteId>/]<table>/[<searchType>/]<partition>__v<version>.parquet`.
 *
 * The `<siteId>/` segment is present only when `ctx.siteId` is set. The
 * `<searchType>/` segment is present only when `searchType` is given and is
 * not `'web'`, so web keys keep the layout without a search-type segment.
 */
+ declare function objectKey(ctx: TenantCtx, table: TableName, partition: string, version: number, searchType?: SearchType): string;
476
+ export { SyncStateFilter as A, dayPartition as B, QueryResult as C, StorageEngine as D, SearchType$1 as E, Watermark as F, objectKey as G, inferSearchType as H, WatermarkFilter as I, weekPartition as J, quarterOfMonth as K, WatermarkScope as L, SyncStateScope as M, TableName$1 as N, SyncState as O, TenantCtx$1 as P, WriteCtx as R, QueryExecutor as S, RunSQLOptions as T, mondayOfWeek as U, inferLegacyTier as V, monthPartition as W, enumeratePartitions as X, CompactionThresholds as Y, PurgeResult as _, DataSource as a, QueryExecuteOptions as b, FileSetRef as c, LockScope as d, ManifestEntry as f, PurgeFilter as g, ParquetCodec as h, DEFAULT_SEARCH_TYPE as i, SyncStateKind as j, SyncStateDetail as k, GcCtx as l, ManifestStore as m, CompactionTier as n, EngineOptions as o, ManifestPurgeResult as p, quarterPartition as q, ComparisonResult as r, ExtraResult as s, CodecCtx as t, ListLiveFilter as u, PurgeUrlsResult as v, Row$1 as w, QueryExecuteResult as x, QueryCtx as y, WriteResult as z };
@@ -0,0 +1,39 @@
1
+ import { MS_PER_DAY, toIsoDate } from "gscdump";
2
/**
 * Search type assumed for entries and sync paths that do not name one.
 */
const DEFAULT_SEARCH_TYPE = "web";

/**
 * Resolves the search type of an entry, falling back to
 * {@link DEFAULT_SEARCH_TYPE} when the field is `undefined` or `null`.
 *
 * The fallback references the shared constant rather than repeating the
 * literal, so the default is defined in exactly one place.
 *
 * @param {{searchType?: string}} entry - Object carrying an optional `searchType`.
 * @returns {string} The entry's search type, or the default.
 */
function inferSearchType(entry) {
  return entry.searchType ?? DEFAULT_SEARCH_TYPE;
}
6
/**
 * Resolves the compaction tier of a manifest entry.
 *
 * An explicit `tier` wins. Otherwise the tier is derived from the partition
 * prefix: `daily/` → `raw`, `monthly/` → `d30`. Any other partition shape
 * yields `undefined`.
 *
 * @param {{partition: string, tier?: string}} entry - Manifest entry (or the relevant subset).
 * @returns {string|undefined} The tier, or `undefined` when it cannot be inferred.
 */
function inferLegacyTier(entry) {
  if (entry.tier !== undefined) {
    return entry.tier;
  }
  const legacyTierByPrefix = [
    ["daily/", "raw"],
    ["monthly/", "d30"],
  ];
  const match = legacyTierByPrefix.find(([prefix]) => entry.partition.startsWith(prefix));
  return match?.[1];
}
11
/**
 * Builds the daily partition name for a date.
 *
 * @param {string} date - Date label; used verbatim, not validated.
 * @returns {string} `daily/<date>`.
 */
function dayPartition(date) {
  const prefix = "daily/";
  return `${prefix}${date}`;
}
14
/**
 * Builds the monthly partition name for a month.
 *
 * @param {string} month - Month label; used verbatim, not validated.
 * @returns {string} `monthly/<month>`.
 */
function monthPartition(month) {
  const prefix = "monthly/";
  return `${prefix}${month}`;
}
17
/**
 * Builds the weekly partition name for a week.
 *
 * @param {string} mondayIsoDate - Date label of the week's Monday; used
 *   verbatim, not validated.
 * @returns {string} `weekly/<mondayIsoDate>`.
 */
function weekPartition(mondayIsoDate) {
  const prefix = "weekly/";
  return `${prefix}${mondayIsoDate}`;
}
20
/**
 * Builds the quarterly partition name for a quarter.
 *
 * @param {string} quarter - Quarter label (e.g. `2026-Q2`); used verbatim,
 *   not validated.
 * @returns {string} `quarterly/<quarter>`.
 */
function quarterPartition(quarter) {
  const prefix = "quarterly/";
  return `${prefix}${quarter}`;
}
23
/**
 * Returns the Monday of the week containing `isoDate`, computed in UTC.
 *
 * The date is parsed as midnight UTC. Sunday (`getUTCDay() === 0`) is treated
 * as the last day of the week, so it maps to the Monday six days earlier.
 *
 * @param {string} isoDate - Calendar date in `YYYY-MM-DD` form.
 * @returns {string} The Monday's date as formatted by `toIsoDate`.
 */
function mondayOfWeek(isoDate) {
  const midnightUtcMs = Date.parse(`${isoDate}T00:00:00Z`);
  const dayOfWeek = new Date(midnightUtcMs).getUTCDay();
  // Days elapsed since Monday: Mon → 0, Tue → 1, …, Sun → 6.
  const daysSinceMonday = (dayOfWeek + 6) % 7;
  const mondayMs = midnightUtcMs - daysSinceMonday * MS_PER_DAY;
  return toIsoDate(new Date(mondayMs));
}
29
/**
 * Returns the `YYYY-Qq` quarter label for a `YYYY-MM` month string.
 *
 * Only the first two `-`-separated fields are used, so a full `YYYY-MM-DD`
 * date is accepted as well. The input is validated so that malformed values
 * cannot silently produce labels such as `NaN-QNaN`, `2026-Q0` or `2026-Q5`.
 *
 * @param {string} month - Month in `YYYY-MM` form, e.g. `2026-04`.
 * @returns {string} Quarter label, e.g. `2026-Q2`.
 * @throws {RangeError} If the year is not an integer or the month is not an
 *   integer in 1..12.
 */
function quarterOfMonth(month) {
  const [year, monthNumber] = month.split("-").map(Number);
  const isValid =
    Number.isInteger(year) &&
    Number.isInteger(monthNumber) &&
    monthNumber >= 1 &&
    monthNumber <= 12;
  if (!isValid) {
    throw new RangeError(`invalid month '${month}' — expected YYYY-MM`);
  }
  // Months 1-3 → Q1, 4-6 → Q2, 7-9 → Q3, 10-12 → Q4.
  const quarter = Math.floor((monthNumber - 1) / 3) + 1;
  return `${year}-Q${quarter}`;
}
33
/**
 * Builds the object key for one parquet file:
 * `u_<userId>/[<siteId>/]<table>/[<searchType>/]<partition>__v<version>.parquet`.
 *
 * @param {{userId: string, siteId?: string}} ctx - Tenant context; the site
 *   segment is added only when `siteId` is truthy.
 * @param {string} table - Table name.
 * @param {string} partition - Partition name, e.g. `daily/2026-04-22`.
 * @param {number} version - Version number appended as `__v<version>`.
 * @param {string} [searchType] - Search type; a path segment is added only
 *   when this is defined and not `"web"`.
 * @returns {string} The object key.
 */
function objectKey(ctx, table, partition, version, searchType) {
  const tableDir = ctx.siteId
    ? `u_${ctx.userId}/${ctx.siteId}/${table}`
    : `u_${ctx.userId}/${table}`;
  // Web keys carry no search-type segment.
  const omitSearchType = searchType === undefined || searchType === "web";
  const searchTypeDir = omitSearchType ? "" : `${searchType}/`;
  return `${tableDir}/${searchTypeDir}${partition}__v${version}.parquet`;
}
36
/**
 * Builds the key prefix under which a tenant's objects live.
 *
 * @param {{userId: string, siteId?: string}} ctx - Tenant context.
 * @returns {string} `u_<userId>/<siteId>/` when `siteId` is truthy,
 *   otherwise `u_<userId>/`.
 */
function tenantPrefix(ctx) {
  const userDir = `u_${ctx.userId}/`;
  if (!ctx.siteId) {
    return userDir;
  }
  return `${userDir}${ctx.siteId}/`;
}
39
+ export { mondayOfWeek as a, quarterOfMonth as c, weekPartition as d, inferSearchType as i, quarterPartition as l, dayPartition as n, monthPartition as o, inferLegacyTier as r, objectKey as s, DEFAULT_SEARCH_TYPE as t, tenantPrefix as u };