@gscdump/engine 0.6.1 → 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/dist/_chunks/compiler.mjs +288 -0
  2. package/dist/_chunks/duckdb.d.mts +26 -0
  3. package/dist/_chunks/engine.mjs +578 -0
  4. package/dist/_chunks/pg-adapter.mjs +676 -0
  5. package/dist/_chunks/planner.d.mts +15 -0
  6. package/dist/_chunks/schema.d.mts +1258 -0
  7. package/dist/_chunks/schema.mjs +139 -0
  8. package/dist/_chunks/storage.d.mts +476 -0
  9. package/dist/_chunks/storage.mjs +39 -0
  10. package/dist/_chunks/types.d.mts +53 -0
  11. package/dist/adapters/duckdb-node.d.mts +1 -13
  12. package/dist/adapters/duckdb-node.mjs +1 -7
  13. package/dist/adapters/filesystem.d.mts +1 -193
  14. package/dist/adapters/filesystem.mjs +2 -9
  15. package/dist/adapters/http.d.mts +1 -193
  16. package/dist/adapters/http.mjs +1 -5
  17. package/dist/adapters/hyparquet.d.mts +6 -83
  18. package/dist/adapters/hyparquet.mjs +1 -105
  19. package/dist/adapters/inspection-sqlite-browser.d.mts +1 -7
  20. package/dist/adapters/inspection-sqlite-node.d.mts +1 -7
  21. package/dist/adapters/inspection-sqlite-node.mjs +1 -1
  22. package/dist/adapters/node-harness.d.mts +3 -306
  23. package/dist/adapters/node-harness.mjs +4 -1866
  24. package/dist/adapters/r2-manifest.d.mts +4 -149
  25. package/dist/adapters/r2-manifest.mjs +1 -8
  26. package/dist/adapters/r2.d.mts +1 -47
  27. package/dist/contracts.d.mts +1 -435
  28. package/dist/entities.d.mts +1 -47
  29. package/dist/index.d.mts +8 -1844
  30. package/dist/index.mjs +8 -1962
  31. package/dist/ingest.d.mts +1 -1
  32. package/dist/planner.d.mts +3 -16
  33. package/dist/planner.mjs +1 -320
  34. package/dist/resolver/index.d.mts +3 -51
  35. package/dist/resolver/index.mjs +2 -780
  36. package/dist/rollups.d.mts +6 -51
  37. package/dist/rollups.mjs +2 -209
  38. package/dist/schema.d.mts +2 -1258
  39. package/dist/schema.mjs +1 -138
  40. package/package.json +2 -2
@@ -1,110 +1,6 @@
1
+ import { s as TABLE_METADATA, t as SCHEMAS } from "../_chunks/schema.mjs";
1
2
  import { parquetReadObjects } from "hyparquet";
2
3
  import { parquetWriteBuffer } from "hyparquet-writer";
3
- import { date, doublePrecision, getTableConfig, integer, pgTable, varchar } from "drizzle-orm/pg-core";
4
- function metricCols() {
5
- return {
6
- clicks: integer("clicks").notNull(),
7
- impressions: integer("impressions").notNull(),
8
- sum_position: doublePrecision("sum_position").notNull()
9
- };
10
- }
11
- const dateCol = () => date("date").notNull();
12
- const drizzleSchema = {
13
- pages: pgTable("pages", {
14
- url: varchar("url").notNull(),
15
- date: dateCol(),
16
- ...metricCols()
17
- }),
18
- keywords: pgTable("keywords", {
19
- query: varchar("query").notNull(),
20
- query_canonical: varchar("query_canonical"),
21
- date: dateCol(),
22
- ...metricCols()
23
- }),
24
- countries: pgTable("countries", {
25
- country: varchar("country").notNull(),
26
- date: dateCol(),
27
- ...metricCols()
28
- }),
29
- devices: pgTable("devices", {
30
- device: varchar("device").notNull(),
31
- date: dateCol(),
32
- ...metricCols()
33
- }),
34
- page_keywords: pgTable("page_keywords", {
35
- url: varchar("url").notNull(),
36
- query: varchar("query").notNull(),
37
- query_canonical: varchar("query_canonical"),
38
- date: dateCol(),
39
- ...metricCols()
40
- }),
41
- search_appearance: pgTable("search_appearance", {
42
- searchAppearance: varchar("searchAppearance").notNull(),
43
- date: dateCol(),
44
- ...metricCols()
45
- })
46
- };
47
- const TABLE_METADATA = {
48
- pages: {
49
- sortKey: ["date", "url"],
50
- version: 1
51
- },
52
- keywords: {
53
- sortKey: ["date", "query"],
54
- version: 2
55
- },
56
- countries: {
57
- sortKey: ["date", "country"],
58
- version: 1
59
- },
60
- devices: {
61
- sortKey: ["date", "device"],
62
- version: 1
63
- },
64
- page_keywords: {
65
- sortKey: [
66
- "date",
67
- "url",
68
- "query"
69
- ],
70
- version: 2
71
- },
72
- search_appearance: {
73
- sortKey: ["date", "searchAppearance"],
74
- version: 1
75
- }
76
- };
77
- function pgSqlTypeToColumnType(sqlType) {
78
- const t = sqlType.toLowerCase();
79
- if (t.startsWith("varchar") || t === "text" || t.startsWith("char")) return "VARCHAR";
80
- if (t === "date" || t.startsWith("timestamp")) return "DATE";
81
- if (t.startsWith("double") || t === "real" || t.startsWith("numeric") || t.startsWith("decimal")) return "DOUBLE";
82
- if (t === "bigint" || t === "int8") return "BIGINT";
83
- if (t === "integer" || t === "int" || t === "int4" || t === "smallint" || t === "int2") return "INTEGER";
84
- throw new Error(`unmapped pg type '${sqlType}' — extend pgSqlTypeToColumnType in @gscdump/engine/schema`);
85
- }
86
- function tableSchemaFrom(tableName) {
87
- const columns = getTableConfig(drizzleSchema[tableName]).columns.map((col) => ({
88
- name: col.name,
89
- type: pgSqlTypeToColumnType(col.getSQLType()),
90
- nullable: !col.notNull
91
- }));
92
- const meta = TABLE_METADATA[tableName];
93
- return {
94
- name: tableName,
95
- columns,
96
- sortKey: meta.sortKey,
97
- version: meta.version
98
- };
99
- }
100
- const SCHEMAS = Object.fromEntries([
101
- "pages",
102
- "keywords",
103
- "countries",
104
- "devices",
105
- "page_keywords",
106
- "search_appearance"
107
- ].map((t) => [t, tableSchemaFrom(t)]));
108
4
  const ROW_GROUP_SIZE = 25e3;
109
5
  function basicTypeFor(colType) {
110
6
  if (colType === "VARCHAR" || colType === "DATE") return "STRING";
@@ -1,9 +1,3 @@
1
- interface InspectionSqlDriver {
2
- exec: (sql: string) => void | Promise<void>;
3
- run: (sql: string, params: unknown[]) => void | Promise<void>;
4
- all: (sql: string, params: unknown[]) => unknown[] | Promise<unknown[]>;
5
- serialize: () => Uint8Array | Promise<Uint8Array>;
6
- close: () => void | Promise<void>;
7
- }
1
+ import { InspectionSqlDriver } from "../entities.mjs";
8
2
  declare function createWaSqliteDriver(bytes: Uint8Array | undefined): Promise<InspectionSqlDriver>;
9
3
  export { createWaSqliteDriver };
@@ -1,9 +1,3 @@
1
- interface InspectionSqlDriver {
2
- exec: (sql: string) => void | Promise<void>;
3
- run: (sql: string, params: unknown[]) => void | Promise<void>;
4
- all: (sql: string, params: unknown[]) => unknown[] | Promise<unknown[]>;
5
- serialize: () => Uint8Array | Promise<Uint8Array>;
6
- close: () => void | Promise<void>;
7
- }
1
+ import { InspectionSqlDriver } from "../entities.mjs";
8
2
  declare function createBetterSqliteDriver(bytes: Uint8Array | undefined): InspectionSqlDriver;
9
3
  export { createBetterSqliteDriver };
@@ -1,7 +1,7 @@
1
1
  import { createRequire } from "node:module";
2
- import { Buffer } from "node:buffer";
3
2
  import process from "node:process";
4
3
  import { fileURLToPath } from "node:url";
4
+ import { Buffer } from "node:buffer";
5
5
  const require_ = createRequire(typeof __filename !== "undefined" ? __filename : typeof import.meta !== "undefined" ? fileURLToPath(import.meta.url) : process.cwd());
6
6
  function loadBetterSqlite() {
7
7
  const mod = require_("better-sqlite3");
@@ -1,307 +1,4 @@
1
- import { Row, Row as Row$1, TableName, TableName as TableName$1, TenantCtx } from "gscdump/contracts";
2
- import { BuilderState, SearchType } from "gscdump/query";
3
- /**
4
- * Per-tier age threshold in days. Default ladder collapses on these gates:
5
- * - raw → d7 once a daily file is older than `raw` days (default 7).
6
- * - d7 → d30 once the entire weekly bucket sits behind `d7` days (default 30).
7
- * - d30 → d90 once the entire monthly bucket sits behind `d30` days (default 90).
8
- */
9
- interface CompactionThresholds {
10
- raw?: number;
11
- d7?: number;
12
- d30?: number;
13
- }
14
- type ComparisonFilter = 'new' | 'lost' | 'improving' | 'declining';
15
- interface WriteCtx extends TenantCtx {
16
- table: TableName;
17
- date?: string;
18
- now?: () => number;
19
- /**
20
- * GSC search-type partition this write belongs to. Defaults to `'web'`.
21
- * Non-web values (`discover`, `news`, `googleNews`, `image`, `video`)
22
- * cause the writer to insert the type into the object key path so files
23
- * for different search types coexist without colliding.
24
- */
25
- searchType?: SearchType;
26
- }
27
- interface QueryCtx extends TenantCtx {
28
- table?: TableName;
29
- signal?: AbortSignal;
30
- }
31
- interface GcCtx {
32
- now?: () => number;
33
- userId?: string;
34
- siteId?: string;
35
- }
36
- /**
37
- * Compaction tier of a manifest entry. Determines which compactor stage may
38
- * pick it up as input:
39
- * - `raw`: per-day file produced by `writeDay`. Eligible for raw→d7 merge at 7d.
40
- * - `d7`: weekly compaction output. Eligible for d7→d30 merge at 30d.
41
- * - `d30`: monthly compaction output (matches the legacy `monthly/` partition
42
- * shape — pre-tier entries are read as `d30`). Eligible for d30→d90 at 90d.
43
- * - `d90`: quarterly cold-tier output. Terminal; never recompacted.
44
- *
45
- * Without an explicit tier, entries written before this field landed default
46
- * to `raw` for `daily/` partitions and `d30` for `monthly/` partitions, so
47
- * the tiered compactor picks the right inputs without a backfill rewrite.
48
- */
49
- type CompactionTier = 'raw' | 'd7' | 'd30' | 'd90';
50
- interface ManifestEntry {
51
- userId: string;
52
- siteId?: string;
53
- table: TableName;
54
- partition: string;
55
- objectKey: string;
56
- rowCount: number;
57
- bytes: number;
58
- createdAt: number;
59
- retiredAt?: number;
60
- /** Table schema version at write time. Omitted on pre-#27 entries — treat as 1. */
61
- schemaVersion?: number;
62
- /**
63
- * Compaction tier. Omitted on entries written before tiered compaction —
64
- * treat as `raw` for `daily/` partitions and `d30` for `monthly/` partitions
65
- * (see {@link inferLegacyTier}).
66
- */
67
- tier?: CompactionTier;
68
- /**
69
- * GSC search-type this entry covers (web | discover | news | googleNews |
70
- * image | video). Omitted on entries written before per-type partitioning
71
- * landed — treat as `web` (see {@link inferSearchType}). Compaction merges
72
- * only entries with the same searchType.
73
- */
74
- searchType?: SearchType;
75
- }
76
- interface ListLiveFilter {
77
- userId: string;
78
- siteId?: string;
79
- table?: TableName;
80
- partitions?: string[];
81
- /**
82
- * Narrow to a single compaction tier. Tier-aware compaction stages set this
83
- * so the store doesn't have to return (and the caller doesn't have to scan)
84
- * the entire manifest just to compact the raw cohort. Legacy entries without
85
- * an explicit `tier` field match on {@link inferLegacyTier}.
86
- */
87
- tier?: CompactionTier;
88
- }
89
- interface DataSource {
90
- read: (key: string, range?: {
91
- offset: number;
92
- length: number;
93
- }, signal?: AbortSignal) => Promise<Uint8Array>;
94
- write: (key: string, bytes: Uint8Array) => Promise<void>;
95
- delete: (keys: string[]) => Promise<void>;
96
- /**
97
- * One-shot listing under a prefix. Implementations may cap the number of
98
- * returned keys (typically 10k) — callers iterating full tenant space
99
- * should prefer `streamList` when available or narrow the prefix.
100
- */
101
- list: (prefix: string) => Promise<string[]>;
102
- /**
103
- * Per-key URI probe. Returns a URI string DuckDB's `httpfs` (or an
104
- * equivalent engine that fetches its own I/O) can read directly, or
105
- * `undefined` if the key isn't URI-resolvable on this backend and the
106
- * caller must fall back to `read(key)` for the bytes.
107
- *
108
- * Contracts:
109
- * - When defined, the returned URI MUST yield byte-identical content to
110
- * `read(key)`. Callers rely on this for correctness.
111
- * - Backends with a native URI for every key (filesystem: absolute path,
112
- * R2 via `httpfs`: signed URL) may always return a string.
113
- * - Backends without a native URI shape (in-memory) omit the method or
114
- * return `undefined` per call.
115
- * - Mixed-per-query is allowed: some keys in one query may return a URI,
116
- * others may not; the executor branches per key.
117
- */
118
- uri?: (key: string) => string | undefined;
119
- /**
120
- * Optional — probe the byte size of a key without reading it. Used by
121
- * the engine to fill in `WriteResult.bytes` when a codec reports 0 or
122
- * unknown but the file is non-trivial.
123
- */
124
- head?: (key: string) => Promise<{
125
- bytes: number;
126
- } | undefined>;
127
- /**
128
- * Optional streaming variant of `list`. Implementations that page
129
- * backing-store results (R2, S3) should implement this and yield keys
130
- * lazily. `list` may return up to an adapter-defined cap (typically
131
- * 10k keys); callers iterating full tenant space must prefer
132
- * `streamList` when available, or chunk by narrower prefixes.
133
- */
134
- streamList?: (prefix: string) => AsyncIterable<string>;
135
- }
136
- interface WatermarkScope {
137
- userId: string;
138
- siteId?: string;
139
- table: TableName;
140
- }
141
- interface Watermark extends WatermarkScope {
142
- newestDateSynced: string;
143
- oldestDateSynced: string;
144
- lastSyncAt: number;
145
- }
146
- interface WatermarkFilter {
147
- userId: string;
148
- siteId?: string;
149
- table?: TableName;
150
- }
151
- type SyncStateKind = 'pending' | 'inflight' | 'done' | 'failed';
152
- interface SyncStateScope {
153
- userId: string;
154
- siteId?: string;
155
- table: TableName;
156
- date: string;
157
- /**
158
- * GSC search-type this sync state covers. Omitted = `web` (the legacy
159
- * default; matches pre-#5 sync states stored before per-type sync landed).
160
- * Lookups must compare via {@link inferSearchType} so a missing field
161
- * matches an explicit `'web'` and vice versa.
162
- */
163
- searchType?: SearchType;
164
- }
165
- interface SyncState extends SyncStateScope {
166
- state: SyncStateKind;
167
- updatedAt: number;
168
- attempts: number;
169
- error?: string;
170
- }
171
- interface SyncStateFilter {
172
- userId: string;
173
- siteId?: string;
174
- table?: TableName;
175
- state?: SyncStateKind;
176
- searchType?: SearchType;
177
- }
178
- interface SyncStateDetail {
179
- at?: number;
180
- error?: string;
181
- }
182
- interface PurgeResult {
183
- userId: string;
184
- siteId?: string;
185
- prefix: string;
186
- objectsDeleted: number;
187
- entriesRemoved: number;
188
- watermarksRemoved: number;
189
- syncStatesRemoved: number;
190
- at: number;
191
- }
192
- interface PurgeUrlsResult {
193
- userId: string;
194
- siteId?: string;
195
- urlsRequested: number;
196
- entriesRewritten: number;
197
- rowsRemoved: number;
198
- bytesAfter: number;
199
- at: number;
200
- }
201
- interface QueryResult {
202
- rows: Row[];
203
- sql: string;
204
- objectKeys: string[];
205
- }
206
- interface ComparisonResult {
207
- rows: Row[];
208
- totalCount: number;
209
- totals: Record<string, unknown>;
210
- }
211
- interface ExtraResult {
212
- key: string;
213
- rows: Row[];
214
- }
215
- interface FileSetRef {
216
- table: TableName;
217
- partitions?: string[];
218
- }
219
- interface RunSQLOptions {
220
- ctx: TenantCtx;
221
- /**
222
- * Named partition references. Each name becomes a `{{NAME}}` placeholder
223
- * substituted into the SQL with the matching list of object keys. The
224
- * canonical name is `FILES`; analyzers also use `FILES_PREV` for a prior
225
- * window. Providing zero fileSets runs the SQL against no files.
226
- */
227
- fileSets: Record<string, FileSetRef>;
228
- /** Schema-bearing table; defaults to the first fileSet's table. */
229
- table?: TableName;
230
- sql: string;
231
- params?: unknown[];
232
- signal?: AbortSignal;
233
- }
234
- interface StorageEngine {
235
- writeDay: (ctx: WriteCtx, rows: Row[]) => Promise<void>;
236
- query: (ctx: QueryCtx, state: BuilderState) => Promise<QueryResult>;
237
- /**
238
- * Two-window comparison query (resolver-compiled). Joins a `current` and
239
- * `previous` window CTE on dimensions, applies an optional row filter
240
- * (`new`/`lost`/`improving`/`declining`), and returns the merged rows plus
241
- * total count and unfiltered totals.
242
- *
243
- * Tenant scoping comes from `ctx.userId`/`ctx.siteId` (manifest lookup) —
244
- * the SQL itself is single-tenant against the parquet adapter, which has
245
- * `includeSiteId: false`.
246
- *
247
- * Throws if `current` and `previous` resolve to different tables.
248
- */
249
- queryComparison: (ctx: QueryCtx, current: BuilderState, previous: BuilderState, filter?: ComparisonFilter) => Promise<ComparisonResult>;
250
- /**
251
- * Canonical-variant enrichment queries. Returns one result per extra
252
- * surface; today only `queryCanonical` triggers an extra. Empty array
253
- * when the state has no extras-eligible dimensions.
254
- */
255
- queryExtras: (ctx: QueryCtx, state: BuilderState) => Promise<ExtraResult[]>;
256
- /**
257
- * Run arbitrary SQL resolved against named partition sets. Composes
258
- * manifest lookup + object reads + placeholder substitution + execution
259
- * so callers don't need to reach into `ManifestStore`/`DataSource`
260
- * directly.
261
- */
262
- runSQL: (opts: RunSQLOptions) => Promise<QueryResult>;
263
- compactTiered: (ctx: WriteCtx, thresholds?: CompactionThresholds) => Promise<void>;
264
- gcOrphans: (ctx: GcCtx, graceMs: number) => Promise<{
265
- deleted: number;
266
- }>;
267
- /**
268
- * GDPR-grade tenant purge. Deletes every object under the tenant prefix
269
- * (parquet, rollups, entity stores), then removes manifest/watermark/
270
- * sync-state records via {@link ManifestStore.purgeTenant}.
271
- *
272
- * Order matters: bytes are deleted before manifest entries, so a
273
- * crash mid-purge leaves orphan manifest records (detectable via the
274
- * normal orphan sweep) rather than orphan bytes with no record.
275
- *
276
- * Returns counters suitable for an audit log. Caller is responsible
277
- * for persisting the audit entry.
278
- */
279
- purgeTenant: (ctx: TenantCtx) => Promise<PurgeResult>;
280
- /**
281
- * GDPR URL-matcher purge. Deletes rows whose `url` column matches one of
282
- * `urls` across every live parquet entry for the tenant in tables that
283
- * carry a `url` column (`pages`, `page_keywords`). Tables without a `url`
284
- * column (`keywords`, `countries`, `devices`, `search_appearance`) are
285
- * untouched — they never store per-URL data.
286
- *
287
- * For each affected entry the engine reads the file, filters the matching
288
- * rows out, writes a replacement parquet at a new object key, and registers
289
- * the new entry as a supersede of the old. Entries with no matches are
290
- * left untouched. Entries with all rows matching are replaced by a
291
- * schema-bearing empty-rows file.
292
- *
293
- * Narrower counterpart to {@link purgeTenant}: use this for a per-URL
294
- * takedown request; use `purgeTenant` for full-account deletion.
295
- */
296
- purgeUrls: (ctx: TenantCtx, urls: readonly string[]) => Promise<PurgeUrlsResult>;
297
- listLive: (filter: ListLiveFilter) => Promise<ManifestEntry[]>;
298
- listAll: (filter: ListLiveFilter) => Promise<ManifestEntry[]>;
299
- getWatermarks: (filter: WatermarkFilter) => Promise<Watermark[]>;
300
- getSyncStates: (filter: SyncStateFilter) => Promise<SyncState[]>;
301
- setSyncState: (scope: SyncStateScope, state: SyncStateKind, detail?: SyncStateDetail) => Promise<void>;
302
- /** Read the raw bytes of a single object. Rarely needed outside the `dump` CLI. */
303
- readObject: (key: string) => Promise<Uint8Array>;
304
- }
1
+ import { D as StorageEngine, N as TableName, a as DataSource, w as Row } from "../_chunks/storage.mjs";
305
2
  interface NodeHarnessOptions {
306
3
  dataDir: string;
307
4
  /** Tenant user id. Defaults to `'local'` for single-user CLI installs. */
@@ -322,10 +19,10 @@ interface NodeHarness {
322
19
  runRawSql: (opts: {
323
20
  sql: string;
324
21
  siteUrl: string;
325
- table: TableName$1;
22
+ table: TableName;
326
23
  params?: unknown[];
327
24
  }) => Promise<{
328
- rows: Row$1[];
25
+ rows: Row[];
329
26
  sql: string;
330
27
  keys: string[];
331
28
  }>;