@gscdump/engine 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/LICENSE +21 -0
  2. package/README.md +53 -0
  3. package/dist/adapters/duckdb-node.d.mts +19 -0
  4. package/dist/adapters/duckdb-node.mjs +78 -0
  5. package/dist/adapters/filesystem.d.mts +206 -0
  6. package/dist/adapters/filesystem.mjs +320 -0
  7. package/dist/adapters/http.d.mts +227 -0
  8. package/dist/adapters/http.mjs +119 -0
  9. package/dist/adapters/hyparquet.d.mts +107 -0
  10. package/dist/adapters/hyparquet.mjs +250 -0
  11. package/dist/adapters/inspection-sqlite-browser.d.mts +9 -0
  12. package/dist/adapters/inspection-sqlite-browser.mjs +42 -0
  13. package/dist/adapters/inspection-sqlite-node.d.mts +9 -0
  14. package/dist/adapters/inspection-sqlite-node.mjs +32 -0
  15. package/dist/adapters/node-harness.d.mts +334 -0
  16. package/dist/adapters/node-harness.mjs +1907 -0
  17. package/dist/adapters/r2-manifest.d.mts +227 -0
  18. package/dist/adapters/r2-manifest.mjs +355 -0
  19. package/dist/adapters/r2.d.mts +93 -0
  20. package/dist/adapters/r2.mjs +65 -0
  21. package/dist/arrow-utils.d.mts +14 -0
  22. package/dist/arrow-utils.mjs +8 -0
  23. package/dist/contracts.d.mts +436 -0
  24. package/dist/contracts.mjs +1 -0
  25. package/dist/entities.d.mts +238 -0
  26. package/dist/entities.mjs +359 -0
  27. package/dist/index.d.mts +1849 -0
  28. package/dist/index.mjs +1976 -0
  29. package/dist/ingest.d.mts +96 -0
  30. package/dist/ingest.mjs +187 -0
  31. package/dist/planner.d.mts +16 -0
  32. package/dist/planner.mjs +321 -0
  33. package/dist/resolver/index.d.mts +207 -0
  34. package/dist/resolver/index.mjs +869 -0
  35. package/dist/rollups.d.mts +207 -0
  36. package/dist/rollups.mjs +553 -0
  37. package/dist/schema.d.mts +1258 -0
  38. package/dist/schema.mjs +139 -0
  39. package/dist/scope.d.mts +38 -0
  40. package/dist/scope.mjs +28 -0
  41. package/dist/snapshot.d.mts +14 -0
  42. package/dist/snapshot.mjs +1 -0
  43. package/dist/sql-bind.d.mts +19 -0
  44. package/dist/sql-bind.mjs +92 -0
  45. package/dist/sql-fragments.d.mts +21 -0
  46. package/dist/sql-fragments.mjs +13 -0
  47. package/package.json +168 -0
@@ -0,0 +1,207 @@
1
+ import { ColumnDef, Row, TableName, TenantCtx } from "gscdump/contracts";
2
+ interface DataSource {
3
+ read: (key: string, range?: {
4
+ offset: number;
5
+ length: number;
6
+ }, signal?: AbortSignal) => Promise<Uint8Array>;
7
+ write: (key: string, bytes: Uint8Array) => Promise<void>;
8
+ delete: (keys: string[]) => Promise<void>;
9
+ /**
10
+ * One-shot listing under a prefix. Implementations may cap the number of
11
+ * returned keys (typically 10k) — callers iterating the full tenant space
12
+ * should prefer `streamList` when available or narrow the prefix.
13
+ */
14
+ list: (prefix: string) => Promise<string[]>;
15
+ /**
16
+ * Per-key URI probe. Returns a URI string DuckDB's `httpfs` (or an
17
+ * equivalent engine that fetches its own I/O) can read directly, or
18
+ * `undefined` if the key isn't URI-resolvable on this backend and the
19
+ * caller must fall back to `read(key)` for the bytes.
20
+ *
21
+ * Contracts:
22
+ * - When defined, the returned URI MUST yield byte-identical content to
23
+ * `read(key)`. Callers rely on this for correctness.
24
+ * - Backends with a native URI for every key (filesystem: absolute path,
25
+ * R2 via `httpfs`: signed URL) may always return a string.
26
+ * - Backends without a native URI shape (in-memory) omit the method or
27
+ * return `undefined` per call.
28
+ * - Mixed-per-query is allowed: some keys in one query may return a URI,
29
+ * others may not; the executor branches per key.
30
+ */
31
+ uri?: (key: string) => string | undefined;
32
+ /**
33
+ * Optional — probe the byte size of a key without reading it. Used by
34
+ * the engine to fill in `WriteResult.bytes` when a codec reports 0 or
35
+ * unknown but the file is non-trivial.
36
+ */
37
+ head?: (key: string) => Promise<{
38
+ bytes: number;
39
+ } | undefined>;
40
+ /**
41
+ * Optional streaming variant of `list`. Implementations that page
42
+ * backing-store results (R2, S3) should implement this and yield keys
43
+ * lazily. `list` may return up to an adapter-defined cap (typically
44
+ * 10k keys); callers iterating the full tenant space must prefer
45
+ * `streamList` when available, or chunk by narrower prefixes.
46
+ */
47
+ streamList?: (prefix: string) => AsyncIterable<string>;
48
+ }
49
+ interface RollupCtx extends TenantCtx {
50
+ /** When the rollup was built. Stamped into payload + filename. */
51
+ builtAt: number;
52
+ }
53
+ /**
54
+ * Tenant-scoped engine surface a rollup builder needs. Subset of
55
+ * `StorageEngine.runSQL` so rollups stay testable without a full engine.
56
+ */
57
+ interface RollupEngine {
58
+ runSQL: (opts: {
59
+ ctx: TenantCtx;
60
+ fileSets: Record<string, {
61
+ table: TableName;
62
+ partitions?: string[];
63
+ }>;
64
+ table?: TableName;
65
+ sql: string;
66
+ params?: unknown[];
67
+ }) => Promise<{
68
+ rows: Row[];
69
+ }>;
70
+ }
71
+ /**
72
+ * One rollup definition. Build runs SQL over the tenant's facts and/or reads
73
+ * from entity stores via `dataSource`, returning a JSON-serializable payload
74
+ * that the runner timestamps + writes.
75
+ */
76
+ interface RollupDef {
77
+ id: string;
78
+ /**
79
+ * Window in days the rollup covers. `null` means full history. Used by
80
+ * the runner to populate `windowDays` in the payload metadata so readers
81
+ * can validate freshness.
82
+ */
83
+ windowDays: number | null;
84
+ /**
85
+ * Storage format. `'json'` (default) wraps the build payload in a
86
+ * `RollupEnvelope` and writes it as a JSON blob. `'parquet'` expects `build`
87
+ * to return rows matching `parquetColumns` and writes a parquet file plus
88
+ * a tiny JSON sidecar envelope that points at it, so metadata
89
+ * (`builtAt` / `windowDays`) stays readable without decoding parquet.
90
+ */
91
+ format?: 'json' | 'parquet';
92
+ /**
93
+ * Column schema for parquet output. Required when `format === 'parquet'`.
94
+ * Types map the same way as the fact-table encoder: VARCHAR / DATE go
95
+ * through BYTE_ARRAY/UTF8; BIGINT → INT64; INTEGER → INT32; DOUBLE → DOUBLE.
96
+ */
97
+ parquetColumns?: readonly ColumnDef[];
98
+ /** Sort-key column names for parquet row-group stats. Optional. */
99
+ parquetSortKey?: readonly string[];
100
+ build: (deps: {
101
+ engine: RollupEngine;
102
+ ctx: TenantCtx;
103
+ /**
104
+ * Tenant-scoped object store. Rollups that aggregate over entity
105
+ * snapshots (e.g. indexing metadata) read JSON docs through this.
106
+ * Pure-SQL rollups can ignore it.
107
+ */
108
+ dataSource: DataSource;
109
+ /**
110
+ * Wall-clock millis when the runner started this rollup. Use for
111
+ * derived window cutoffs (e.g. trailing-28d boundary) so the SQL can
112
+ * inline a date literal and stay portable across DuckDB builds that
113
+ * don't bundle the ICU extension (Workers DuckDB, for one — CURRENT_DATE
114
+ * lives in ICU).
115
+ */
116
+ builtAt: number;
117
+ }) => Promise<unknown>;
118
+ }
119
+ /**
120
+ * Wire shape persisted to R2/disk. Readers can rely on the `version` and `builtAt` fields.
121
+ * Parquet rollups write this envelope as a sidecar whose `payload` points at
122
+ * the co-located `.parquet` object via `{ parquetKey, rowCount }`.
123
+ */
124
+ interface RollupEnvelope<T = unknown> {
125
+ version: 1;
126
+ id: string;
127
+ builtAt: number;
128
+ windowDays: number | null;
129
+ payload: T;
130
+ }
131
+ interface ParquetRollupPointer {
132
+ parquetKey: string;
133
+ rowCount: number;
134
+ }
135
+ declare function rollupKey(ctx: TenantCtx, id: string, builtAt: number): string;
136
+ declare function rollupParquetKey(ctx: TenantCtx, id: string, builtAt: number): string;
137
+ interface RebuildRollupsOptions {
138
+ engine: RollupEngine;
139
+ dataSource: DataSource;
140
+ ctx: TenantCtx;
141
+ defs: readonly RollupDef[];
142
+ now?: () => number;
143
+ }
144
+ interface RebuildRollupResult {
145
+ id: string;
146
+ /** JSON envelope key. For parquet rollups this is the sidecar pointer. */
147
+ objectKey: string;
148
+ /** Parquet payload key. Present only when `format === 'parquet'`. */
149
+ parquetKey?: string;
150
+ /** Envelope byte size; for parquet rollups does NOT include parquet bytes. */
151
+ bytes: number;
152
+ /** Parquet payload byte size when `format === 'parquet'`. */
153
+ parquetBytes?: number;
154
+ builtAt: number;
155
+ }
156
+ declare function rebuildRollups(opts: RebuildRollupsOptions): Promise<RebuildRollupResult[]>;
157
+ /**
158
+ * Daily totals across the full history. One row per (date, table) with
159
+ * clicks + impressions + position. Powers sparklines and headline totals.
160
+ *
161
+ * Includes `anonymizedImpressionsPct` per day computed as
162
+ * 1 - sum(query_grained_impressions) / sum(page_grained_impressions)
163
+ * — surfaces GSC's anonymous-query gap so the dashboard can warn users not
164
+ * to trust query-grained breakdowns as comprehensive.
165
+ */
166
+ declare const dailyTotalsRollup: RollupDef;
167
+ /** Weekly totals, ISO week aligned. Cheap and stable for trend widgets. */
168
+ declare const weeklyTotalsRollup: RollupDef;
169
+ /**
170
+ * Top 1000 pages by clicks over the trailing 28-day window. JSON for v1;
171
+ * promote to parquet (`top_pages_28d.parquet`) when the dashboard needs
172
+ * server-side WHERE filtering on this rollup.
173
+ */
174
+ declare const topPages28dRollup: RollupDef;
175
+ /**
176
+ * Top 250 countries by clicks over the trailing 28-day window. Country
177
+ * cardinality is bounded (~250 ISO codes), so the list fits in a tiny JSON
178
+ * payload regardless of traffic shape. Powers a geo-overview widget without
179
+ * spinning up DuckDB-WASM.
180
+ */
181
+ declare const topCountries28dRollup: RollupDef;
182
+ /** Top 1000 keywords by clicks over the trailing 28-day window. */
183
+ declare const topKeywords28dRollup: RollupDef;
184
+ /**
185
+ * Parquet-format companion to `topKeywords28dRollup`. Same shape, but persists
186
+ * as a parquet object plus JSON sidecar pointer so widgets that need
187
+ * server-side WHERE (filter by prefix, by clicks threshold, paginate) can scan
188
+ * it directly with DuckDB-WASM instead of loading all 1000 rows into JS.
189
+ *
190
+ * Opt-in: include in the caller's rollup def list alongside (or instead of)
191
+ * the JSON variant; the runner treats the two as independent ids so they can
192
+ * coexist during a migration.
193
+ */
194
+ declare const topKeywords28dParquetRollup: RollupDef;
195
+ /**
196
+ * Aggregates the per-URL Indexing API metadata entity store (populated by
197
+ * `gscdump entities indexing snapshot`) into daily counts of `URL_UPDATED`
198
+ * and `URL_REMOVED` notifications. Covers the third entity-snapshot shape
199
+ * without needing its own parquet family — publish events are sparse and
200
+ * aggregate cleanly into a small JSON rollup.
201
+ *
202
+ * Safe no-op when the entity store is empty: returns `{ totals: {...}, days: [] }`
203
+ * so downstream readers don't have to special-case first-run sites.
204
+ */
205
+ declare const indexingMetadataRollup: RollupDef;
206
+ declare const DEFAULT_ROLLUPS: readonly RollupDef[];
207
+ export { DEFAULT_ROLLUPS, ParquetRollupPointer, RebuildRollupResult, RebuildRollupsOptions, RollupCtx, RollupDef, RollupEngine, RollupEnvelope, dailyTotalsRollup, indexingMetadataRollup, rebuildRollups, rollupKey, rollupParquetKey, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };