@gscdump/engine 0.4.0 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. package/README.md +20 -3
  2. package/dist/_chunks/compiler.mjs +288 -0
  3. package/dist/_chunks/duckdb.d.mts +26 -0
  4. package/dist/_chunks/engine.mjs +578 -0
  5. package/dist/_chunks/pg-adapter.mjs +676 -0
  6. package/dist/_chunks/planner.d.mts +15 -0
  7. package/dist/_chunks/schema.d.mts +1258 -0
  8. package/dist/_chunks/schema.mjs +139 -0
  9. package/dist/_chunks/storage.d.mts +476 -0
  10. package/dist/_chunks/storage.mjs +39 -0
  11. package/dist/_chunks/types.d.mts +53 -0
  12. package/dist/adapters/duckdb-node.d.mts +1 -13
  13. package/dist/adapters/duckdb-node.mjs +1 -7
  14. package/dist/adapters/filesystem.d.mts +1 -193
  15. package/dist/adapters/filesystem.mjs +2 -9
  16. package/dist/adapters/http.d.mts +1 -193
  17. package/dist/adapters/http.mjs +1 -5
  18. package/dist/adapters/hyparquet.d.mts +6 -83
  19. package/dist/adapters/hyparquet.mjs +1 -105
  20. package/dist/adapters/inspection-sqlite-browser.d.mts +1 -7
  21. package/dist/adapters/inspection-sqlite-node.d.mts +1 -7
  22. package/dist/adapters/inspection-sqlite-node.mjs +1 -1
  23. package/dist/adapters/node-harness.d.mts +3 -306
  24. package/dist/adapters/node-harness.mjs +4 -1866
  25. package/dist/adapters/r2-manifest.d.mts +4 -149
  26. package/dist/adapters/r2-manifest.mjs +1 -8
  27. package/dist/adapters/r2.d.mts +1 -47
  28. package/dist/contracts.d.mts +1 -435
  29. package/dist/entities.d.mts +1 -47
  30. package/dist/index.d.mts +8 -1844
  31. package/dist/index.mjs +8 -1962
  32. package/dist/ingest.d.mts +1 -1
  33. package/dist/planner.d.mts +3 -16
  34. package/dist/planner.mjs +1 -320
  35. package/dist/resolver/index.d.mts +3 -51
  36. package/dist/resolver/index.mjs +2 -780
  37. package/dist/rollups.d.mts +6 -51
  38. package/dist/rollups.mjs +2 -209
  39. package/dist/schema.d.mts +2 -1258
  40. package/dist/schema.mjs +1 -138
  41. package/package.json +5 -5
@@ -1,51 +1,6 @@
1
- import { ColumnDef, Row, TableName, TenantCtx } from "gscdump/contracts";
2
- interface DataSource {
3
- read: (key: string, range?: {
4
- offset: number;
5
- length: number;
6
- }, signal?: AbortSignal) => Promise<Uint8Array>;
7
- write: (key: string, bytes: Uint8Array) => Promise<void>;
8
- delete: (keys: string[]) => Promise<void>;
9
- /**
10
- * One-shot listing under a prefix. Implementations may cap the number of
11
- * returned keys (typically 10k) — callers iterating full tenant space
12
- * should prefer `streamList` when available or narrow the prefix.
13
- */
14
- list: (prefix: string) => Promise<string[]>;
15
- /**
16
- * Per-key URI probe. Returns a URI string DuckDB's `httpfs` (or an
17
- * equivalent engine that fetches its own I/O) can read directly, or
18
- * `undefined` if the key isn't URI-resolvable on this backend and the
19
- * caller must fall back to `read(key)` for the bytes.
20
- *
21
- * Contracts:
22
- * - When defined, the returned URI MUST yield byte-identical content to
23
- * `read(key)`. Callers rely on this for correctness.
24
- * - Backends with a native URI for every key (filesystem: absolute path,
25
- * R2 via `httpfs`: signed URL) may always return a string.
26
- * - Backends without a native URI shape (in-memory) omit the method or
27
- * return `undefined` per call.
28
- * - Mixed-per-query is allowed: some keys in one query may return a URI,
29
- * others may not; the executor branches per key.
30
- */
31
- uri?: (key: string) => string | undefined;
32
- /**
33
- * Optional — probe the byte size of a key without reading it. Used by
34
- * the engine to fill in `WriteResult.bytes` when a codec reports 0 or
35
- * unknown but the file is non-trivial.
36
- */
37
- head?: (key: string) => Promise<{
38
- bytes: number;
39
- } | undefined>;
40
- /**
41
- * Optional streaming variant of `list`. Implementations that page
42
- * backing-store results (R2, S3) should implement this and yield keys
43
- * lazily. `list` may return up to an adapter-defined cap (typically
44
- * 10k keys); callers iterating full tenant space must prefer
45
- * `streamList` when available, or chunk by narrower prefixes.
46
- */
47
- streamList?: (prefix: string) => AsyncIterable<string>;
48
- }
1
+ import { N as TableName$1, a as DataSource, w as Row$1 } from "./_chunks/storage.mjs";
2
+ import { t as ColumnDef } from "./_chunks/schema.mjs";
3
+ import { TenantCtx } from "gscdump/contracts";
49
4
  interface RollupCtx extends TenantCtx {
50
5
  /** When the rollup was built. Stamped into payload + filename. */
51
6
  builtAt: number;
@@ -58,14 +13,14 @@ interface RollupEngine {
58
13
  runSQL: (opts: {
59
14
  ctx: TenantCtx;
60
15
  fileSets: Record<string, {
61
- table: TableName;
16
+ table: TableName$1;
62
17
  partitions?: string[];
63
18
  }>;
64
- table?: TableName;
19
+ table?: TableName$1;
65
20
  sql: string;
66
21
  params?: unknown[];
67
22
  }) => Promise<{
68
- rows: Row[];
23
+ rows: Row$1[];
69
24
  }>;
70
25
  }
71
26
  /**
package/dist/rollups.mjs CHANGED
@@ -1,213 +1,6 @@
1
+ import { createIndexingMetadataStore } from "./entities.mjs";
2
+ import { encodeRowsToParquetFlex } from "./adapters/hyparquet.mjs";
1
3
  import { MS_PER_DAY } from "gscdump";
2
- import { parquetWriteBuffer } from "hyparquet-writer";
3
- import { date, doublePrecision, getTableConfig, integer, pgTable, varchar } from "drizzle-orm/pg-core";
4
- function metricCols() {
5
- return {
6
- clicks: integer("clicks").notNull(),
7
- impressions: integer("impressions").notNull(),
8
- sum_position: doublePrecision("sum_position").notNull()
9
- };
10
- }
11
- const dateCol = () => date("date").notNull();
12
- const drizzleSchema = {
13
- pages: pgTable("pages", {
14
- url: varchar("url").notNull(),
15
- date: dateCol(),
16
- ...metricCols()
17
- }),
18
- keywords: pgTable("keywords", {
19
- query: varchar("query").notNull(),
20
- query_canonical: varchar("query_canonical"),
21
- date: dateCol(),
22
- ...metricCols()
23
- }),
24
- countries: pgTable("countries", {
25
- country: varchar("country").notNull(),
26
- date: dateCol(),
27
- ...metricCols()
28
- }),
29
- devices: pgTable("devices", {
30
- device: varchar("device").notNull(),
31
- date: dateCol(),
32
- ...metricCols()
33
- }),
34
- page_keywords: pgTable("page_keywords", {
35
- url: varchar("url").notNull(),
36
- query: varchar("query").notNull(),
37
- query_canonical: varchar("query_canonical"),
38
- date: dateCol(),
39
- ...metricCols()
40
- }),
41
- search_appearance: pgTable("search_appearance", {
42
- searchAppearance: varchar("searchAppearance").notNull(),
43
- date: dateCol(),
44
- ...metricCols()
45
- })
46
- };
47
- const TABLE_METADATA = {
48
- pages: {
49
- sortKey: ["date", "url"],
50
- version: 1
51
- },
52
- keywords: {
53
- sortKey: ["date", "query"],
54
- version: 2
55
- },
56
- countries: {
57
- sortKey: ["date", "country"],
58
- version: 1
59
- },
60
- devices: {
61
- sortKey: ["date", "device"],
62
- version: 1
63
- },
64
- page_keywords: {
65
- sortKey: [
66
- "date",
67
- "url",
68
- "query"
69
- ],
70
- version: 2
71
- },
72
- search_appearance: {
73
- sortKey: ["date", "searchAppearance"],
74
- version: 1
75
- }
76
- };
77
- function pgSqlTypeToColumnType(sqlType) {
78
- const t = sqlType.toLowerCase();
79
- if (t.startsWith("varchar") || t === "text" || t.startsWith("char")) return "VARCHAR";
80
- if (t === "date" || t.startsWith("timestamp")) return "DATE";
81
- if (t.startsWith("double") || t === "real" || t.startsWith("numeric") || t.startsWith("decimal")) return "DOUBLE";
82
- if (t === "bigint" || t === "int8") return "BIGINT";
83
- if (t === "integer" || t === "int" || t === "int4" || t === "smallint" || t === "int2") return "INTEGER";
84
- throw new Error(`unmapped pg type '${sqlType}' — extend pgSqlTypeToColumnType in @gscdump/engine/schema`);
85
- }
86
- function tableSchemaFrom(tableName) {
87
- const columns = getTableConfig(drizzleSchema[tableName]).columns.map((col) => ({
88
- name: col.name,
89
- type: pgSqlTypeToColumnType(col.getSQLType()),
90
- nullable: !col.notNull
91
- }));
92
- const meta = TABLE_METADATA[tableName];
93
- return {
94
- name: tableName,
95
- columns,
96
- sortKey: meta.sortKey,
97
- version: meta.version
98
- };
99
- }
100
- Object.fromEntries([
101
- "pages",
102
- "keywords",
103
- "countries",
104
- "devices",
105
- "page_keywords",
106
- "search_appearance"
107
- ].map((t) => [t, tableSchemaFrom(t)]));
108
- const ROW_GROUP_SIZE = 25e3;
109
- function basicTypeFor(colType) {
110
- if (colType === "VARCHAR" || colType === "DATE") return "STRING";
111
- if (colType === "BIGINT") return "INT64";
112
- if (colType === "INTEGER") return "INT32";
113
- if (colType === "DOUBLE") return "DOUBLE";
114
- throw new Error(`unsupported column type for parquet encoding: ${colType}`);
115
- }
116
- function coerceValue(value, type) {
117
- if (value === null || value === void 0) return null;
118
- if (type === "STRING") return typeof value === "string" ? value : String(value);
119
- if (type === "INT32") {
120
- const n = typeof value === "number" ? value : Number(value);
121
- if (!Number.isFinite(n)) throw new Error(`non-finite number for INT32: ${String(value)}`);
122
- return Math.trunc(n);
123
- }
124
- if (type === "INT64") {
125
- if (typeof value === "bigint") return value;
126
- const n = typeof value === "number" ? value : Number(value);
127
- if (!Number.isFinite(n)) throw new Error(`non-finite number for INT64: ${String(value)}`);
128
- return BigInt(Math.trunc(n));
129
- }
130
- if (type === "DOUBLE") {
131
- const n = typeof value === "number" ? value : Number(value);
132
- if (!Number.isFinite(n)) throw new Error(`non-finite number for DOUBLE: ${String(value)}`);
133
- return n;
134
- }
135
- return value;
136
- }
137
- function compareValues(a, b) {
138
- if (a === b) return 0;
139
- if (a === null || a === void 0) return -1;
140
- if (b === null || b === void 0) return 1;
141
- if (typeof a === "number" && typeof b === "number") return a - b;
142
- return String(a) < String(b) ? -1 : 1;
143
- }
144
- function encodeRowsToParquetFlex(rows, opts) {
145
- const { columns, sortKey = [], rowGroupSize = ROW_GROUP_SIZE } = opts;
146
- const sorted = sortKey.length === 0 || rows.length <= 1 ? rows : [...rows].sort((a, b) => {
147
- for (const col of sortKey) {
148
- const cmp = compareValues(a[col], b[col]);
149
- if (cmp !== 0) return cmp;
150
- }
151
- return 0;
152
- });
153
- const buffer = parquetWriteBuffer({
154
- columnData: columns.map((col) => {
155
- const type = basicTypeFor(col.type);
156
- const data = sorted.map((r) => coerceValue(r[col.name], type));
157
- return {
158
- name: col.name,
159
- data,
160
- type,
161
- nullable: col.nullable,
162
- columnIndex: true
163
- };
164
- }),
165
- rowGroupSize
166
- });
167
- return new Uint8Array(buffer);
168
- }
169
- function hashUrl(url) {
170
- let hi = 2166136261;
171
- let lo = 3421674724;
172
- for (let i = 0; i < url.length; i++) {
173
- const c = url.charCodeAt(i);
174
- lo ^= c;
175
- const loMul = Math.imul(lo, 435) >>> 0;
176
- const carry = Math.floor(lo * 435 / 4294967296);
177
- const hiMul = Math.imul(hi, 435) + Math.imul(lo, 1) + carry >>> 0;
178
- lo = loMul;
179
- hi = hiMul;
180
- }
181
- return (hi >>> 0).toString(16).padStart(8, "0") + (lo >>> 0).toString(16).padStart(8, "0");
182
- }
183
- function indexingMetadataIndexKey(ctx) {
184
- return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/indexing/index.json` : `u_${ctx.userId}/entities/indexing/index.json`;
185
- }
186
- function createIndexingMetadataStore(opts) {
187
- const ds = opts.dataSource;
188
- const hash = opts.hash ?? hashUrl;
189
- async function readIndex(key) {
190
- return await ds.read(key).then((bytes) => JSON.parse(new TextDecoder().decode(bytes)), () => ({
191
- version: 1,
192
- records: {}
193
- }));
194
- }
195
- return {
196
- async writeBatch(ctx, records) {
197
- if (records.length === 0) return;
198
- const key = indexingMetadataIndexKey(ctx);
199
- const index = await readIndex(key);
200
- for (const r of records) index.records[hash(r.url)] = r;
201
- await ds.write(key, new TextEncoder().encode(JSON.stringify(index)));
202
- },
203
- async loadIndex(ctx) {
204
- return readIndex(indexingMetadataIndexKey(ctx));
205
- },
206
- async getLatest(ctx, url) {
207
- return (await readIndex(indexingMetadataIndexKey(ctx))).records[hash(url)];
208
- }
209
- };
210
- }
211
4
  function rollupPrefix(ctx) {
212
5
  return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/rollups` : `u_${ctx.userId}/rollups`;
213
6
  }