@gscdump/engine 0.6.1 → 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/dist/_chunks/compiler.mjs +288 -0
  2. package/dist/_chunks/duckdb.d.mts +26 -0
  3. package/dist/_chunks/engine.mjs +578 -0
  4. package/dist/_chunks/pg-adapter.mjs +676 -0
  5. package/dist/_chunks/planner.d.mts +15 -0
  6. package/dist/_chunks/schema.d.mts +1258 -0
  7. package/dist/_chunks/schema.mjs +139 -0
  8. package/dist/_chunks/storage.d.mts +476 -0
  9. package/dist/_chunks/storage.mjs +39 -0
  10. package/dist/_chunks/types.d.mts +53 -0
  11. package/dist/adapters/duckdb-node.d.mts +1 -13
  12. package/dist/adapters/duckdb-node.mjs +1 -7
  13. package/dist/adapters/filesystem.d.mts +1 -193
  14. package/dist/adapters/filesystem.mjs +2 -9
  15. package/dist/adapters/http.d.mts +1 -193
  16. package/dist/adapters/http.mjs +1 -5
  17. package/dist/adapters/hyparquet.d.mts +6 -83
  18. package/dist/adapters/hyparquet.mjs +1 -105
  19. package/dist/adapters/inspection-sqlite-browser.d.mts +1 -7
  20. package/dist/adapters/inspection-sqlite-node.d.mts +1 -7
  21. package/dist/adapters/inspection-sqlite-node.mjs +1 -1
  22. package/dist/adapters/node-harness.d.mts +3 -306
  23. package/dist/adapters/node-harness.mjs +4 -1866
  24. package/dist/adapters/r2-manifest.d.mts +4 -149
  25. package/dist/adapters/r2-manifest.mjs +1 -8
  26. package/dist/adapters/r2.d.mts +1 -47
  27. package/dist/contracts.d.mts +1 -435
  28. package/dist/entities.d.mts +1 -47
  29. package/dist/index.d.mts +8 -1844
  30. package/dist/index.mjs +8 -1962
  31. package/dist/ingest.d.mts +1 -1
  32. package/dist/planner.d.mts +3 -16
  33. package/dist/planner.mjs +1 -320
  34. package/dist/resolver/index.d.mts +3 -51
  35. package/dist/resolver/index.mjs +2 -780
  36. package/dist/rollups.d.mts +6 -51
  37. package/dist/rollups.mjs +2 -209
  38. package/dist/schema.d.mts +2 -1258
  39. package/dist/schema.mjs +1 -138
  40. package/package.json +2 -2
package/dist/rollups.d.mts CHANGED
@@ -1,51 +1,6 @@
1
- import { ColumnDef, Row, TableName, TenantCtx } from "gscdump/contracts";
2
- interface DataSource {
3
- read: (key: string, range?: {
4
- offset: number;
5
- length: number;
6
- }, signal?: AbortSignal) => Promise<Uint8Array>;
7
- write: (key: string, bytes: Uint8Array) => Promise<void>;
8
- delete: (keys: string[]) => Promise<void>;
9
- /**
10
- * One-shot listing under a prefix. Implementations may cap the number of
11
- * returned keys (typically 10k) — callers iterating full tenant space
12
- * should prefer `streamList` when available or narrow the prefix.
13
- */
14
- list: (prefix: string) => Promise<string[]>;
15
- /**
16
- * Per-key URI probe. Returns a URI string DuckDB's `httpfs` (or an
17
- * equivalent engine that fetches its own I/O) can read directly, or
18
- * `undefined` if the key isn't URI-resolvable on this backend and the
19
- * caller must fall back to `read(key)` for the bytes.
20
- *
21
- * Contracts:
22
- * - When defined, the returned URI MUST yield byte-identical content to
23
- * `read(key)`. Callers rely on this for correctness.
24
- * - Backends with a native URI for every key (filesystem: absolute path,
25
- * R2 via `httpfs`: signed URL) may always return a string.
26
- * - Backends without a native URI shape (in-memory) omit the method or
27
- * return `undefined` per call.
28
- * - Mixed-per-query is allowed: some keys in one query may return a URI,
29
- * others may not; the executor branches per key.
30
- */
31
- uri?: (key: string) => string | undefined;
32
- /**
33
- * Optional — probe the byte size of a key without reading it. Used by
34
- * the engine to fill in `WriteResult.bytes` when a codec reports 0 or
35
- * unknown but the file is non-trivial.
36
- */
37
- head?: (key: string) => Promise<{
38
- bytes: number;
39
- } | undefined>;
40
- /**
41
- * Optional streaming variant of `list`. Implementations that page
42
- * backing-store results (R2, S3) should implement this and yield keys
43
- * lazily. `list` may return up to an adapter-defined cap (typically
44
- * 10k keys); callers iterating full tenant space must prefer
45
- * `streamList` when available, or chunk by narrower prefixes.
46
- */
47
- streamList?: (prefix: string) => AsyncIterable<string>;
48
- }
1
+ import { N as TableName$1, a as DataSource, w as Row$1 } from "./_chunks/storage.mjs";
2
+ import { t as ColumnDef } from "./_chunks/schema.mjs";
3
+ import { TenantCtx } from "gscdump/contracts";
49
4
  interface RollupCtx extends TenantCtx {
50
5
  /** When the rollup was built. Stamped into payload + filename. */
51
6
  builtAt: number;
@@ -58,14 +13,14 @@ interface RollupEngine {
58
13
  runSQL: (opts: {
59
14
  ctx: TenantCtx;
60
15
  fileSets: Record<string, {
61
- table: TableName;
16
+ table: TableName$1;
62
17
  partitions?: string[];
63
18
  }>;
64
- table?: TableName;
19
+ table?: TableName$1;
65
20
  sql: string;
66
21
  params?: unknown[];
67
22
  }) => Promise<{
68
- rows: Row[];
23
+ rows: Row$1[];
69
24
  }>;
70
25
  }
71
26
  /**
package/dist/rollups.mjs CHANGED
@@ -1,213 +1,6 @@
1
+ import { createIndexingMetadataStore } from "./entities.mjs";
2
+ import { encodeRowsToParquetFlex } from "./adapters/hyparquet.mjs";
1
3
  import { MS_PER_DAY } from "gscdump";
2
- import { parquetWriteBuffer } from "hyparquet-writer";
3
- import { date, doublePrecision, getTableConfig, integer, pgTable, varchar } from "drizzle-orm/pg-core";
4
- function metricCols() {
5
- return {
6
- clicks: integer("clicks").notNull(),
7
- impressions: integer("impressions").notNull(),
8
- sum_position: doublePrecision("sum_position").notNull()
9
- };
10
- }
11
- const dateCol = () => date("date").notNull();
12
- const drizzleSchema = {
13
- pages: pgTable("pages", {
14
- url: varchar("url").notNull(),
15
- date: dateCol(),
16
- ...metricCols()
17
- }),
18
- keywords: pgTable("keywords", {
19
- query: varchar("query").notNull(),
20
- query_canonical: varchar("query_canonical"),
21
- date: dateCol(),
22
- ...metricCols()
23
- }),
24
- countries: pgTable("countries", {
25
- country: varchar("country").notNull(),
26
- date: dateCol(),
27
- ...metricCols()
28
- }),
29
- devices: pgTable("devices", {
30
- device: varchar("device").notNull(),
31
- date: dateCol(),
32
- ...metricCols()
33
- }),
34
- page_keywords: pgTable("page_keywords", {
35
- url: varchar("url").notNull(),
36
- query: varchar("query").notNull(),
37
- query_canonical: varchar("query_canonical"),
38
- date: dateCol(),
39
- ...metricCols()
40
- }),
41
- search_appearance: pgTable("search_appearance", {
42
- searchAppearance: varchar("searchAppearance").notNull(),
43
- date: dateCol(),
44
- ...metricCols()
45
- })
46
- };
47
- const TABLE_METADATA = {
48
- pages: {
49
- sortKey: ["date", "url"],
50
- version: 1
51
- },
52
- keywords: {
53
- sortKey: ["date", "query"],
54
- version: 2
55
- },
56
- countries: {
57
- sortKey: ["date", "country"],
58
- version: 1
59
- },
60
- devices: {
61
- sortKey: ["date", "device"],
62
- version: 1
63
- },
64
- page_keywords: {
65
- sortKey: [
66
- "date",
67
- "url",
68
- "query"
69
- ],
70
- version: 2
71
- },
72
- search_appearance: {
73
- sortKey: ["date", "searchAppearance"],
74
- version: 1
75
- }
76
- };
77
- function pgSqlTypeToColumnType(sqlType) {
78
- const t = sqlType.toLowerCase();
79
- if (t.startsWith("varchar") || t === "text" || t.startsWith("char")) return "VARCHAR";
80
- if (t === "date" || t.startsWith("timestamp")) return "DATE";
81
- if (t.startsWith("double") || t === "real" || t.startsWith("numeric") || t.startsWith("decimal")) return "DOUBLE";
82
- if (t === "bigint" || t === "int8") return "BIGINT";
83
- if (t === "integer" || t === "int" || t === "int4" || t === "smallint" || t === "int2") return "INTEGER";
84
- throw new Error(`unmapped pg type '${sqlType}' — extend pgSqlTypeToColumnType in @gscdump/engine/schema`);
85
- }
86
- function tableSchemaFrom(tableName) {
87
- const columns = getTableConfig(drizzleSchema[tableName]).columns.map((col) => ({
88
- name: col.name,
89
- type: pgSqlTypeToColumnType(col.getSQLType()),
90
- nullable: !col.notNull
91
- }));
92
- const meta = TABLE_METADATA[tableName];
93
- return {
94
- name: tableName,
95
- columns,
96
- sortKey: meta.sortKey,
97
- version: meta.version
98
- };
99
- }
100
- Object.fromEntries([
101
- "pages",
102
- "keywords",
103
- "countries",
104
- "devices",
105
- "page_keywords",
106
- "search_appearance"
107
- ].map((t) => [t, tableSchemaFrom(t)]));
108
- const ROW_GROUP_SIZE = 25e3;
109
- function basicTypeFor(colType) {
110
- if (colType === "VARCHAR" || colType === "DATE") return "STRING";
111
- if (colType === "BIGINT") return "INT64";
112
- if (colType === "INTEGER") return "INT32";
113
- if (colType === "DOUBLE") return "DOUBLE";
114
- throw new Error(`unsupported column type for parquet encoding: ${colType}`);
115
- }
116
- function coerceValue(value, type) {
117
- if (value === null || value === void 0) return null;
118
- if (type === "STRING") return typeof value === "string" ? value : String(value);
119
- if (type === "INT32") {
120
- const n = typeof value === "number" ? value : Number(value);
121
- if (!Number.isFinite(n)) throw new Error(`non-finite number for INT32: ${String(value)}`);
122
- return Math.trunc(n);
123
- }
124
- if (type === "INT64") {
125
- if (typeof value === "bigint") return value;
126
- const n = typeof value === "number" ? value : Number(value);
127
- if (!Number.isFinite(n)) throw new Error(`non-finite number for INT64: ${String(value)}`);
128
- return BigInt(Math.trunc(n));
129
- }
130
- if (type === "DOUBLE") {
131
- const n = typeof value === "number" ? value : Number(value);
132
- if (!Number.isFinite(n)) throw new Error(`non-finite number for DOUBLE: ${String(value)}`);
133
- return n;
134
- }
135
- return value;
136
- }
137
- function compareValues(a, b) {
138
- if (a === b) return 0;
139
- if (a === null || a === void 0) return -1;
140
- if (b === null || b === void 0) return 1;
141
- if (typeof a === "number" && typeof b === "number") return a - b;
142
- return String(a) < String(b) ? -1 : 1;
143
- }
144
- function encodeRowsToParquetFlex(rows, opts) {
145
- const { columns, sortKey = [], rowGroupSize = ROW_GROUP_SIZE } = opts;
146
- const sorted = sortKey.length === 0 || rows.length <= 1 ? rows : [...rows].sort((a, b) => {
147
- for (const col of sortKey) {
148
- const cmp = compareValues(a[col], b[col]);
149
- if (cmp !== 0) return cmp;
150
- }
151
- return 0;
152
- });
153
- const buffer = parquetWriteBuffer({
154
- columnData: columns.map((col) => {
155
- const type = basicTypeFor(col.type);
156
- const data = sorted.map((r) => coerceValue(r[col.name], type));
157
- return {
158
- name: col.name,
159
- data,
160
- type,
161
- nullable: col.nullable,
162
- columnIndex: true
163
- };
164
- }),
165
- rowGroupSize
166
- });
167
- return new Uint8Array(buffer);
168
- }
169
- function hashUrl(url) {
170
- let hi = 2166136261;
171
- let lo = 3421674724;
172
- for (let i = 0; i < url.length; i++) {
173
- const c = url.charCodeAt(i);
174
- lo ^= c;
175
- const loMul = Math.imul(lo, 435) >>> 0;
176
- const carry = Math.floor(lo * 435 / 4294967296);
177
- const hiMul = Math.imul(hi, 435) + Math.imul(lo, 1) + carry >>> 0;
178
- lo = loMul;
179
- hi = hiMul;
180
- }
181
- return (hi >>> 0).toString(16).padStart(8, "0") + (lo >>> 0).toString(16).padStart(8, "0");
182
- }
183
- function indexingMetadataIndexKey(ctx) {
184
- return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/indexing/index.json` : `u_${ctx.userId}/entities/indexing/index.json`;
185
- }
186
- function createIndexingMetadataStore(opts) {
187
- const ds = opts.dataSource;
188
- const hash = opts.hash ?? hashUrl;
189
- async function readIndex(key) {
190
- return await ds.read(key).then((bytes) => JSON.parse(new TextDecoder().decode(bytes)), () => ({
191
- version: 1,
192
- records: {}
193
- }));
194
- }
195
- return {
196
- async writeBatch(ctx, records) {
197
- if (records.length === 0) return;
198
- const key = indexingMetadataIndexKey(ctx);
199
- const index = await readIndex(key);
200
- for (const r of records) index.records[hash(r.url)] = r;
201
- await ds.write(key, new TextEncoder().encode(JSON.stringify(index)));
202
- },
203
- async loadIndex(ctx) {
204
- return readIndex(indexingMetadataIndexKey(ctx));
205
- },
206
- async getLatest(ctx, url) {
207
- return (await readIndex(indexingMetadataIndexKey(ctx))).records[hash(url)];
208
- }
209
- };
210
- }
211
4
  function rollupPrefix(ctx) {
212
5
  return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/rollups` : `u_${ctx.userId}/rollups`;
213
6
  }