@gscdump/engine 0.6.1 → 0.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/compiler.mjs +288 -0
- package/dist/_chunks/duckdb.d.mts +26 -0
- package/dist/_chunks/engine.mjs +578 -0
- package/dist/_chunks/pg-adapter.mjs +676 -0
- package/dist/_chunks/planner.d.mts +15 -0
- package/dist/_chunks/schema.d.mts +1258 -0
- package/dist/_chunks/schema.mjs +139 -0
- package/dist/_chunks/storage.d.mts +476 -0
- package/dist/_chunks/storage.mjs +39 -0
- package/dist/_chunks/types.d.mts +53 -0
- package/dist/adapters/duckdb-node.d.mts +1 -13
- package/dist/adapters/duckdb-node.mjs +1 -7
- package/dist/adapters/filesystem.d.mts +1 -193
- package/dist/adapters/filesystem.mjs +2 -9
- package/dist/adapters/http.d.mts +1 -193
- package/dist/adapters/http.mjs +1 -5
- package/dist/adapters/hyparquet.d.mts +6 -83
- package/dist/adapters/hyparquet.mjs +1 -105
- package/dist/adapters/inspection-sqlite-browser.d.mts +1 -7
- package/dist/adapters/inspection-sqlite-node.d.mts +1 -7
- package/dist/adapters/inspection-sqlite-node.mjs +1 -1
- package/dist/adapters/node-harness.d.mts +3 -306
- package/dist/adapters/node-harness.mjs +4 -1866
- package/dist/adapters/r2-manifest.d.mts +4 -149
- package/dist/adapters/r2-manifest.mjs +1 -8
- package/dist/adapters/r2.d.mts +1 -47
- package/dist/contracts.d.mts +1 -435
- package/dist/entities.d.mts +1 -47
- package/dist/index.d.mts +8 -1844
- package/dist/index.mjs +8 -1962
- package/dist/ingest.d.mts +1 -1
- package/dist/planner.d.mts +3 -16
- package/dist/planner.mjs +1 -320
- package/dist/resolver/index.d.mts +3 -51
- package/dist/resolver/index.mjs +2 -780
- package/dist/rollups.d.mts +6 -51
- package/dist/rollups.mjs +2 -209
- package/dist/schema.d.mts +2 -1258
- package/dist/schema.mjs +1 -138
- package/package.json +2 -2
package/dist/rollups.d.mts
CHANGED
|
@@ -1,51 +1,6 @@
|
|
|
1
|
-
import {
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
offset: number;
|
|
5
|
-
length: number;
|
|
6
|
-
}, signal?: AbortSignal) => Promise<Uint8Array>;
|
|
7
|
-
write: (key: string, bytes: Uint8Array) => Promise<void>;
|
|
8
|
-
delete: (keys: string[]) => Promise<void>;
|
|
9
|
-
/**
|
|
10
|
-
* One-shot listing under a prefix. Implementations may cap the number of
|
|
11
|
-
* returned keys (typically 10k) — callers iterating full tenant space
|
|
12
|
-
* should prefer `streamList` when available or narrow the prefix.
|
|
13
|
-
*/
|
|
14
|
-
list: (prefix: string) => Promise<string[]>;
|
|
15
|
-
/**
|
|
16
|
-
* Per-key URI probe. Returns a URI string DuckDB's `httpfs` (or an
|
|
17
|
-
* equivalent engine that fetches its own I/O) can read directly, or
|
|
18
|
-
* `undefined` if the key isn't URI-resolvable on this backend and the
|
|
19
|
-
* caller must fall back to `read(key)` for the bytes.
|
|
20
|
-
*
|
|
21
|
-
* Contracts:
|
|
22
|
-
* - When defined, the returned URI MUST yield byte-identical content to
|
|
23
|
-
* `read(key)`. Callers rely on this for correctness.
|
|
24
|
-
* - Backends with a native URI for every key (filesystem: absolute path,
|
|
25
|
-
* R2 via `httpfs`: signed URL) may always return a string.
|
|
26
|
-
* - Backends without a native URI shape (in-memory) omit the method or
|
|
27
|
-
* return `undefined` per call.
|
|
28
|
-
* - Mixed-per-query is allowed: some keys in one query may return a URI,
|
|
29
|
-
* others may not; the executor branches per key.
|
|
30
|
-
*/
|
|
31
|
-
uri?: (key: string) => string | undefined;
|
|
32
|
-
/**
|
|
33
|
-
* Optional — probe the byte size of a key without reading it. Used by
|
|
34
|
-
* the engine to fill in `WriteResult.bytes` when a codec reports 0 or
|
|
35
|
-
* unknown but the file is non-trivial.
|
|
36
|
-
*/
|
|
37
|
-
head?: (key: string) => Promise<{
|
|
38
|
-
bytes: number;
|
|
39
|
-
} | undefined>;
|
|
40
|
-
/**
|
|
41
|
-
* Optional streaming variant of `list`. Implementations that page
|
|
42
|
-
* backing-store results (R2, S3) should implement this and yield keys
|
|
43
|
-
* lazily. `list` may return up to an adapter-defined cap (typically
|
|
44
|
-
* 10k keys); callers iterating full tenant space must prefer
|
|
45
|
-
* `streamList` when available, or chunk by narrower prefixes.
|
|
46
|
-
*/
|
|
47
|
-
streamList?: (prefix: string) => AsyncIterable<string>;
|
|
48
|
-
}
|
|
1
|
+
import { N as TableName$1, a as DataSource, w as Row$1 } from "./_chunks/storage.mjs";
|
|
2
|
+
import { t as ColumnDef } from "./_chunks/schema.mjs";
|
|
3
|
+
import { TenantCtx } from "gscdump/contracts";
|
|
49
4
|
interface RollupCtx extends TenantCtx {
|
|
50
5
|
/** When the rollup was built. Stamped into payload + filename. */
|
|
51
6
|
builtAt: number;
|
|
@@ -58,14 +13,14 @@ interface RollupEngine {
|
|
|
58
13
|
runSQL: (opts: {
|
|
59
14
|
ctx: TenantCtx;
|
|
60
15
|
fileSets: Record<string, {
|
|
61
|
-
table: TableName;
|
|
16
|
+
table: TableName$1;
|
|
62
17
|
partitions?: string[];
|
|
63
18
|
}>;
|
|
64
|
-
table?: TableName;
|
|
19
|
+
table?: TableName$1;
|
|
65
20
|
sql: string;
|
|
66
21
|
params?: unknown[];
|
|
67
22
|
}) => Promise<{
|
|
68
|
-
rows: Row[];
|
|
23
|
+
rows: Row$1[];
|
|
69
24
|
}>;
|
|
70
25
|
}
|
|
71
26
|
/**
|
package/dist/rollups.mjs
CHANGED
|
@@ -1,213 +1,6 @@
|
|
|
1
|
+
import { createIndexingMetadataStore } from "./entities.mjs";
|
|
2
|
+
import { encodeRowsToParquetFlex } from "./adapters/hyparquet.mjs";
|
|
1
3
|
import { MS_PER_DAY } from "gscdump";
|
|
2
|
-
import { parquetWriteBuffer } from "hyparquet-writer";
|
|
3
|
-
import { date, doublePrecision, getTableConfig, integer, pgTable, varchar } from "drizzle-orm/pg-core";
|
|
4
|
-
function metricCols() {
|
|
5
|
-
return {
|
|
6
|
-
clicks: integer("clicks").notNull(),
|
|
7
|
-
impressions: integer("impressions").notNull(),
|
|
8
|
-
sum_position: doublePrecision("sum_position").notNull()
|
|
9
|
-
};
|
|
10
|
-
}
|
|
11
|
-
const dateCol = () => date("date").notNull();
|
|
12
|
-
const drizzleSchema = {
|
|
13
|
-
pages: pgTable("pages", {
|
|
14
|
-
url: varchar("url").notNull(),
|
|
15
|
-
date: dateCol(),
|
|
16
|
-
...metricCols()
|
|
17
|
-
}),
|
|
18
|
-
keywords: pgTable("keywords", {
|
|
19
|
-
query: varchar("query").notNull(),
|
|
20
|
-
query_canonical: varchar("query_canonical"),
|
|
21
|
-
date: dateCol(),
|
|
22
|
-
...metricCols()
|
|
23
|
-
}),
|
|
24
|
-
countries: pgTable("countries", {
|
|
25
|
-
country: varchar("country").notNull(),
|
|
26
|
-
date: dateCol(),
|
|
27
|
-
...metricCols()
|
|
28
|
-
}),
|
|
29
|
-
devices: pgTable("devices", {
|
|
30
|
-
device: varchar("device").notNull(),
|
|
31
|
-
date: dateCol(),
|
|
32
|
-
...metricCols()
|
|
33
|
-
}),
|
|
34
|
-
page_keywords: pgTable("page_keywords", {
|
|
35
|
-
url: varchar("url").notNull(),
|
|
36
|
-
query: varchar("query").notNull(),
|
|
37
|
-
query_canonical: varchar("query_canonical"),
|
|
38
|
-
date: dateCol(),
|
|
39
|
-
...metricCols()
|
|
40
|
-
}),
|
|
41
|
-
search_appearance: pgTable("search_appearance", {
|
|
42
|
-
searchAppearance: varchar("searchAppearance").notNull(),
|
|
43
|
-
date: dateCol(),
|
|
44
|
-
...metricCols()
|
|
45
|
-
})
|
|
46
|
-
};
|
|
47
|
-
const TABLE_METADATA = {
|
|
48
|
-
pages: {
|
|
49
|
-
sortKey: ["date", "url"],
|
|
50
|
-
version: 1
|
|
51
|
-
},
|
|
52
|
-
keywords: {
|
|
53
|
-
sortKey: ["date", "query"],
|
|
54
|
-
version: 2
|
|
55
|
-
},
|
|
56
|
-
countries: {
|
|
57
|
-
sortKey: ["date", "country"],
|
|
58
|
-
version: 1
|
|
59
|
-
},
|
|
60
|
-
devices: {
|
|
61
|
-
sortKey: ["date", "device"],
|
|
62
|
-
version: 1
|
|
63
|
-
},
|
|
64
|
-
page_keywords: {
|
|
65
|
-
sortKey: [
|
|
66
|
-
"date",
|
|
67
|
-
"url",
|
|
68
|
-
"query"
|
|
69
|
-
],
|
|
70
|
-
version: 2
|
|
71
|
-
},
|
|
72
|
-
search_appearance: {
|
|
73
|
-
sortKey: ["date", "searchAppearance"],
|
|
74
|
-
version: 1
|
|
75
|
-
}
|
|
76
|
-
};
|
|
77
|
-
function pgSqlTypeToColumnType(sqlType) {
|
|
78
|
-
const t = sqlType.toLowerCase();
|
|
79
|
-
if (t.startsWith("varchar") || t === "text" || t.startsWith("char")) return "VARCHAR";
|
|
80
|
-
if (t === "date" || t.startsWith("timestamp")) return "DATE";
|
|
81
|
-
if (t.startsWith("double") || t === "real" || t.startsWith("numeric") || t.startsWith("decimal")) return "DOUBLE";
|
|
82
|
-
if (t === "bigint" || t === "int8") return "BIGINT";
|
|
83
|
-
if (t === "integer" || t === "int" || t === "int4" || t === "smallint" || t === "int2") return "INTEGER";
|
|
84
|
-
throw new Error(`unmapped pg type '${sqlType}' — extend pgSqlTypeToColumnType in @gscdump/engine/schema`);
|
|
85
|
-
}
|
|
86
|
-
function tableSchemaFrom(tableName) {
|
|
87
|
-
const columns = getTableConfig(drizzleSchema[tableName]).columns.map((col) => ({
|
|
88
|
-
name: col.name,
|
|
89
|
-
type: pgSqlTypeToColumnType(col.getSQLType()),
|
|
90
|
-
nullable: !col.notNull
|
|
91
|
-
}));
|
|
92
|
-
const meta = TABLE_METADATA[tableName];
|
|
93
|
-
return {
|
|
94
|
-
name: tableName,
|
|
95
|
-
columns,
|
|
96
|
-
sortKey: meta.sortKey,
|
|
97
|
-
version: meta.version
|
|
98
|
-
};
|
|
99
|
-
}
|
|
100
|
-
Object.fromEntries([
|
|
101
|
-
"pages",
|
|
102
|
-
"keywords",
|
|
103
|
-
"countries",
|
|
104
|
-
"devices",
|
|
105
|
-
"page_keywords",
|
|
106
|
-
"search_appearance"
|
|
107
|
-
].map((t) => [t, tableSchemaFrom(t)]));
|
|
108
|
-
const ROW_GROUP_SIZE = 25e3;
|
|
109
|
-
function basicTypeFor(colType) {
|
|
110
|
-
if (colType === "VARCHAR" || colType === "DATE") return "STRING";
|
|
111
|
-
if (colType === "BIGINT") return "INT64";
|
|
112
|
-
if (colType === "INTEGER") return "INT32";
|
|
113
|
-
if (colType === "DOUBLE") return "DOUBLE";
|
|
114
|
-
throw new Error(`unsupported column type for parquet encoding: ${colType}`);
|
|
115
|
-
}
|
|
116
|
-
function coerceValue(value, type) {
|
|
117
|
-
if (value === null || value === void 0) return null;
|
|
118
|
-
if (type === "STRING") return typeof value === "string" ? value : String(value);
|
|
119
|
-
if (type === "INT32") {
|
|
120
|
-
const n = typeof value === "number" ? value : Number(value);
|
|
121
|
-
if (!Number.isFinite(n)) throw new Error(`non-finite number for INT32: ${String(value)}`);
|
|
122
|
-
return Math.trunc(n);
|
|
123
|
-
}
|
|
124
|
-
if (type === "INT64") {
|
|
125
|
-
if (typeof value === "bigint") return value;
|
|
126
|
-
const n = typeof value === "number" ? value : Number(value);
|
|
127
|
-
if (!Number.isFinite(n)) throw new Error(`non-finite number for INT64: ${String(value)}`);
|
|
128
|
-
return BigInt(Math.trunc(n));
|
|
129
|
-
}
|
|
130
|
-
if (type === "DOUBLE") {
|
|
131
|
-
const n = typeof value === "number" ? value : Number(value);
|
|
132
|
-
if (!Number.isFinite(n)) throw new Error(`non-finite number for DOUBLE: ${String(value)}`);
|
|
133
|
-
return n;
|
|
134
|
-
}
|
|
135
|
-
return value;
|
|
136
|
-
}
|
|
137
|
-
function compareValues(a, b) {
|
|
138
|
-
if (a === b) return 0;
|
|
139
|
-
if (a === null || a === void 0) return -1;
|
|
140
|
-
if (b === null || b === void 0) return 1;
|
|
141
|
-
if (typeof a === "number" && typeof b === "number") return a - b;
|
|
142
|
-
return String(a) < String(b) ? -1 : 1;
|
|
143
|
-
}
|
|
144
|
-
function encodeRowsToParquetFlex(rows, opts) {
|
|
145
|
-
const { columns, sortKey = [], rowGroupSize = ROW_GROUP_SIZE } = opts;
|
|
146
|
-
const sorted = sortKey.length === 0 || rows.length <= 1 ? rows : [...rows].sort((a, b) => {
|
|
147
|
-
for (const col of sortKey) {
|
|
148
|
-
const cmp = compareValues(a[col], b[col]);
|
|
149
|
-
if (cmp !== 0) return cmp;
|
|
150
|
-
}
|
|
151
|
-
return 0;
|
|
152
|
-
});
|
|
153
|
-
const buffer = parquetWriteBuffer({
|
|
154
|
-
columnData: columns.map((col) => {
|
|
155
|
-
const type = basicTypeFor(col.type);
|
|
156
|
-
const data = sorted.map((r) => coerceValue(r[col.name], type));
|
|
157
|
-
return {
|
|
158
|
-
name: col.name,
|
|
159
|
-
data,
|
|
160
|
-
type,
|
|
161
|
-
nullable: col.nullable,
|
|
162
|
-
columnIndex: true
|
|
163
|
-
};
|
|
164
|
-
}),
|
|
165
|
-
rowGroupSize
|
|
166
|
-
});
|
|
167
|
-
return new Uint8Array(buffer);
|
|
168
|
-
}
|
|
169
|
-
function hashUrl(url) {
|
|
170
|
-
let hi = 2166136261;
|
|
171
|
-
let lo = 3421674724;
|
|
172
|
-
for (let i = 0; i < url.length; i++) {
|
|
173
|
-
const c = url.charCodeAt(i);
|
|
174
|
-
lo ^= c;
|
|
175
|
-
const loMul = Math.imul(lo, 435) >>> 0;
|
|
176
|
-
const carry = Math.floor(lo * 435 / 4294967296);
|
|
177
|
-
const hiMul = Math.imul(hi, 435) + Math.imul(lo, 1) + carry >>> 0;
|
|
178
|
-
lo = loMul;
|
|
179
|
-
hi = hiMul;
|
|
180
|
-
}
|
|
181
|
-
return (hi >>> 0).toString(16).padStart(8, "0") + (lo >>> 0).toString(16).padStart(8, "0");
|
|
182
|
-
}
|
|
183
|
-
function indexingMetadataIndexKey(ctx) {
|
|
184
|
-
return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/indexing/index.json` : `u_${ctx.userId}/entities/indexing/index.json`;
|
|
185
|
-
}
|
|
186
|
-
function createIndexingMetadataStore(opts) {
|
|
187
|
-
const ds = opts.dataSource;
|
|
188
|
-
const hash = opts.hash ?? hashUrl;
|
|
189
|
-
async function readIndex(key) {
|
|
190
|
-
return await ds.read(key).then((bytes) => JSON.parse(new TextDecoder().decode(bytes)), () => ({
|
|
191
|
-
version: 1,
|
|
192
|
-
records: {}
|
|
193
|
-
}));
|
|
194
|
-
}
|
|
195
|
-
return {
|
|
196
|
-
async writeBatch(ctx, records) {
|
|
197
|
-
if (records.length === 0) return;
|
|
198
|
-
const key = indexingMetadataIndexKey(ctx);
|
|
199
|
-
const index = await readIndex(key);
|
|
200
|
-
for (const r of records) index.records[hash(r.url)] = r;
|
|
201
|
-
await ds.write(key, new TextEncoder().encode(JSON.stringify(index)));
|
|
202
|
-
},
|
|
203
|
-
async loadIndex(ctx) {
|
|
204
|
-
return readIndex(indexingMetadataIndexKey(ctx));
|
|
205
|
-
},
|
|
206
|
-
async getLatest(ctx, url) {
|
|
207
|
-
return (await readIndex(indexingMetadataIndexKey(ctx))).records[hash(url)];
|
|
208
|
-
}
|
|
209
|
-
};
|
|
210
|
-
}
|
|
211
4
|
function rollupPrefix(ctx) {
|
|
212
5
|
return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/rollups` : `u_${ctx.userId}/rollups`;
|
|
213
6
|
}
|