@gscdump/engine 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +53 -0
- package/dist/adapters/duckdb-node.d.mts +19 -0
- package/dist/adapters/duckdb-node.mjs +78 -0
- package/dist/adapters/filesystem.d.mts +206 -0
- package/dist/adapters/filesystem.mjs +320 -0
- package/dist/adapters/http.d.mts +227 -0
- package/dist/adapters/http.mjs +119 -0
- package/dist/adapters/hyparquet.d.mts +107 -0
- package/dist/adapters/hyparquet.mjs +250 -0
- package/dist/adapters/inspection-sqlite-browser.d.mts +9 -0
- package/dist/adapters/inspection-sqlite-browser.mjs +42 -0
- package/dist/adapters/inspection-sqlite-node.d.mts +9 -0
- package/dist/adapters/inspection-sqlite-node.mjs +32 -0
- package/dist/adapters/node-harness.d.mts +334 -0
- package/dist/adapters/node-harness.mjs +1907 -0
- package/dist/adapters/r2-manifest.d.mts +227 -0
- package/dist/adapters/r2-manifest.mjs +355 -0
- package/dist/adapters/r2.d.mts +93 -0
- package/dist/adapters/r2.mjs +65 -0
- package/dist/arrow-utils.d.mts +14 -0
- package/dist/arrow-utils.mjs +8 -0
- package/dist/contracts.d.mts +436 -0
- package/dist/contracts.mjs +1 -0
- package/dist/entities.d.mts +238 -0
- package/dist/entities.mjs +359 -0
- package/dist/index.d.mts +1849 -0
- package/dist/index.mjs +1976 -0
- package/dist/ingest.d.mts +96 -0
- package/dist/ingest.mjs +187 -0
- package/dist/planner.d.mts +16 -0
- package/dist/planner.mjs +321 -0
- package/dist/resolver/index.d.mts +207 -0
- package/dist/resolver/index.mjs +869 -0
- package/dist/rollups.d.mts +207 -0
- package/dist/rollups.mjs +553 -0
- package/dist/schema.d.mts +1258 -0
- package/dist/schema.mjs +139 -0
- package/dist/scope.d.mts +38 -0
- package/dist/scope.mjs +28 -0
- package/dist/snapshot.d.mts +14 -0
- package/dist/snapshot.mjs +1 -0
- package/dist/sql-bind.d.mts +19 -0
- package/dist/sql-bind.mjs +92 -0
- package/dist/sql-fragments.d.mts +21 -0
- package/dist/sql-fragments.mjs +13 -0
- package/package.json +168 -0
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import { Row, TableName } from "gscdump/contracts";
|
|
2
|
+
/**
|
|
3
|
+
* Canonical GSC API dimension order per table. Consumers hitting the raw
|
|
4
|
+
* `searchanalytics.query` endpoint must request dimensions in this order so
|
|
5
|
+
* that `transformGscRow` / `createRowAccumulator` can decode the resulting
|
|
6
|
+
* `keys[]` tuples. Storage-column names (e.g. `page` → `url`) are handled
|
|
7
|
+
* inside `transformGscRow` — this record stays in GSC-API vocabulary.
|
|
8
|
+
*/
|
|
9
|
+
// NOTE: in the runtime table every entry lists `date` as its final dimension,
// preceded by the table's own dimension(s) (`page_keywords` uses `page`, `query`).
declare const TABLE_DIMS: Record<TableName, string[]>;
|
|
10
|
+
/** One row as returned by the GSC `searchanalytics.query` endpoint. */
interface GscApiRow {
  /** Dimension values for this row, in the order the dimensions were requested. */
  keys: Array<string>;
  /** Click count reported for the row. */
  clicks: number;
  /** Impression count reported for the row. */
  impressions: number;
  /** Unused by ingest — the `sum_position` column encodes weighted position. */
  ctr?: number;
  /** Average position as reported by the API; ingest turns it into `sum_position`. */
  position: number;
}
|
|
18
|
+
/** Options shared by the row transform and the row accumulator. */
interface IngestOptions {
  /**
   * Produces the canonical form of a query string, which is stored next to
   * `query` as `query_canonical`. The mapping is site-specific (e.g. synonym
   * groups, stemming). When the callback is omitted, or returns
   * `null`/`undefined`, `query_canonical` is null. Only the `keywords` and
   * `page_keywords` tables use it.
   */
  normalizeQuery?: (query: string) => string | null | undefined;
}
|
|
27
|
+
/**
|
|
28
|
+
* Strip a GSC URL to its pathname. Core analytics stores pages by path so
|
|
29
|
+
* queries don't carry origin-prefix filters.
|
|
30
|
+
*/
|
|
31
|
+
// NOTE: the implementation returns `gscUrl` unchanged when it cannot be parsed
// by `new URL(...)` (for example an already-relative path).
declare function toPath(gscUrl: string): string;
|
|
32
|
+
/**
|
|
33
|
+
* Encode weighted average position as `sum_position`. The raw GSC position
|
|
34
|
+
* is 1-indexed; subtract 1 and weight by impressions so a downstream
|
|
35
|
+
* `SUM(sum_position) / SUM(impressions) + 1` recovers the true mean without
|
|
36
|
+
* ever materialising per-row position values.
|
|
37
|
+
*/
|
|
38
|
+
// NOTE: the implementation computes `(apiPosition - 1) * Math.max(impressions, 1)`,
// so a row with fewer than one impression is weighted as a single impression.
declare function toSumPosition(apiPosition: number, impressions: number): number;
|
|
39
|
+
/**
|
|
40
|
+
* Map one GSC API row into `{ date, row }` for the given table, or null if
|
|
41
|
+
* the row has no keys (GSC occasionally emits empty-keys placeholders).
|
|
42
|
+
*/
|
|
43
|
+
declare function transformGscRow(table: TableName, apiRow: GscApiRow, options?: IngestOptions): { date: string; row: Row } | null;
|
|
47
|
+
/** In-memory buffer that groups transformed GSC rows by table and date. */
interface RowAccumulator {
  /**
   * Adds a batch of GSC API rows. The return value is `false` once the batch
   * has taken the total row count past `maxRows`; from then on every push is
   * a no-op until `drain()` is called.
   */
  push: (table: TableName, rows: readonly GscApiRow[]) => boolean;
  /**
   * Hands back everything accumulated so far as `table → date → rows` and
   * resets internal state, so later pushes behave as on a fresh accumulator.
   */
  drain: () => Map<TableName, Map<string, Array<Row>>>;
  /**
   * Hands back only the buckets whose date is strictly older than the
   * most-recent date seen for their table. Needs `trackDateBoundary` to be
   * enabled; otherwise the result is an empty map. GSC's date-as-dimension
   * queries return rows sorted by date, so a date older than the latest one
   * seen is logically complete within the current job slice and can be
   * flushed mid-job.
   *
   * Buckets that are returned are removed from internal state and
   * `totalRows` shrinks by their size. Latest-date buckets stay put for the
   * final `drain()` at job end.
   */
  drainCompleted: () => Map<TableName, Map<string, Array<Row>>>;
  /** Number of rows held across all tables and dates since the last drain. */
  readonly totalRows: number;
  /** True when the accumulator has overflowed since the last drain. */
  readonly overflowed: boolean;
}
|
|
76
|
+
/** Configuration for `createRowAccumulator`; includes the ingest options. */
interface RowAccumulatorOptions extends IngestOptions {
  /**
   * Soft cap on the number of accumulated rows. Once exceeded, `push` starts
   * returning `false` and dropping rows. Defaults to 500_000 — matches the
   * ~128 MB CF Workers isolate budget at ~200 bytes/row with headroom.
   */
  maxRows?: number;
  /**
   * When enabled, the accumulator remembers the most-recent date seen per
   * table so that `drainCompleted()` can hand back older-date buckets
   * mid-job. Disabled by default, so callers that do not stream-flush skip
   * the bookkeeping entirely.
   *
   * Caller contract: this is only safe when the GSC dimensions include
   * `date`, so that the API returns rows in date-ascending order. Without
   * that ordering, "older than latest" does not imply "complete" and partial
   * buckets would be flushed too early.
   */
  trackDateBoundary?: boolean;
}
|
|
95
|
+
/**
 * Create an empty row accumulator. `options` is forwarded to the row transform
 * on every push; `maxRows` falls back to 500_000 when unset, and date-boundary
 * tracking is active only when `trackDateBoundary` is exactly `true`.
 */
declare function createRowAccumulator(options?: RowAccumulatorOptions): RowAccumulator;
|
|
96
|
+
export { GscApiRow, IngestOptions, RowAccumulator, RowAccumulatorOptions, TABLE_DIMS, createRowAccumulator, toPath, toSumPosition, transformGscRow };
|
package/dist/ingest.mjs
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
// GSC API dimensions to request for each table, in the order the row
// transform expects to find them in `keys[]`. `date` always comes last.
const TABLE_DIMS = {
  pages: ["page", "date"],
  keywords: ["query", "date"],
  countries: ["country", "date"],
  devices: ["device", "date"],
  page_keywords: ["page", "query", "date"],
  search_appearance: ["searchAppearance", "date"],
};
|
|
13
|
+
/**
 * Reduce a GSC page URL to its pathname.
 *
 * @param gscUrl Absolute URL as reported by GSC.
 * @returns The URL's `pathname`, or `gscUrl` itself when it is not parseable
 *   as an absolute URL.
 */
function toPath(gscUrl) {
  let parsed;
  try {
    parsed = new URL(gscUrl);
  } catch {
    // Not an absolute URL: hand the input back untouched.
    return gscUrl;
  }
  return parsed.pathname;
}
|
|
20
|
+
/**
 * Encode a 1-indexed average position as an impression-weighted sum.
 *
 * @param apiPosition Average position as reported by the API (1-indexed).
 * @param impressions Impression count for the row; values below 1 count as 1.
 * @returns `(apiPosition - 1)` multiplied by the weight.
 */
function toSumPosition(apiPosition, impressions) {
  const zeroBased = apiPosition - 1;
  const weight = Math.max(impressions, 1);
  return zeroBased * weight;
}
|
|
23
|
+
/**
 * Convert one GSC API row into the storage row for `table`.
 *
 * @param table Target table; any value other than the five named cases below
 *   is handled with the `page_keywords` layout (`page`, `query`, `date`).
 * @param apiRow Row as returned by the API.
 * @param options Ingest options; `normalizeQuery` is consulted only for the
 *   `keywords` and `page_keywords` layouts.
 * @returns `{ date, row }`, or `null` when the row has no keys.
 */
function transformGscRow(table, apiRow, options = {}) {
  const keys = apiRow.keys;
  if (!keys || keys.length === 0) return null;

  // Missing key slots become empty strings rather than "undefined".
  const keyAt = (index) => String(keys[index] ?? "");
  const canonicalize = (query) => options.normalizeQuery?.(query) ?? null;

  const clicks = apiRow.clicks || 0;
  const impressions = apiRow.impressions || 0;
  const metrics = {
    clicks,
    impressions,
    sum_position: toSumPosition(apiRow.position || 0, impressions),
  };

  let date;
  let dims;
  switch (table) {
    case "pages":
      date = keyAt(1);
      dims = { url: toPath(keyAt(0)) };
      break;
    case "keywords": {
      const query = keyAt(0);
      date = keyAt(1);
      dims = { query, query_canonical: canonicalize(query) };
      break;
    }
    case "countries":
      date = keyAt(1);
      dims = { country: keyAt(0) };
      break;
    case "devices":
      date = keyAt(1);
      dims = { device: keyAt(0) };
      break;
    case "search_appearance":
      date = keyAt(1);
      dims = { searchAppearance: keyAt(0) };
      break;
    default: {
      // page_keywords layout: keys are [page, query, date].
      const query = keyAt(1);
      date = keyAt(2);
      dims = { url: toPath(keyAt(0)), query, query_canonical: canonicalize(query) };
    }
  }

  // Column order: dimension columns, then date, then the three metrics.
  return { date, row: { ...dims, date, ...metrics } };
}
|
|
112
|
+
// Row cap applied when the caller does not pass `maxRows`.
const DEFAULT_MAX_ROWS = 500000;

/**
 * Build an accumulator that buffers transformed rows as `table → date → rows`.
 *
 * @param options Forwarded to `transformGscRow` on every push. `maxRows`
 *   falls back to `DEFAULT_MAX_ROWS`; date-boundary tracking is on only when
 *   `trackDateBoundary === true`.
 * @returns An object exposing `push`, `drain`, `drainCompleted` and the
 *   read-only `totalRows` / `overflowed` getters.
 */
function createRowAccumulator(options = {}) {
  const rowCap = options.maxRows ?? DEFAULT_MAX_ROWS;
  const tracksBoundary = options.trackDateBoundary === true;

  let pending = new Map(); // table → (date → rows)
  const newestDate = new Map(); // table → latest date seen (only when tracking)
  let rowCount = 0;
  let hasOverflowed = false;

  return {
    get totalRows() {
      return rowCount;
    },
    get overflowed() {
      return hasOverflowed;
    },
    push(table, rows) {
      if (hasOverflowed) return false;
      for (const apiRow of rows) {
        const transformed = transformGscRow(table, apiRow, options);
        // Skip key-less placeholders and rows without a usable date.
        if (!transformed || !transformed.date) continue;

        let perDate = pending.get(table);
        if (!perDate) {
          perDate = new Map();
          pending.set(table, perDate);
        }
        let bucket = perDate.get(transformed.date);
        if (!bucket) {
          bucket = [];
          perDate.set(transformed.date, bucket);
        }
        bucket.push(transformed.row);
        rowCount += 1;

        if (tracksBoundary) {
          const seen = newestDate.get(table);
          if (!seen || transformed.date > seen) newestDate.set(table, transformed.date);
        }

        // The row that crosses the cap is kept; the rest of the batch is dropped.
        if (rowCount > rowCap) {
          hasOverflowed = true;
          return false;
        }
      }
      return true;
    },
    drain() {
      const drained = pending;
      pending = new Map();
      newestDate.clear();
      rowCount = 0;
      hasOverflowed = false;
      return drained;
    },
    drainCompleted() {
      const flushed = new Map();
      if (!tracksBoundary) return flushed;
      pending.forEach((perDate, table) => {
        const newest = newestDate.get(table);
        if (!newest) return;
        // Only dates strictly older than the newest one are handed out.
        const finished = [...perDate].filter(([date]) => date < newest);
        if (finished.length === 0) return;
        flushed.set(table, new Map(finished));
        for (const [date, dateRows] of finished) {
          perDate.delete(date);
          rowCount -= dateRows.length;
        }
      });
      return flushed;
    },
  };
}
|
|
187
|
+
export { TABLE_DIMS, createRowAccumulator, toPath, toSumPosition, transformGscRow };
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import { LogicalQueryPlan } from "gscdump/query/plan";
|
|
2
|
+
import { TableName } from "gscdump/contracts";
|
|
3
|
+
import { BuilderState } from "gscdump/query";
|
|
4
|
+
/**
 * List partition keys covering the inclusive `YYYY-MM-DD` range. Each day
 * yields a `daily/…` key, and the first day seen in each week, month and
 * quarter additionally yields `weekly/…`, `monthly/…` and `quarterly/…`.
 * Returns an empty array when `endDate` precedes `startDate`.
 */
declare function enumeratePartitions(startDate: string, endDate: string): string[];
|
|
5
|
+
/** Result of compiling a logical query plan to SQL. */
interface ResolvedQuery {
  /** SQL text; the file list is left as the `filesPlaceholder` token. */
  sql: string;
  /** Bind values for the `?` markers in `sql`, in order. */
  params: Array<unknown>;
  /** Partition keys enumerated from the plan's date range. */
  partitions: Array<string>;
  /** Table the query was compiled against. */
  table: TableName;
  /** Token inside `sql` that stands in for the file list. */
  filesPlaceholder: string;
}
|
|
12
|
+
/** Token the compiler emits inside `read_parquet(...)` where the file list belongs. */
declare const FILES_PLACEHOLDER = "{{FILES}}";
|
|
13
|
+
/**
 * Compile a logical plan into SQL over `read_parquet({{FILES}}, ...)`, together
 * with its bind parameters and the partitions spanned by the plan's date range.
 * `table` defaults to `plan.dataset`.
 */
declare function compileLogicalQueryPlan(plan: LogicalQueryPlan, table?: TableName): ResolvedQuery;
|
|
14
|
+
/**
 * Build a logical plan from builder state (passing `{ regex: true }` to the
 * plan builder) and compile it. `table` defaults to the plan's dataset.
 */
declare function resolveToSQL(state: BuilderState, table?: TableName): ResolvedQuery;
|
|
15
|
+
/**
 * Replace each `{{name}}` placeholder in `sql` with a bracketed list of the
 * single-quoted keys in `sets[name]`; single quotes inside keys are doubled
 * and an empty set becomes `[]`.
 */
declare function substituteNamedFiles(sql: string, sets: Record<string, string[]>): string;
|
|
16
|
+
export { FILES_PLACEHOLDER, type ResolvedQuery, compileLogicalQueryPlan, enumeratePartitions, resolveToSQL, substituteNamedFiles };
|
package/dist/planner.mjs
ADDED
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
import { MS_PER_DAY, toIsoDate } from "gscdump";
|
|
2
|
+
import { date, doublePrecision, getTableConfig, integer, pgTable, varchar } from "drizzle-orm/pg-core";
|
|
3
|
+
import { buildLogicalPlan } from "gscdump/query/plan";
|
|
4
|
+
/**
 * Fresh column builders for the three metric columns every table carries.
 * Called once per table so no builder instance is shared between tables.
 */
function metricCols() {
  const clicks = integer("clicks").notNull();
  const impressions = integer("impressions").notNull();
  const sum_position = doublePrecision("sum_position").notNull();
  return { clicks, impressions, sum_position };
}
|
|
11
|
+
// Fresh NOT NULL `date` column builder; invoked once per table definition.
const dateCol = () => date("date").notNull();
|
|
12
|
+
// Drizzle table definitions, one per dataset. Column order here is the order
// `getTableConfig(...).columns` is read in when the table schema is derived:
// dimension column(s) first, then `date`, then the shared metric columns.
const drizzleSchema = {
  pages: pgTable("pages", { url: varchar("url").notNull(), date: dateCol(), ...metricCols() }),
  keywords: pgTable("keywords", {
    query: varchar("query").notNull(),
    query_canonical: varchar("query_canonical"), // nullable
    date: dateCol(),
    ...metricCols(),
  }),
  countries: pgTable("countries", { country: varchar("country").notNull(), date: dateCol(), ...metricCols() }),
  devices: pgTable("devices", { device: varchar("device").notNull(), date: dateCol(), ...metricCols() }),
  page_keywords: pgTable("page_keywords", {
    url: varchar("url").notNull(),
    query: varchar("query").notNull(),
    query_canonical: varchar("query_canonical"), // nullable
    date: dateCol(),
    ...metricCols(),
  }),
  search_appearance: pgTable("search_appearance", {
    searchAppearance: varchar("searchAppearance").notNull(),
    date: dateCol(),
    ...metricCols(),
  }),
};
|
|
47
|
+
// Per-table sort key and schema version; merged into the derived table schema.
const TABLE_METADATA = {
  pages: { sortKey: ["date", "url"], version: 1 },
  keywords: { sortKey: ["date", "query"], version: 2 },
  countries: { sortKey: ["date", "country"], version: 1 },
  devices: { sortKey: ["date", "device"], version: 1 },
  page_keywords: { sortKey: ["date", "url", "query"], version: 2 },
  search_appearance: { sortKey: ["date", "searchAppearance"], version: 1 },
};
|
|
77
|
+
/**
 * Map a Postgres SQL type name (case-insensitive) onto the engine's column
 * type vocabulary.
 *
 * @param sqlType Type name such as `varchar(255)`, `date` or `double precision`.
 * @returns One of `"VARCHAR"`, `"DATE"`, `"DOUBLE"`, `"BIGINT"`, `"INTEGER"`.
 * @throws Error when the type has no mapping.
 */
function pgSqlTypeToColumnType(sqlType) {
  const lowered = sqlType.toLowerCase();
  const hasPrefix = (...prefixes) => prefixes.some((prefix) => lowered.startsWith(prefix));
  const isAnyOf = (...names) => names.includes(lowered);

  if (hasPrefix("varchar", "char") || isAnyOf("text")) return "VARCHAR";
  // Timestamps are deliberately collapsed onto DATE.
  if (isAnyOf("date") || hasPrefix("timestamp")) return "DATE";
  if (hasPrefix("double", "numeric", "decimal") || isAnyOf("real")) return "DOUBLE";
  if (isAnyOf("bigint", "int8")) return "BIGINT";
  if (isAnyOf("integer", "int", "int4", "smallint", "int2")) return "INTEGER";
  throw new Error(`unmapped pg type '${sqlType}' — extend pgSqlTypeToColumnType in @gscdump/engine/schema`);
}
|
|
86
|
+
/**
 * Derive the engine-level schema of a table from its Drizzle definition.
 *
 * @param tableName Key of `drizzleSchema` / `TABLE_METADATA`.
 * @returns `{ name, columns, sortKey, version }` where each column carries
 *   its name, mapped column type and nullability.
 */
function tableSchemaFrom(tableName) {
  const { columns: drizzleColumns } = getTableConfig(drizzleSchema[tableName]);
  const columns = drizzleColumns.map((column) => {
    const name = column.name;
    const type = pgSqlTypeToColumnType(column.getSQLType());
    const nullable = !column.notNull;
    return { name, type, nullable };
  });
  const { sortKey, version } = TABLE_METADATA[tableName];
  return { name: tableName, columns, sortKey, version };
}
|
|
100
|
+
// Derives the schema of every table at module load. The resulting object is
// discarded here (this looks like bundler residue of a tree-shaken export), but
// the call still throws if any column type is unmapped.
Object.fromEntries(
  ["pages", "keywords", "countries", "devices", "page_keywords", "search_appearance"].map((name) => [
    name,
    tableSchemaFrom(name),
  ]),
);
|
|
108
|
+
/**
 * Translate a GSC dimension name into its storage column name.
 *
 * @param dim Dimension name in GSC-API vocabulary.
 * @param _table Unused; kept for signature compatibility.
 * @returns `"url"` for `page`, `"query_canonical"` for `queryCanonical`,
 *   otherwise `dim` unchanged.
 */
function dimensionToColumn(dim, _table) {
  switch (dim) {
    case "page":
      return "url";
    case "queryCanonical":
      return "query_canonical";
    default:
      return dim;
  }
}
|
|
113
|
+
/** Partition key for a single day, e.g. `daily/2024-01-02`. */
function dayPartition(date) {
  const prefix = "daily";
  return `${prefix}/${date}`;
}

/** Partition key for a calendar month, e.g. `monthly/2024-01`. */
function monthPartition(month) {
  const prefix = "monthly";
  return `${prefix}/${month}`;
}

/** Partition key for a week, identified by the ISO date of its Monday. */
function weekPartition(mondayIsoDate) {
  const prefix = "weekly";
  return `${prefix}/${mondayIsoDate}`;
}

/** Partition key for a quarter, e.g. `quarterly/2024-Q1`. */
function quarterPartition(quarter) {
  const prefix = "quarterly";
  return `${prefix}/${quarter}`;
}
|
|
125
|
+
/**
 * ISO date of the Monday that starts the week containing `isoDate`.
 * All arithmetic is done on UTC midnight.
 *
 * @param isoDate Date in `YYYY-MM-DD` form.
 */
function mondayOfWeek(isoDate) {
  const midnightUtc = Date.parse(`${isoDate}T00:00:00Z`);
  const weekday = new Date(midnightUtc).getUTCDay(); // 0 = Sunday … 6 = Saturday
  // Sunday belongs to the week that began six days earlier.
  const daysSinceMonday = weekday === 0 ? 6 : weekday - 1;
  return toIsoDate(new Date(midnightUtc - daysSinceMonday * MS_PER_DAY));
}
|
|
131
|
+
/**
 * Quarter label for a `YYYY-MM` month string.
 *
 * @param month Month in `YYYY-MM` form.
 * @returns Label such as `2024-Q1`.
 */
function quarterOfMonth(month) {
  const parts = month.split("-");
  const year = Number(parts[0]);
  const monthNumber = Number(parts[1]);
  const quarter = Math.floor((monthNumber - 1) / 3) + 1;
  return `${year}-Q${quarter}`;
}
|
|
135
|
+
// Default day thresholds keyed by tier name.
const DEFAULT_THRESHOLDS = { raw: 7, d7: 30, d30: 90 };
// Bare property reads with no effect; this appears to be bundler residue.
DEFAULT_THRESHOLDS.raw, DEFAULT_THRESHOLDS.d7, DEFAULT_THRESHOLDS.d30;
|
|
141
|
+
/**
 * List every partition key touched by the inclusive date range.
 *
 * For each day, in order, the result holds the `daily/…` key followed by the
 * `weekly/…`, `monthly/…` and `quarterly/…` keys the first time that week,
 * month or quarter is encountered.
 *
 * @param startDate First day, `YYYY-MM-DD`.
 * @param endDate Last day, `YYYY-MM-DD`.
 * @returns Partition keys; empty when `endDate` is before `startDate`.
 */
function enumeratePartitions(startDate, endDate) {
  const DAY_MS = 24 * 60 * 60 * 1000;
  const utcMidnight = (isoDate) => {
    const [year, month, day] = isoDate.split("-").map(Number);
    return Date.UTC(year, month - 1, day);
  };
  const pad2 = (value) => String(value).padStart(2, "0");

  const firstDay = utcMidnight(startDate);
  const lastDay = utcMidnight(endDate);
  const partitions = [];
  if (lastDay < firstDay) return partitions;

  // Week, month and quarter keys carry distinct prefixes, so one set suffices.
  const emitted = new Set();
  const addOnce = (partition) => {
    if (emitted.has(partition)) return;
    emitted.add(partition);
    partitions.push(partition);
  };

  let cursor = firstDay;
  while (cursor <= lastDay) {
    const day = new Date(cursor);
    const isoMonth = `${day.getUTCFullYear()}-${pad2(day.getUTCMonth() + 1)}`;
    const isoDay = `${isoMonth}-${pad2(day.getUTCDate())}`;
    partitions.push(dayPartition(isoDay));
    addOnce(weekPartition(mondayOfWeek(isoDay)));
    addOnce(monthPartition(isoMonth));
    addOnce(quarterPartition(quarterOfMonth(isoMonth)));
    cursor += DAY_MS;
  }
  return partitions;
}
|
|
175
|
+
/**
 * Escape the characters that are special inside a SQL `LIKE` pattern
 * (`\`, `%` and `_`) by prefixing each with a backslash.
 *
 * @param value Literal text to embed in a LIKE pattern.
 */
function escapeLike(value) {
  return value.replace(/[\\%_]/g, (special) => "\\" + special);
}
|
|
178
|
+
// SQL aggregate expression for each metric. `position` reverses the
// `sum_position` encoding; NULLIF guards both ratios against zero impressions.
const METRIC_EXPR = (() => {
  const clicks = "CAST(SUM(clicks) AS DOUBLE)";
  const impressions = "CAST(SUM(impressions) AS DOUBLE)";
  const safeImpressions = "NULLIF(SUM(impressions), 0)";
  return {
    clicks,
    impressions,
    ctr: `${clicks} / ${safeImpressions}`,
    position: `SUM(sum_position) / ${safeImpressions} + 1`,
  };
})();
|
|
184
|
+
/**
 * SQL predicate that holds when `pathExpr` contains at most one `/`.
 * The slash count is the length lost when every `/` is removed.
 *
 * @param pathExpr SQL expression that yields the path.
 */
function topLevelPagePredicateSql(pathExpr) {
  const fullLength = `LENGTH(${pathExpr})`;
  const lengthWithoutSlashes = `LENGTH(REPLACE(${pathExpr}, '/', ''))`;
  return `${fullLength} - ${lengthWithoutSlashes} <= 1`;
}
|
|
187
|
+
// Token emitted inside `read_parquet(...)` in compiled SQL; the caller swaps it
// for the actual file list before execution.
const FILES_PLACEHOLDER = "{{FILES}}";
|
|
188
|
+
/**
 * Turn dimension filters into a parameterised WHERE fragment.
 *
 * @param filters Filters with `dimension`, `operator` and `expression`.
 * @param table Table the filters apply to (forwarded to `dimensionToColumn`).
 * @returns `{ clause, params }`: the conditions joined with `AND` (empty
 *   string when there are none) and their bind values in matching order.
 *   Filters whose operator is not recognised contribute nothing.
 */
function buildDimensionWhere(filters, table) {
  const wildcard = (expression) => `%${escapeLike(expression)}%`;
  // operator → (column, expression) => [sql condition, bind value]
  const builders = new Map([
    ["equals", (column, expression) => [`${column} = ?`, expression]],
    ["notEquals", (column, expression) => [`${column} != ?`, expression]],
    ["contains", (column, expression) => [`${column} LIKE ? ESCAPE '\\'`, wildcard(expression)]],
    ["notContains", (column, expression) => [`${column} NOT LIKE ? ESCAPE '\\'`, wildcard(expression)]],
    ["includingRegex", (column, expression) => [`regexp_matches(${column}, ?)`, expression]],
    ["excludingRegex", (column, expression) => [`NOT regexp_matches(${column}, ?)`, expression]],
  ]);

  const conditions = [];
  const params = [];
  for (const filter of filters) {
    const column = dimensionToColumn(filter.dimension, table);
    const build = builders.get(filter.operator);
    if (!build) continue;
    const [condition, param] = build(column, filter.expression);
    conditions.push(condition);
    params.push(param);
  }
  return { clause: conditions.join(" AND "), params };
}
|
|
225
|
+
/**
 * WHERE fragment for the "top-level pages only" special filter.
 *
 * @returns The predicate over the page column, or `""` when
 *   `plan.specialFilters.topLevel` is falsy.
 */
function buildTopLevelWhere(plan, table) {
  const wantsTopLevel = Boolean(plan.specialFilters.topLevel);
  return wantsTopLevel ? topLevelPagePredicateSql(dimensionToColumn("page", table)) : "";
}
|
|
229
|
+
/**
 * Turn metric filters into a parameterised HAVING clause.
 *
 * @param filters Filters with `metric`, `operator`, `expression` and, for
 *   `metricBetween`, an optional `expression2` (falls back to `expression`).
 * @returns `{ clause, params }`; `clause` is `""` when no condition was
 *   produced, otherwise it starts with `HAVING`.
 */
function buildHaving(filters) {
  if (filters.length === 0) return { clause: "", params: [] };

  const comparators = new Map([
    ["metricGte", ">="],
    ["metricGt", ">"],
    ["metricLte", "<="],
    ["metricLt", "<"],
  ]);

  const conditions = [];
  const params = [];
  for (const filter of filters) {
    const aggregate = METRIC_EXPR[filter.metric];
    const comparator = comparators.get(filter.operator);
    if (comparator !== undefined) {
      conditions.push(`${aggregate} ${comparator} ?`);
      params.push(filter.expression);
    } else if (filter.operator === "metricBetween") {
      // Inclusive range; a missing upper bound collapses to the lower bound.
      conditions.push(`${aggregate} >= ? AND ${aggregate} <= ?`);
      params.push(filter.expression, filter.expression2 ?? filter.expression);
    }
  }

  const clause = conditions.length > 0 ? `HAVING ${conditions.join(" AND ")}` : "";
  return { clause, params };
}
|
|
266
|
+
/**
 * Compile a logical query plan into SQL over `read_parquet({{FILES}}, ...)`.
 *
 * @param plan Logical plan (date range, metrics, group-by dimensions,
 *   filters, ordering and paging).
 * @param table Table to compile against; defaults to `plan.dataset`.
 * @returns `{ sql, params, partitions, table, filesPlaceholder }`. `params`
 *   lists the WHERE binds followed by the HAVING binds.
 *
 * NOTE(review): `orderBy.column`, `orderBy.dir`, `rowLimit` and `startRow`
 * are interpolated into the SQL text rather than bound; presumably the plan
 * builder validates them — confirm before passing untrusted plans.
 */
function compileLogicalQueryPlan(plan, table = plan.dataset) {
  const { startDate, endDate } = plan.dateRange;
  const partitions = enumeratePartitions(startDate, endDate);
  const dateColumns = plan.hasDate ? ["date"] : [];

  // SELECT list: dimensions (aliased back to their API name), date, metrics.
  const metricSelects = plan.metrics.map((metric) => `${METRIC_EXPR[metric]} AS ${metric}`);
  const dimensionColumns = plan.groupByDimensions.map((dimension) => ({
    dimension,
    column: dimensionToColumn(dimension, table),
  }));
  const dimensionSelects = dimensionColumns.map(({ dimension, column }) =>
    column === dimension ? dimension : `${column} AS ${dimension}`,
  );
  const selectList = [...dimensionSelects, ...dateColumns, ...metricSelects].join(", ");

  // WHERE: the date range is always present; dimension and top-level filters are optional.
  const conditions = ["date >= ?", "date <= ?"];
  const whereParams = [startDate, endDate];
  const dimensionWhere = buildDimensionWhere(plan.dimensionFilters, table);
  if (dimensionWhere.clause) {
    conditions.push(dimensionWhere.clause);
    whereParams.push(...dimensionWhere.params);
  }
  const topLevel = buildTopLevelWhere(plan, table);
  if (topLevel) conditions.push(topLevel);

  const having = buildHaving(plan.metricFilters);

  const groupColumns = [...dimensionColumns.map(({ column }) => column), ...dateColumns];
  const groupBy = groupColumns.length > 0 ? `GROUP BY ${groupColumns.join(", ")}` : "";

  let orderBy = "ORDER BY clicks DESC";
  if (plan.orderBy) orderBy = `ORDER BY ${plan.orderBy.column} ${plan.orderBy.dir.toUpperCase()}`;
  const limit = `LIMIT ${plan.rowLimit ?? 1000}`;
  const offset = plan.startRow ? `OFFSET ${plan.startRow}` : "";

  const sql = [
    `SELECT ${selectList}`,
    `FROM read_parquet(${FILES_PLACEHOLDER}, union_by_name = true)`,
    `WHERE ${conditions.join(" AND ")}`,
    groupBy,
    having.clause,
    orderBy,
    limit,
    offset,
  ]
    .filter(Boolean)
    .join(" ")
    .replace(/\s+/g, " ")
    .trim();

  return {
    sql,
    params: [...whereParams, ...having.params],
    partitions,
    table,
    filesPlaceholder: FILES_PLACEHOLDER,
  };
}
|
|
309
|
+
/**
 * Build a logical plan from builder state (with `{ regex: true }`) and compile
 * it to SQL.
 *
 * @param state Query-builder state.
 * @param table Optional table override; defaults to the plan's dataset.
 */
function resolveToSQL(state, table) {
  const logicalPlan = buildLogicalPlan(state, { regex: true });
  const targetTable = table ?? logicalPlan.dataset;
  return compileLogicalQueryPlan(logicalPlan, targetTable);
}
|
|
313
|
+
/**
 * Render file keys as a bracketed SQL list of single-quoted strings.
 * Single quotes inside a key are doubled; an empty input yields `[]`.
 *
 * @param keys File keys to quote.
 */
function fileList(keys) {
  const quoted = keys.map((key) => `'${key.replace(/'/g, "''")}'`);
  return `[${quoted.join(", ")}]`;
}

/**
 * Replace every `{{name}}` placeholder in `sql` with the file list for that
 * name. Sets are applied in `Object.entries` order.
 *
 * Substitution is purely literal: the placeholder name is not treated as a
 * regular expression, and `$` sequences inside file keys (`$&`, `$$`, `$1`, …)
 * are inserted verbatim instead of being read as replacement patterns.
 *
 * @param sql SQL text that contains the placeholders.
 * @param sets Map from placeholder name to file keys.
 * @returns The SQL with all known placeholders substituted.
 */
function substituteNamedFiles(sql, sets) {
  let out = sql;
  for (const [name, keys] of Object.entries(sets)) {
    // split/join sidesteps both regex metacharacters in `name` and
    // `$`-pattern expansion in the replacement text.
    out = out.split(`{{${name}}}`).join(fileList(keys));
  }
  return out;
}
|
|
321
|
+
export { FILES_PLACEHOLDER, compileLogicalQueryPlan, enumeratePartitions, resolveToSQL, substituteNamedFiles };
|