@uwdata/mosaic-core 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,538 @@
1
+ import { Query, agg, sql } from '@uwdata/mosaic-sql';
2
+ import { MosaicClient } from '../MosaicClient.js';
3
+
/**
 * Sentinel value returned by `indexColumns` when a client cannot be
 * indexed. Its `from` property is NaN rather than a table name.
 */
export const NO_INDEX = { from: Number.NaN };
5
+
/**
 * Determine data cube index columns for a given Mosaic client.
 * @param {MosaicClient} client The Mosaic client.
 * @returns An object with the column data needed to generate data cube
 *  index columns: the base table (`from`), the grouping dimension names
 *  (`dims`), the re-aggregation expressions (`aggr`) and any auxiliary
 *  columns (`aux`). Returns null if an invalid or unsupported expression
 *  is encountered, or NO_INDEX if the client is not indexable.
 */
export function indexColumns(client) {
  if (!client.filterIndexable) return NO_INDEX;

  const query = client.query();
  const from = getBaseTable(query);
  if (typeof from !== 'string' || !query.groupby) return NO_INDEX;

  const groupKeys = new Set(query.groupby().map(c => c.column));
  const aggr = []; // output aggregate columns, one single-key object each
  const dims = []; // grouping dimension column names
  const aux = {};  // auxiliary columns needed by aggregates

  // TODO: revisit this DOUBLE cast in the future
  // for now, this sidesteps client-side conversions
  // of bignum and fixed decimal types to JS numbers
  const partialSum = as => agg`SUM("${as}")::DOUBLE`;

  // aggregates that commute directly: re-apply the same operation
  const commute = (as, args, op) => agg`${op}("${as}")`;

  // aggregates that are recomputed from their own output column
  const direct = new Map([
    ['COUNT', partialSum],
    ['SUM', partialSum],
    ['AVG', (as, args) => avgExpr(aux, as, args[0])],
    ['ARG_MAX', (as, args) => argmaxExpr(aux, as, args)],
    ['ARG_MIN', (as, args) => argminExpr(aux, as, args)],
    ['MAX', commute],
    ['MIN', commute],
    ['BIT_AND', commute],
    ['BIT_OR', commute],
    ['BIT_XOR', commute],
    ['BOOL_AND', commute],
    ['BOOL_OR', commute],
    ['PRODUCT', commute]
  ]);

  // variance and regression statistics drop the original aggregate
  // operation in favor of tracking auxiliary sufficient statistics
  const derived = new Map([
    ['VARIANCE', args => varianceExpr(aux, args[0], from)],
    ['VAR_SAMP', args => varianceExpr(aux, args[0], from)],
    ['VAR_POP', args => varianceExpr(aux, args[0], from, false)],
    ['STDDEV', args => agg`SQRT(${varianceExpr(aux, args[0], from)})`],
    ['STDDEV_SAMP', args => agg`SQRT(${varianceExpr(aux, args[0], from)})`],
    ['STDDEV_POP', args => agg`SQRT(${varianceExpr(aux, args[0], from, false)})`],
    ['COVAR_SAMP', args => covarianceExpr(aux, args, from)],
    ['COVAR_POP', args => covarianceExpr(aux, args, from, false)],
    ['CORR', args => corrExpr(aux, args, from)],
    ['REGR_COUNT', args => agg`${regrCountExpr(aux, args)}::DOUBLE`],
    ['REGR_AVGX', args => regrAvgXExpr(aux, args)],
    ['REGR_AVGY', args => regrAvgYExpr(aux, args)],
    ['REGR_SYY', args => regrVarExpr(aux, 0, args, from)],
    ['REGR_SXX', args => regrVarExpr(aux, 1, args, from)],
    ['REGR_SXY', args => covarianceExpr(aux, args, from, null)],
    ['REGR_SLOPE', args => regrSlopeExpr(aux, args, from)],
    ['REGR_INTERCEPT', args => regrInterceptExpr(aux, args, from)],
    ['REGR_R2', args => agg`(${corrExpr(aux, args, from)}) ** 2`]
  ]);

  for (const { as, expr: { aggregate, args } } of query.select()) {
    const op = aggregate?.toUpperCase?.();
    const rewrite = direct.get(op);
    const derive = derived.get(op);

    if (rewrite) {
      aggr.push({ [as]: rewrite(as, args, op) });
    } else if (derive) {
      // register the dropped output column before its statistics are added
      aux[as] = null;
      aggr.push({ [as]: derive(args) });
    } else if (groupKeys.has(as)) {
      // not a recognized aggregate: accept only if it is a dimension
      dims.push(as);
    } else {
      return null; // unsupported aggregate
    }
  }

  return { from, dims, aggr, aux };
}
137
+
/**
 * Build the name of an auxiliary data cube index column, such as a
 * column holding a sufficient statistic.
 * @param {string} type The operation type.
 * @param {...any} args The input column arguments; each one is passed
 *  through `sanitize` and joined to the name with underscores.
 * @returns {string} A name of the form `__type_arg1_arg2__`, or
 *  `__type__` when no arguments are given.
 */
function auxName(type, ...args) {
  const suffix = args.length
    ? `_${args.map(sanitize).join('_')}`
    : '';
  return `__${type}${suffix}__`;
}
149
+
/**
 * Sanitize a table column reference as a "safe" string value to
 * use as part of derived column names.
 *
 * Double quotes are removed and spaces become underscores. Any other
 * character outside `[A-Za-z0-9_]` is replaced by an underscore followed
 * by the hexadecimal value of its UTF-16 code unit, so distinct inputs
 * such as `a + b` and `a - b` keep distinct names. Derived names are
 * interpolated into SQL without quoting elsewhere in this file, so
 * characters such as `(`, `+` or `.` must not survive sanitization.
 * @param {*} col The source data table column. This may be a string,
 *  column reference, SQL expression, or other string-coercible value.
 * @returns {string} The sanitized column name, consisting only of
 *  letters, digits, and underscores.
 */
function sanitize(col) {
  return `${col}`
    .replace(/"/g, '')
    .replace(/ /g, '_')
    .replace(/\W/g, c => `_${c.charCodeAt(0).toString(16)}`);
}
162
+
163
+ /**
164
+ * Identify a single base (source) table of a query.
165
+ * @param {Query} query The input query.
166
+ * @returns {string | undefined | NaN} the base table name, or
167
+ * `undefined` if there is no source table, or `NaN` if the
168
+ * query operates over multiple source tables.
169
+ */
170
+ function getBaseTable(query) {
171
+ const subq = query.subqueries;
172
+
173
+ // select query
174
+ if (query.select) {
175
+ const from = query.from();
176
+ // @ts-ignore
177
+ if (!from.length) return undefined;
178
+ if (subq.length === 0) return from[0].from.table;
179
+ }
180
+
181
+ // handle set operations / subqueries
182
+ const base = getBaseTable(subq[0]);
183
+ for (let i = 1; i < subq.length; ++i) {
184
+ const from = getBaseTable(subq[i]);
185
+ if (from === undefined) continue;
186
+ if (from !== base) return NaN;
187
+ }
188
+ return base;
189
+ }
190
+
/**
 * Generate an expression for calculating counts over data partitions.
 * As a side effect, a column tracking the per-partition count of
 * non-null values is registered on the input *aux* object.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {any} arg Source data table column. This value may be a string,
 *  column reference, SQL expression, or other string-coercible value.
 * @returns An aggregate expression that sums the per-partition counts.
 *  The expression is annotated with a `name` property holding the name
 *  of the auxiliary count column.
 */
function countExpr(aux, arg) {
  const column = auxName('count', arg);
  aux[column] = agg`COUNT(${arg})`;
  const total = agg`SUM(${column})`;
  return total.annotate({ name: column });
}
207
+
/**
 * Generate an expression for calculating averages over data partitions.
 * Partition averages are combined as a mean weighted by the
 * per-partition count of non-null values. As a side effect, that count
 * column is registered on the input *aux* object.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {string} as The output column for the original aggregate.
 * @param {any} arg Source data table column. This value may be a string,
 *  column reference, SQL expression, or other string-coercible value.
 * @returns An aggregate expression for calculating averages over
 *  pre-aggregated data partitions.
 */
function avgExpr(aux, as, arg) {
  const count = countExpr(aux, arg);
  const { name } = count; // name of the per-partition count column
  return agg`(SUM("${as}" * ${name}) / ${count})`;
}
224
+
/**
 * Generate a scalar subquery that computes the average of a column
 * over the whole source table. The value can be used to mean-center data.
 * @param {*} x Source data table column.
 * @param {string} from The source data table name.
 * @returns A scalar aggregate query.
 */
function avg(x, from) {
  const subquery = sql`(SELECT AVG(${x}) FROM "${from}")`;
  return subquery;
}
235
+
/**
 * Generate an expression for calculating argmax over data partitions.
 * As a side effect, a column tracking the per-partition maximum value
 * is registered on the input *aux* object.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {string} as The output column for the original aggregate.
 * @param {any[]} args Source data table columns. The entries may be strings,
 *  column references, SQL expressions, or other string-coercible values.
 *  Only the second entry (the value being maximized) is read.
 * @returns An aggregate expression for calculating argmax over
 *  pre-aggregated data partitions.
 */
function argmaxExpr(aux, as, [, y]) {
  const maxColumn = auxName('max', y);
  aux[maxColumn] = agg`MAX(${y})`;
  return agg`ARG_MAX("${as}", ${maxColumn})`;
}
253
+
/**
 * Generate an expression for calculating argmin over data partitions.
 * As a side effect, a column tracking the per-partition minimum value
 * is registered on the input *aux* object.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {string} as The output column for the original aggregate.
 * @param {any[]} args Source data table columns. The entries may be strings,
 *  column references, SQL expressions, or other string-coercible values.
 *  Only the second entry (the value being minimized) is read.
 * @returns An aggregate expression for calculating argmin over
 *  pre-aggregated data partitions.
 */
function argminExpr(aux, as, [, y]) {
  const minColumn = auxName('min', y);
  aux[minColumn] = agg`MIN(${y})`;
  return agg`ARG_MIN("${as}", ${minColumn})`;
}
271
+
/**
 * Generate an expression for calculating variance over data partitions.
 * This method uses the "textbook" definition of variance (E[X^2] - E[X]^2),
 * applied to mean-centered data to reduce floating point error. Three
 * sufficient statistics are used: the count of non-null values, the
 * residual sum of squares, and the sum of residual (mean-centered) values.
 * As a side effect, columns for these statistics are registered on the
 * input *aux* object.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {*} x The source data table column. This may be a string,
 *  column reference, SQL expression, or other string-coercible value.
 * @param {string} from The source data table name.
 * @param {boolean} [correction=true] A flag for whether a Bessel
 *  correction should be applied to compute the sample variance
 *  rather than the population variance.
 * @returns An aggregate expression for calculating variance over
 *  pre-aggregated data partitions.
 */
function varianceExpr(aux, x, from, correction = true) {
  const count = countExpr(aux, x);
  const sumSqColumn = auxName('rssq', x); // residual sum of squares
  const sumColumn = auxName('rsum', x); // residual sum
  const residual = sql`${x} - ${avg(x, from)}`; // mean-centered value
  aux[sumSqColumn] = agg`SUM((${residual}) ** 2)`;
  aux[sumColumn] = agg`SUM(${residual})`;
  const bessel = correction ? ` - 1` : ''; // Bessel correction
  return agg`(SUM(${sumSqColumn}) - (SUM(${sumColumn}) ** 2 / ${count})) / (${count}${bessel})`;
}
301
+
/**
 * Generate an expression for calculating covariance over data partitions.
 * This method uses mean-centered data to reduce floating point error. Four
 * sufficient statistics are used: the count of non-null value pairs, the
 * sum of residual products, and the residual sums (of mean-centered
 * values) for x and y. As a side effect, columns for these statistics are
 * registered on the input *aux* object.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {any[]} args Source data table columns. The entries may be strings,
 *  column references, SQL expressions, or other string-coercible values.
 * @param {string} from The source data table name.
 * @param {boolean|null} [correction=true] A flag for whether a Bessel
 *  correction should be applied to compute the sample covariance rather
 *  than the population covariance. If null, an expression for the
 *  unnormalized covariance (no division by sample count) is returned.
 * @returns An aggregate expression for calculating covariance over
 *  pre-aggregated data partitions.
 */
function covarianceExpr(aux, args, from, correction = true) {
  const count = regrCountExpr(aux, args);
  const sumXY = regrSumXYExpr(aux, args, from);
  const sumX = regrSumExpr(aux, 1, args, from);
  const sumY = regrSumExpr(aux, 0, args, from);

  let divisor;
  if (correction === null) {
    divisor = ''; // do not divide by count
  } else if (correction) {
    divisor = ` / (${count} - 1)`; // Bessel correction (sample)
  } else {
    divisor = ` / ${count}`; // no correction (population)
  }

  return agg`(${sumXY} - ${sumX} * ${sumY} / ${count})${divisor}`;
}
331
+
/**
 * Generate an expression for calculating Pearson product-moment correlation
 * coefficients over data partitions. This method uses mean-centered data
 * to reduce floating point error. Six sufficient statistics are used: the
 * count of non-null value pairs, the sum of residual products, and both
 * the residual sums and sums of squares for x and y. As a side effect,
 * columns for these statistics are registered on the input *aux* object.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {any[]} args Source data table columns. The entries may be strings,
 *  column references, SQL expressions, or other string-coercible values.
 * @param {string} from The source data table name.
 * @returns An aggregate expression for calculating correlation over
 *  pre-aggregated data partitions.
 */
function corrExpr(aux, args, from) {
  const count = regrCountExpr(aux, args);
  const sumXY = regrSumXYExpr(aux, args, from);
  const sumXX = regrSumSqExpr(aux, 1, args, from);
  const sumYY = regrSumSqExpr(aux, 0, args, from);
  const sumX = regrSumExpr(aux, 1, args, from);
  const sumY = regrSumExpr(aux, 0, args, from);

  // unnormalized variances of x and y over the non-null pairs
  const varX = agg`(${sumXX} - (${sumX} ** 2) / ${count})`;
  const varY = agg`(${sumYY} - (${sumY} ** 2) / ${count})`;

  return agg`(${sumXY} - ${sumX} * ${sumY} / ${count}) / SQRT(${varX} * ${varY})`;
}
359
+
/**
 * Generate an expression for the count of non-null (x, y) pairs. As a side
 * effect, a column tracking the partition-level count of non-null pairs
 * is registered on the input *aux* object.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {any[]} args Source data table columns, in `[y, x]` order. The
 *  entries may be strings, column references, SQL expressions, or other
 *  string-coercible values.
 * @returns An aggregate expression for calculating regression pair counts
 *  over pre-aggregated data partitions. The expression is annotated with
 *  a `name` property holding the name of the auxiliary count column.
 */
function regrCountExpr(aux, [y, x]) {
  const column = auxName('count', y, x);
  aux[column] = agg`REGR_COUNT(${y}, ${x})`;
  const total = agg`SUM(${column})`;
  return total.annotate({ name: column });
}
376
+
/**
 * Generate an expression for calculating sums of residual values for use in
 * covariance and regression queries. Only values corresponding to non-null
 * (x, y) pairs are included. This method uses mean-centered data to reduce
 * floating point error. As a side effect, this method adds a column for
 * partition-level sums to the input *aux* object.
 *
 * The auxiliary column name includes both the summed column and its paired
 * column. The partition-level sum is filtered on the paired column, so two
 * pairings that share a summed column need separate auxiliary columns.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {number} i An index indicating which argument column to sum.
 * @param {any[]} args Source data table columns. The entries may be strings,
 *  column references, SQL expressions, or other string-coercible values.
 * @param {string} from The source data table name.
 * @returns An aggregate expression over pre-aggregated data partitions.
 */
function regrSumExpr(aux, i, args, from) {
  const v = args[i];     // the column being summed
  const o = args[1 - i]; // the paired column, used only for the null filter
  const sum = auxName('rs', v, o);
  aux[sum] = agg`SUM(${v} - ${avg(v, from)}) FILTER (${o} IS NOT NULL)`;
  return agg`SUM(${sum})`;
}
398
+
/**
 * Generate an expression for calculating sums of squared residual values for
 * use in covariance and regression queries. Only values corresponding to
 * non-null (x, y) pairs are included. This method uses mean-centered data to
 * reduce floating point error. As a side effect, this method adds a column
 * for partition-level sums to the input *aux* object.
 *
 * The auxiliary column name includes both the summed column and its paired
 * column. The partition-level sum is filtered on the paired column, so two
 * pairings that share a summed column need separate auxiliary columns.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {number} i An index indicating which argument column to sum.
 * @param {any[]} args Source data table columns. The entries may be strings,
 *  column references, SQL expressions, or other string-coercible values.
 * @param {string} from The source data table name.
 * @returns An aggregate expression over pre-aggregated data partitions.
 */
function regrSumSqExpr(aux, i, args, from) {
  const v = args[i];     // the column whose squared residuals are summed
  const u = args[1 - i]; // the paired column, used only for the null filter
  const ssq = auxName('rss', v, u);
  aux[ssq] = agg`SUM((${v} - ${avg(v, from)}) ** 2) FILTER (${u} IS NOT NULL)`;
  return agg`SUM(${ssq})`;
}
420
+
/**
 * Generate an expression for calculating sums of residual product values
 * for use in covariance and regression queries. The product of the
 * mean-centered x and y values is summed per partition; mean-centering
 * reduces floating point error. As a side effect, a column for the
 * partition-level sums is registered on the input *aux* object.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {any[]} args Source data table columns, in `[y, x]` order. The
 *  entries may be strings, column references, SQL expressions, or other
 *  string-coercible values.
 * @param {string} from The source data table name.
 * @returns An aggregate expression over pre-aggregated data partitions.
 */
function regrSumXYExpr(aux, args, from) {
  const [y, x] = args;
  const column = auxName('sxy', y, x);
  aux[column] = agg`SUM((${x} - ${avg(x, from)}) * (${y} - ${avg(y, from)}))`;
  return agg`SUM(${column})`;
}
440
+
/**
 * Generate an expression for the average x value in a regression context.
 * Partition-level `REGR_AVGX` values are combined as a mean weighted by
 * the per-partition count of non-null pairs. As a side effect, columns
 * for both the pair count and the partition-level averages are registered
 * on the input *aux* object.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {any[]} args Source data table columns, in `[y, x]` order. The
 *  entries may be strings, column references, SQL expressions, or other
 *  string-coercible values.
 * @returns An aggregate expression over pre-aggregated data partitions.
 */
function regrAvgXExpr(aux, args) {
  const [y, x] = args;
  const count = regrCountExpr(aux, args);
  const column = auxName('avg', x, y);
  aux[column] = agg`REGR_AVGX(${y}, ${x})`;
  return agg`(SUM(${column} * ${count.name}) / ${count})`;
}
459
+
/**
 * Generate an expression for the average y value in a regression context.
 * Partition-level `REGR_AVGY` values are combined as a mean weighted by
 * the per-partition count of non-null pairs. As a side effect, columns
 * for both the pair count and the partition-level averages are registered
 * on the input *aux* object.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {any[]} args Source data table columns, in `[y, x]` order. The
 *  entries may be strings, column references, SQL expressions, or other
 *  string-coercible values.
 * @returns An aggregate expression over pre-aggregated data partitions.
 */
function regrAvgYExpr(aux, args) {
  const [y, x] = args;
  const count = regrCountExpr(aux, args);
  const column = auxName('avg', y, x);
  aux[column] = agg`REGR_AVGY(${y}, ${x})`;
  return agg`(SUM(${column} * ${count.name}) / ${count})`;
}
478
+
/**
 * Generate an expression for calculating an unnormalized variance (a sum
 * of squared deviations) over data partitions, for use in covariance and
 * regression queries. Only values corresponding to non-null (x, y) pairs
 * are included. This method uses mean-centered data to reduce floating
 * point error. As a side effect, columns for the partition-level count
 * and sums are registered on the input *aux* object.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {number} i The index of the argument to compute the variance for.
 * @param {any[]} args Source data table columns. The entries may be strings,
 *  column references, SQL expressions, or other string-coercible values.
 * @param {string} from The source data table name.
 * @returns An aggregate expression for calculating variance over
 *  pre-aggregated data partitions.
 */
function regrVarExpr(aux, i, args, from) {
  const count = regrCountExpr(aux, args);
  const residualSum = regrSumExpr(aux, i, args, from);
  const residualSumSq = regrSumSqExpr(aux, i, args, from);
  return agg`(${residualSumSq} - (${residualSum} ** 2 / ${count}))`;
}
500
+
/**
 * Generate an expression for calculating a regression slope. The slope is
 * the unnormalized covariance divided by the unnormalized variance of the
 * x variable. As a side effect, columns for the required sufficient
 * statistics are registered on the input *aux* object.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {any[]} args Source data table columns. The entries may be strings,
 *  column references, SQL expressions, or other string-coercible values.
 * @param {string} from The source data table name.
 * @returns An aggregate expression for calculating regression slopes over
 *  pre-aggregated data partitions.
 */
function regrSlopeExpr(aux, args, from) {
  const covariance = covarianceExpr(aux, args, from, null);
  const varianceX = regrVarExpr(aux, 1, args, from);
  return agg`(${covariance}) / ${varianceX}`;
}
519
+
/**
 * Generate an expression for calculating a regression intercept. The
 * intercept is the average y value minus the regression slope times the
 * average x value. As a side effect, columns for the required sufficient
 * statistics are registered on the input *aux* object.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {any[]} args Source data table columns. The entries may be strings,
 *  column references, SQL expressions, or other string-coercible values.
 * @param {string} from The source data table name.
 * @returns An aggregate expression for calculating regression intercepts
 *  over pre-aggregated data partitions.
 */
function regrInterceptExpr(aux, args, from) {
  const meanX = regrAvgXExpr(aux, args);
  const meanY = regrAvgYExpr(aux, args);
  const slope = regrSlopeExpr(aux, args, from);
  return agg`${meanY} - (${slope}) * ${meanX}`;
}
@@ -0,0 +1,137 @@
1
+ import { SQLExpression } from '@uwdata/mosaic-sql';
2
+ import { MosaicClient } from '../MosaicClient.js';
3
+
/**
 * Base shape for selection clause metadata, which can guide possible
 * query optimizations. Sub-interfaces add details that are specific to
 * a given selection type.
 */
export interface ClauseMetadata {
  /** The selection type, such as `'point'`, `'interval'`, or `'match'`. */
  type: string;
}
15
+
/**
 * Metadata for a selection clause that selects one or more discrete
 * point values, typically through equality or distinctness checks.
 */
export interface PointMetadata extends ClauseMetadata {
  /** Identifies this metadata as describing a point selection. */
  type: 'point';
}
23
+
/**
 * Text search matching methods. The trailing `string & {}` member admits
 * any other string while keeping the named methods visible as literals.
 */
export type MatchMethod = 'contains' | 'prefix' | 'suffix' | 'regexp' | (string & {});
31
+
/**
 * Selection clause metadata indicating text search matching.
 */
export interface MatchMetadata extends ClauseMetadata {
  /**
   * Identifies this metadata as describing a text match selection.
   * This is a literal tag, like the `type` of the other metadata
   * interfaces, so that the selection type can be told apart by `type`.
   */
  type: 'match';
  /** The text search matching method used. */
  method?: MatchMethod;
}
40
+
/** The supported quantitative scale types. */
export type ScaleType =
  | 'identity' | 'linear'
  | 'log' | 'sqrt' | 'pow' | 'symlog'
  | 'time' | 'utc';
51
+
/** A data value interval extent: a pair of numbers or a pair of dates. */
export type Extent =
  | [number, number]
  | [Date, Date];
54
+
/**
 * Describes a scale that maps a data domain to screen pixels.
 */
export interface Scale {
  /** The scale type, such as `'linear'`, `'log'`, etc. */
  type: ScaleType;

  /** The scale domain, given as start and end data values. */
  domain: Extent;

  /**
   * The scale range, given as start and end screen pixels.
   * The range may be omitted for *identity* scales.
   */
  range?: [number, number];

  /** The base of the logarithm. Applies to `'log'` scales only. */
  base?: number;

  /** The constant parameter. Applies to `'symlog'` scales only. */
  constant?: number;

  /** The exponent parameter. Applies to `'pow'` scales only. */
  exponent?: number;
}
75
+
/** The name of a binning method. */
export type BinMethod =
  | 'floor'
  | 'ceil'
  | 'round';
78
+
/**
 * Metadata for a selection clause over one or more selected intervals.
 * It can be used to determine data-space binning schemes that
 * correspond to pixel-level bins in screen space.
 */
export interface IntervalMetadata extends ClauseMetadata {
  /** Identifies this metadata as describing an interval selection. */
  type: 'interval';

  /**
   * The interactive pixel size used by the generating component.
   * Values larger than one indicate intervals that "snap-to" values
   * greater than a single pixel. If unspecified, assumed to be `1`.
   */
  pixelSize?: number;

  /**
   * One or more scale descriptors that describe the mapping from
   * data values to screen pixels.
   */
  scales?: Scale[];

  /**
   * A hint for the binning method to use when discretizing the
   * interval domain. If unspecified, the default is `'floor'`.
   */
  bin?: BinMethod;
}
103
+
/**
 * A selection clause representing filtering criteria
 * to apply within a Mosaic Selection.
 */
export interface SelectionClause {
  /**
   * A unique identifier (according to object equality) for the source
   * component that generated this clause. In many cases this is a
   * reference to the originating component itself.
   */
  source: any;

  /**
   * Mosaic clients associated with this clause that should not be
   * updated when this clause is applied in a cross-filtering context.
   */
  clients?: Set<MosaicClient>;

  /**
   * A selected value associated with this clause. For example, for a 1D
   * interval selection clause the value may be a [lo, hi] array.
   */
  value: any;

  /**
   * A predicate SQL expression suitable for use in a query WHERE clause.
   * The predicate should apply filtering criteria consistent with this
   * clause's *value* property.
   */
  predicate: SQLExpression | null;

  /**
   * Optional clause metadata that varies based on the selection type.
   * The metadata can be used to optimize selection queries, for example
   * by creating pre-aggregated data cubes when applicable.
   */
  meta?: ClauseMetadata;
}