@uwdata/mosaic-core 0.11.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/README.md +1 -1
  2. package/dist/mosaic-core.js +11612 -10855
  3. package/dist/mosaic-core.min.js +7 -7
  4. package/dist/types/Coordinator.d.ts +169 -0
  5. package/dist/types/MosaicClient.d.ts +94 -0
  6. package/dist/types/Param.d.ts +47 -0
  7. package/dist/types/QueryConsolidator.d.ts +9 -0
  8. package/dist/types/QueryManager.d.ts +64 -0
  9. package/dist/types/Selection.d.ts +224 -0
  10. package/dist/types/SelectionClause.d.ts +105 -0
  11. package/dist/types/connectors/rest.d.ts +17 -0
  12. package/dist/types/connectors/socket.d.ts +18 -0
  13. package/dist/types/connectors/wasm.d.ts +16 -0
  14. package/dist/types/index.d.ts +25 -0
  15. package/dist/types/preagg/PreAggregator.d.ts +178 -0
  16. package/dist/types/preagg/preagg-columns.d.ts +14 -0
  17. package/dist/types/preagg/sufficient-statistics.d.ts +13 -0
  18. package/dist/types/util/AsyncDispatch.d.ts +100 -0
  19. package/dist/types/util/cache.d.ts +13 -0
  20. package/dist/types/util/decode-ipc.d.ts +7 -0
  21. package/dist/types/util/distinct.d.ts +2 -0
  22. package/dist/types/util/field-info.d.ts +13 -0
  23. package/dist/types/util/hash.d.ts +1 -0
  24. package/dist/types/util/is-arrow-table.d.ts +8 -0
  25. package/dist/types/util/js-type.d.ts +1 -0
  26. package/dist/types/util/priority-queue.d.ts +37 -0
  27. package/dist/types/util/query-result.d.ts +44 -0
  28. package/dist/types/util/selection-types.d.ts +114 -0
  29. package/dist/types/util/synchronizer.d.ts +29 -0
  30. package/dist/types/util/throttle.d.ts +11 -0
  31. package/dist/types/util/to-data-columns.d.ts +29 -0
  32. package/dist/types/util/void-logger.d.ts +7 -0
  33. package/jsconfig.json +11 -0
  34. package/package.json +10 -8
  35. package/src/Coordinator.js +14 -14
  36. package/src/MosaicClient.js +5 -4
  37. package/src/QueryConsolidator.js +22 -33
  38. package/src/QueryManager.js +76 -45
  39. package/src/Selection.js +8 -5
  40. package/src/SelectionClause.js +19 -22
  41. package/src/connectors/rest.js +3 -1
  42. package/src/connectors/socket.js +3 -1
  43. package/src/connectors/wasm.js +1 -1
  44. package/src/index.js +13 -0
  45. package/src/preagg/PreAggregator.js +407 -0
  46. package/src/preagg/preagg-columns.js +103 -0
  47. package/src/preagg/sufficient-statistics.js +439 -0
  48. package/src/util/field-info.js +16 -5
  49. package/src/util/hash.js +1 -1
  50. package/src/util/query-result.js +44 -2
  51. package/src/util/selection-types.ts +3 -3
  52. package/src/util/throttle.js +11 -9
  53. package/src/util/void-logger.js +6 -5
  54. package/tsconfig.json +11 -0
  55. package/src/DataCubeIndexer.js +0 -378
  56. package/src/util/index-columns.js +0 -537
@@ -1,537 +0,0 @@
1
- import { Query, agg, sql } from '@uwdata/mosaic-sql';
2
- import { MosaicClient } from '../MosaicClient.js';
3
-
4
- /**
5
- * Determine data cube index columns for a given Mosaic client.
6
- * @param {MosaicClient} client The Mosaic client.
7
- * @returns An object with necessary column data to generate data
8
- * cube index columns, or null if the client is not indexable or
9
- * the client query contains an invalid or unsupported expression.
10
- */
11
- export function indexColumns(client) {
12
- if (!client.filterIndexable) return null;
13
- const q = client.query();
14
- const from = getBase(q, q => q.from()?.[0].from.table);
15
-
16
- // bail if no base table or the query is not analyzable
17
- if (typeof from !== 'string' || !q.select) return null;
18
-
19
- const aggr = []; // list of output aggregate columns
20
- const dims = []; // list of grouping dimension columns
21
- const aux = {}; // auxiliary columns needed by aggregates
22
-
23
- const avg = ref => {
24
- const name = ref.column;
25
- // @ts-ignore
26
- const expr = getBase(q, q => q.select().find(c => c.as === name)?.expr);
27
- return `(SELECT AVG(${expr ?? ref}) FROM "${from}")`;
28
- };
29
-
30
- for (const entry of q.select()) {
31
- const { as, expr: { aggregate, args } } = entry;
32
- const op = aggregate?.toUpperCase?.();
33
- switch (op) {
34
- case 'COUNT':
35
- case 'SUM':
36
- // TODO: revisit this DOUBLE cast in the future
37
- // for now, this sidesteps client-side conversions
38
- // of bignum and fixed decimal types to JS numbers
39
- aggr.push({ [as]: agg`SUM("${as}")::DOUBLE` });
40
- break;
41
- case 'AVG':
42
- aggr.push({ [as]: avgExpr(aux, as, args[0]) });
43
- break;
44
- case 'ARG_MAX':
45
- aggr.push({ [as]: argmaxExpr(aux, as, args) });
46
- break;
47
- case 'ARG_MIN':
48
- aggr.push({ [as]: argminExpr(aux, as, args) });
49
- break;
50
-
51
- // variance statistics drop the original aggregate operation
52
- // in favor of tracking auxiliary sufficient statistics
53
- case 'VARIANCE':
54
- case 'VAR_SAMP':
55
- aux[as] = null;
56
- aggr.push({ [as]: varianceExpr(aux, args[0], avg) });
57
- break;
58
- case 'VAR_POP':
59
- aux[as] = null;
60
- aggr.push({ [as]: varianceExpr(aux, args[0], avg, false) });
61
- break;
62
- case 'STDDEV':
63
- case 'STDDEV_SAMP':
64
- aux[as] = null;
65
- aggr.push({ [as]: agg`SQRT(${varianceExpr(aux, args[0], avg)})` });
66
- break;
67
- case 'STDDEV_POP':
68
- aux[as] = null;
69
- aggr.push({ [as]: agg`SQRT(${varianceExpr(aux, args[0], avg, false)})` });
70
- break;
71
- case 'COVAR_SAMP':
72
- aux[as] = null;
73
- aggr.push({ [as]: covarianceExpr(aux, args, avg) });
74
- break;
75
- case 'COVAR_POP':
76
- aux[as] = null;
77
- aggr.push({ [as]: covarianceExpr(aux, args, avg, false) });
78
- break;
79
- case 'CORR':
80
- aux[as] = null;
81
- aggr.push({ [as]: corrExpr(aux, args, avg) });
82
- break;
83
-
84
- // regression statistics
85
- case 'REGR_COUNT':
86
- aux[as] = null;
87
- aggr.push({ [as]: agg`${regrCountExpr(aux, args)}::DOUBLE` });
88
- break;
89
- case 'REGR_AVGX':
90
- aux[as] = null;
91
- aggr.push({ [as]: regrAvgXExpr(aux, args) });
92
- break;
93
- case 'REGR_AVGY':
94
- aux[as] = null;
95
- aggr.push({ [as]: regrAvgYExpr(aux, args) });
96
- break;
97
- case 'REGR_SYY':
98
- aux[as] = null;
99
- aggr.push({ [as]: regrVarExpr(aux, 0, args, avg) });
100
- break;
101
- case 'REGR_SXX':
102
- aux[as] = null;
103
- aggr.push({ [as]: regrVarExpr(aux, 1, args, avg) });
104
- break;
105
- case 'REGR_SXY':
106
- aux[as] = null;
107
- aggr.push({ [as]: covarianceExpr(aux, args, avg, null) });
108
- break;
109
- case 'REGR_SLOPE':
110
- aux[as] = null;
111
- aggr.push({ [as]: regrSlopeExpr(aux, args, avg) });
112
- break;
113
- case 'REGR_INTERCEPT':
114
- aux[as] = null;
115
- aggr.push({ [as]: regrInterceptExpr(aux, args, avg) });
116
- break;
117
- case 'REGR_R2':
118
- aux[as] = null;
119
- aggr.push({ [as]: agg`(${corrExpr(aux, args, avg)}) ** 2` });
120
- break;
121
-
122
- // aggregates that commute directly
123
- case 'MAX':
124
- case 'MIN':
125
- case 'BIT_AND':
126
- case 'BIT_OR':
127
- case 'BIT_XOR':
128
- case 'BOOL_AND':
129
- case 'BOOL_OR':
130
- case 'PRODUCT':
131
- aggr.push({ [as]: agg`${op}("${as}")` });
132
- break;
133
-
134
- // otherwise, check if dimension
135
- default:
136
- if (!aggregate) dims.push(as);
137
- else return null; // unsupported aggregate
138
- }
139
- }
140
-
141
- // bail if the query has no aggregates
142
- if (!aggr.length) return null;
143
-
144
- return { from, dims, aggr, aux };
145
- }
146
-
147
- /**
148
- * Generate an output column name for use as an auxiliary column
149
- * (e.g., for sufficient statistics) within a data cube index.
150
- * @param {string} type The operation type.
151
- * @param {...any} args The input column arguments.
152
- * @returns {string} A sanitized auxiliary column name.
153
- */
154
- function auxName(type, ...args) {
155
- const cols = args.length ? '_' + args.map(sanitize).join('_') : '';
156
- return `__${type}${cols}__`;
157
- }
158
-
159
- /**
160
- * Sanitize a table column reference as a "safe" string value to
161
- * use as part of derived column names.
162
- * @param {*} col The source data table column. This may be a string,
163
- * column reference, SQL expression, or other string-coercible value.
164
- * @returns {string} The sanitized column name.
165
- */
166
- function sanitize(col) {
167
- return `${col}`
168
- .replaceAll('"', '')
169
- .replaceAll(' ', '_');
170
- }
171
-
172
- /**
173
- * Identify a shared base (source) query and extract a value from it.
174
- * This method is used to find a shared base table name or extract
175
- * the original column name within a base table.
176
- * @param {Query} query The input query.
177
- * @param {(q: Query) => any} get A getter function to extract
178
- * a value from a base query.
179
- * @returns {string | undefined | NaN} the base query value, or
180
- * `undefined` if there is no source table, or `NaN` if the
181
- * query operates over multiple source tables.
182
- */
183
- function getBase(query, get) {
184
- const subq = query.subqueries;
185
-
186
- // select query
187
- if (query.select && subq.length === 0) {
188
- return get(query);
189
- }
190
-
191
- // handle set operations / subqueries
192
- const base = getBase(subq[0], get);
193
- for (let i = 1; i < subq.length; ++i) {
194
- const value = getBase(subq[i], get);
195
- if (value === undefined) continue;
196
- if (value !== base) return NaN;
197
- }
198
- return base;
199
- }
200
-
201
- /**
202
- * Generate an expression for calculating counts over data partitions.
203
- * As a side effect, this method adds a column to the input *aux* object
204
- * to track the count of non-null values per-partition.
205
- * @param {object} aux An object for auxiliary columns (such as
206
- * sufficient statistics) to include in the data cube aggregation.
207
- * @param {any} arg Source data table column. This value may be a string,
208
- * column reference, SQL expression, or other string-coercible value.
209
- * @returns An aggregate expression for calculating counts over
210
- * pre-aggregated data partitions.
211
- */
212
- function countExpr(aux, arg) {
213
- const n = auxName('count', arg);
214
- aux[n] = agg`COUNT(${arg})`;
215
- return agg`SUM(${n})`.annotate({ name: n });
216
- }
217
-
218
- /**
219
- * Generate an expression for calculating averages over data partitions.
220
- * As a side effect, this method adds a column to the input *aux* object
221
- * to track the count of non-null values per-partition.
222
- * @param {object} aux An object for auxiliary columns (such as
223
- * sufficient statistics) to include in the data cube aggregation.
224
- * @param {string} as The output column for the original aggregate.
225
- * @param {any} arg Source data table column. This value may be a string,
226
- * column reference, SQL expression, or other string-coercible value.
227
- * @returns An aggregate expression for calculating averages over
228
- * pre-aggregated data partitions.
229
- */
230
- function avgExpr(aux, as, arg) {
231
- const n = countExpr(aux, arg);
232
- return agg`(SUM("${as}" * ${n.name}) / ${n})`;
233
- }
234
-
235
- /**
236
- * Generate an expression for calculating argmax over data partitions.
237
- * As a side effect, this method adds a column to the input *aux* object
238
- * to track a maximum value per-partition.
239
- * @param {object} aux An object for auxiliary columns (such as
240
- * sufficient statistics) to include in the data cube aggregation.
241
- * @param {string} as The output column for the original aggregate.
242
- * @param {any[]} args Source data table columns. The entries may be strings,
243
- * column references, SQL expressions, or other string-coercible values.
244
- * @returns An aggregate expression for calculating argmax over
245
- * pre-aggregated data partitions.
246
- */
247
- function argmaxExpr(aux, as, [, y]) {
248
- const max = auxName('max', y);
249
- aux[max] = agg`MAX(${y})`;
250
- return agg`ARG_MAX("${as}", ${max})`;
251
- }
252
-
253
- /**
254
- * Generate an expression for calculating argmin over data partitions.
255
- * As a side effect, this method adds a column to the input *aux* object
256
- * to track a minimum value per-partition.
257
- * @param {object} aux An object for auxiliary columns (such as
258
- * sufficient statistics) to include in the data cube aggregation.
259
- * @param {string} as The output column for the original aggregate.
260
- * @param {any[]} args Source data table columns. The entries may be strings,
261
- * column references, SQL expressions, or other string-coercible values.
262
- * @returns An aggregate expression for calculating argmin over
263
- * pre-aggregated data partitions.
264
- */
265
- function argminExpr(aux, as, [, y]) {
266
- const min = auxName('min', y);
267
- aux[min] = agg`MIN(${y})`;
268
- return agg`ARG_MIN("${as}", ${min})`;
269
- }
270
-
271
- /**
272
- * Generate an expression for calculating variance over data partitions.
273
- * This method uses the "textbook" definition of variance (E[X^2] - E[X]^2),
274
- * but on mean-centered data to reduce floating point error. The variance
275
- * calculation uses three sufficient statistics: the count of non-null values,
276
- * the residual sum of squares and the sum of residual (mean-centered) values.
277
- * As a side effect, this method adds columns for these statistics to the
278
- * input *aux* object.
279
- * @param {object} aux An object for auxiliary columns (such as
280
- * sufficient statistics) to include in the data cube aggregation.
281
- * @param {*} x The source data table column. This may be a string,
282
- * column reference, SQL expression, or other string-coercible value.
283
- * @param {(field: any) => string} avg Global average query generator.
284
- * @param {boolean} [correction=true] A flag for whether a Bessel
285
- * correction should be applied to compute the sample variance
286
- * rather than the population variance.
287
- * @returns An aggregate expression for calculating variance over
288
- * pre-aggregated data partitions.
289
- */
290
- function varianceExpr(aux, x, avg, correction = true) {
291
- const n = countExpr(aux, x);
292
- const ssq = auxName('rssq', x); // residual sum of squares
293
- const sum = auxName('rsum', x); // residual sum
294
- const delta = sql`${x} - ${avg(x)}`;
295
- aux[ssq] = agg`SUM((${delta}) ** 2)`;
296
- aux[sum] = agg`SUM(${delta})`;
297
- const adj = correction ? ` - 1` : ''; // Bessel correction
298
- return agg`(SUM(${ssq}) - (SUM(${sum}) ** 2 / ${n})) / (${n}${adj})`;
299
- }
300
-
301
- /**
302
- * Generate an expression for calculating covariance over data partitions.
303
- * This method uses mean-centered data to reduce floating point error. The
304
- * covariance calculation uses four sufficient statistics: the count of
305
- * non-null value pairs, the sum of residual products, and residual sums
306
- * (of mean-centered values) for x and y. As a side effect, this method
307
- * adds columns for these statistics to the input *aux* object.
308
- * @param {object} aux An object for auxiliary columns (such as
309
- * sufficient statistics) to include in the data cube aggregation.
310
- * @param {any[]} args Source data table columns. The entries may be strings,
311
- * column references, SQL expressions, or other string-coercible values.
312
- * @param {(field: any) => string} avg Global average query generator.
313
- * @param {boolean|null} [correction=true] A flag for whether a Bessel
314
- * correction should be applied to compute the sample covariance rather
315
- * than the population covariance. If null, an expression for the
316
- * unnormalized covariance (no division by sample count) is returned.
317
- * @returns An aggregate expression for calculating covariance over
318
- * pre-aggregated data partitions.
319
- */
320
- function covarianceExpr(aux, args, avg, correction = true) {
321
- const n = regrCountExpr(aux, args);
322
- const sxy = regrSumXYExpr(aux, args, avg);
323
- const sx = regrSumExpr(aux, 1, args, avg);
324
- const sy = regrSumExpr(aux, 0, args, avg);
325
- const adj = correction === null ? '' // do not divide by count
326
- : correction ? ` / (${n} - 1)` // Bessel correction (sample)
327
- : ` / ${n}`; // no correction (population)
328
- return agg`(${sxy} - ${sx} * ${sy} / ${n})${adj}`;
329
- }
330
-
331
- /**
332
- * Generate an expression for calculating Pearson product-moment correlation
333
- * coefficients over data partitions. This method uses mean-centered data
334
- * to reduce floating point error. The correlation calculation uses six
335
- * sufficient statistics: the count of non-null value pairs, the sum of
336
- * residual products, and both residual sums and sums of squares for x and y.
337
- * As a side effect, this method adds columns for these statistics to the
338
- * input *aux* object.
339
- * @param {object} aux An object for auxiliary columns (such as
340
- * sufficient statistics) to include in the data cube aggregation.
341
- * @param {any[]} args Source data table columns. The entries may be strings,
342
- * column references, SQL expressions, or other string-coercible values.
343
- * @param {(field: any) => string} avg Global average query generator.
344
- * @returns An aggregate expression for calculating correlation over
345
- * pre-aggregated data partitions.
346
- */
347
- function corrExpr(aux, args, avg) {
348
- const n = regrCountExpr(aux, args);
349
- const sxy = regrSumXYExpr(aux, args, avg);
350
- const sxx = regrSumSqExpr(aux, 1, args, avg);
351
- const syy = regrSumSqExpr(aux, 0, args, avg);
352
- const sx = regrSumExpr(aux, 1, args, avg);
353
- const sy = regrSumExpr(aux, 0, args, avg);
354
- const vx = agg`(${sxx} - (${sx} ** 2) / ${n})`;
355
- const vy = agg`(${syy} - (${sy} ** 2) / ${n})`;
356
- return agg`(${sxy} - ${sx} * ${sy} / ${n}) / SQRT(${vx} * ${vy})`;
357
- }
358
-
359
- /**
360
- * Generate an expression for the count of non-null (x, y) pairs. As a side
361
- * effect, this method adds a column to the input *aux* object to track the
362
- * partition-level count of non-null pairs.
363
- * @param {object} aux An object for auxiliary columns (such as
364
- * sufficient statistics) to include in the data cube aggregation.
365
- * @param {any[]} args Source data table columns. The entries may be strings,
366
- * column references, SQL expressions, or other string-coercible values.
367
- * @returns An aggregate expression for calculating regression pair counts
368
- * over pre-aggregated data partitions.
369
- */
370
- function regrCountExpr(aux, [y, x]) {
371
- const n = auxName('count', y, x);
372
- aux[n] = agg`REGR_COUNT(${y}, ${x})`;
373
- return agg`SUM(${n})`.annotate({ name: n });
374
- }
375
-
376
- /**
377
- * Generate an expression for calculating sums of residual values for use in
378
- * covariance and regression queries. Only values corresponding to non-null
379
- * (x, y) pairs are included. This method uses mean-centered data to reduce
380
- * floating point error. As a side effect, this method adds a column for
381
- * partition-level sums to the input *aux* object.
382
- * @param {object} aux An object for auxiliary columns (such as
383
- * sufficient statistics) to include in the data cube aggregation.
384
- * @param {number} i An index indicating which argument column to sum.
385
- * @param {any[]} args Source data table columns. The entries may be strings,
386
- * column references, SQL expressions, or other string-coercible values.
387
- * @param {(field: any) => string} avg Global average query generator.
388
- * @returns An aggregate expression over pre-aggregated data partitions.
389
- */
390
- function regrSumExpr(aux, i, args, avg) {
391
- const v = args[i];
392
- const o = args[1 - i];
393
- const sum = auxName('rs', v);
394
- aux[sum] = agg`SUM(${v} - ${avg(v)}) FILTER (${o} IS NOT NULL)`;
395
- return agg`SUM(${sum})`
396
- }
397
-
398
- /**
399
- * Generate an expression for calculating sums of squared residual values for
400
- * use in covariance and regression queries. Only values corresponding to
401
- * non-null (x, y) pairs are included. This method uses mean-centered data to
402
- * reduce floating point error. As a side effect, this method adds a column
403
- * for partition-level sums to the input *aux* object.
404
- * @param {object} aux An object for auxiliary columns (such as
405
- * sufficient statistics) to include in the data cube aggregation.
406
- * @param {number} i An index indicating which argument column to sum.
407
- * @param {any[]} args Source data table columns. The entries may be strings,
408
- * column references, SQL expressions, or other string-coercible values.
409
- * @param {(field: any) => string} avg Global average query generator.
410
- * @returns An aggregate expression over pre-aggregated data partitions.
411
- */
412
- function regrSumSqExpr(aux, i, args, avg) {
413
- const v = args[i];
414
- const u = args[1 - i];
415
- const ssq = auxName('rss', v);
416
- aux[ssq] = agg`SUM((${v} - ${avg(v)}) ** 2) FILTER (${u} IS NOT NULL)`;
417
- return agg`SUM(${ssq})`
418
- }
419
-
420
- /**
421
- * Generate an expression for calculating sums of residual product values for
422
- * use in covariance and regression queries. Only values corresponding to
423
- * non-null (x, y) pairs are included. This method uses mean-centered data to
424
- * reduce floating point error. As a side effect, this method adds a column
425
- * for partition-level sums to the input *aux* object.
426
- * @param {object} aux An object for auxiliary columns (such as
427
- * sufficient statistics) to include in the data cube aggregation.
428
- * @param {any[]} args Source data table columns. The entries may be strings,
429
- * column references, SQL expressions, or other string-coercible values.
430
- * @param {(field: any) => string} avg Global average query generator.
431
- * @returns An aggregate expression over pre-aggregated data partitions.
432
- */
433
- function regrSumXYExpr(aux, args, avg) {
434
- const [y, x] = args;
435
- const sxy = auxName('sxy', y, x);
436
- aux[sxy] = agg`SUM((${x} - ${avg(x)}) * (${y} - ${avg(y)}))`;
437
- return agg`SUM(${sxy})`;
438
- }
439
-
440
- /**
441
- * Generate an expression for the average x value in a regression context.
442
- * Only values corresponding to non-null (x, y) pairs are included. As a side
443
- * effect, this method adds columns to the input *aux* object to track both
444
- * the count of non-null pairs and partition-level averages.
445
- * @param {object} aux An object for auxiliary columns (such as
446
- * sufficient statistics) to include in the data cube aggregation.
447
- * @param {any[]} args Source data table columns. The entries may be strings,
448
- * column references, SQL expressions, or other string-coercible values.
449
- * @returns An aggregate expression over pre-aggregated data partitions.
450
- */
451
- function regrAvgXExpr(aux, args) {
452
- const [y, x] = args;
453
- const n = regrCountExpr(aux, args);
454
- const a = auxName('avg', x, y);
455
- aux[a] = agg`REGR_AVGX(${y}, ${x})`;
456
- return agg`(SUM(${a} * ${n.name}) / ${n})`;
457
- }
458
-
459
- /**
460
- * Generate an expression for the average y value in a regression context.
461
- * Only values corresponding to non-null (x, y) pairs are included. As a side
462
- * effect, this method adds columns to the input *aux* object to track both
463
- * the count of non-null pairs and partition-level averages.
464
- * @param {object} aux An object for auxiliary columns (such as
465
- * sufficient statistics) to include in the data cube aggregation.
466
- * @param {any[]} args Source data table columns. The entries may be strings,
467
- * column references, SQL expressions, or other string-coercible values.
468
- * @returns An aggregate expression over pre-aggregated data partitions.
469
- */
470
- function regrAvgYExpr(aux, args) {
471
- const [y, x] = args;
472
- const n = regrCountExpr(aux, args);
473
- const a = auxName('avg', y, x);
474
- aux[a] = agg`REGR_AVGY(${y}, ${x})`;
475
- return agg`(SUM(${a} * ${n.name}) / ${n})`;
476
- }
477
-
478
- /**
479
- * Generate an expression for calculating variance over data partitions for
480
- * use in covariance and regression queries. Only values corresponding to
481
- * non-null (x, y) pairs are included. This method uses mean-centered data to
482
- * reduce floating point error. As a side effect, this method adds columns
483
- * for partition-level count and sums to the input *aux* object.
484
- * @param {object} aux An object for auxiliary columns (such as
485
- * sufficient statistics) to include in the data cube aggregation.
486
- * @param {number} i The index of the argument to compute the variance for.
487
- * @param {any[]} args Source data table columns. The entries may be strings,
488
- * column references, SQL expressions, or other string-coercible values.
489
- * @param {(field: any) => string} avg Global average query generator.
490
- * @returns An aggregate expression for calculating variance over
491
- * pre-aggregated data partitions.
492
- */
493
- function regrVarExpr(aux, i, args, avg) {
494
- const n = regrCountExpr(aux, args);
495
- const sum = regrSumExpr(aux, i, args, avg);
496
- const ssq = regrSumSqExpr(aux, i, args, avg);
497
- return agg`(${ssq} - (${sum} ** 2 / ${n}))`;
498
- }
499
-
500
- /**
501
- * Generate an expression for calculating a regression slope. The slope is
502
- * computed as the covariance divided by the variance of the x variable. As a
503
- * side effect, this method adds columns for sufficient statistics to the
504
- * input *aux* object.
505
- * @param {object} aux An object for auxiliary columns (such as
506
- * sufficient statistics) to include in the data cube aggregation.
507
- * @param {any[]} args Source data table columns. The entries may be strings,
508
- * column references, SQL expressions, or other string-coercible values.
509
- * @param {(field: any) => string} avg Global average query generator.
510
- * @returns An aggregate expression for calculating regression slopes over
511
- * pre-aggregated data partitions.
512
- */
513
- function regrSlopeExpr(aux, args, avg) {
514
- const cov = covarianceExpr(aux, args, avg, null);
515
- const varx = regrVarExpr(aux, 1, args, avg);
516
- return agg`(${cov}) / ${varx}`;
517
- }
518
-
519
- /**
520
- * Generate an expression for calculating a regression intercept. The intercept
521
- * is derived from the regression slope and average x and y values. As a
522
- * side effect, this method adds columns for sufficient statistics to the
523
- * input *aux* object.
524
- * @param {object} aux An object for auxiliary columns (such as
525
- * sufficient statistics) to include in the data cube aggregation.
526
- * @param {any[]} args Source data table columns. The entries may be strings,
527
- * column references, SQL expressions, or other string-coercible values.
528
- * @param {(field: any) => string} avg Global average query generator.
529
- * @returns An aggregate expression for calculating regression intercepts over
530
- * pre-aggregated data partitions.
531
- */
532
- function regrInterceptExpr(aux, args, avg) {
533
- const ax = regrAvgXExpr(aux, args);
534
- const ay = regrAvgYExpr(aux, args);
535
- const m = regrSlopeExpr(aux, args, avg);
536
- return agg`${ay} - (${m}) * ${ax}`;
537
- }