@uwdata/mosaic-core 0.10.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/mosaic-core.js +12960 -21458
- package/dist/mosaic-core.min.js +7 -16
- package/dist/types/Coordinator.d.ts +169 -0
- package/dist/types/MosaicClient.d.ts +94 -0
- package/dist/types/Param.d.ts +47 -0
- package/dist/types/QueryConsolidator.d.ts +9 -0
- package/dist/types/QueryManager.d.ts +64 -0
- package/dist/types/Selection.d.ts +224 -0
- package/dist/types/SelectionClause.d.ts +105 -0
- package/dist/types/connectors/rest.d.ts +17 -0
- package/dist/types/connectors/socket.d.ts +18 -0
- package/dist/types/connectors/wasm.d.ts +16 -0
- package/dist/types/index.d.ts +25 -0
- package/dist/types/preagg/PreAggregator.d.ts +178 -0
- package/dist/types/preagg/preagg-columns.d.ts +14 -0
- package/dist/types/preagg/sufficient-statistics.d.ts +13 -0
- package/dist/types/util/AsyncDispatch.d.ts +100 -0
- package/dist/types/util/cache.d.ts +13 -0
- package/dist/types/util/decode-ipc.d.ts +7 -0
- package/dist/types/util/distinct.d.ts +2 -0
- package/dist/types/util/field-info.d.ts +13 -0
- package/dist/types/util/hash.d.ts +1 -0
- package/dist/types/util/is-arrow-table.d.ts +8 -0
- package/dist/types/util/js-type.d.ts +1 -0
- package/dist/types/util/priority-queue.d.ts +37 -0
- package/dist/types/util/query-result.d.ts +44 -0
- package/dist/types/util/selection-types.d.ts +114 -0
- package/dist/types/util/synchronizer.d.ts +29 -0
- package/dist/types/util/throttle.d.ts +11 -0
- package/dist/types/util/to-data-columns.d.ts +29 -0
- package/dist/types/util/void-logger.d.ts +7 -0
- package/jsconfig.json +11 -0
- package/package.json +10 -8
- package/src/Coordinator.js +66 -41
- package/src/MosaicClient.js +14 -4
- package/src/QueryConsolidator.js +32 -39
- package/src/QueryManager.js +85 -48
- package/src/Selection.js +49 -15
- package/src/SelectionClause.js +19 -22
- package/src/connectors/rest.js +6 -4
- package/src/connectors/socket.js +7 -4
- package/src/connectors/wasm.js +20 -4
- package/src/index.js +16 -8
- package/src/preagg/PreAggregator.js +407 -0
- package/src/preagg/preagg-columns.js +103 -0
- package/src/preagg/sufficient-statistics.js +439 -0
- package/src/util/decode-ipc.js +11 -0
- package/src/util/field-info.js +19 -16
- package/src/util/hash.js +1 -1
- package/src/util/is-arrow-table.js +10 -0
- package/src/util/priority-queue.js +75 -76
- package/src/util/query-result.js +44 -2
- package/src/util/selection-types.ts +3 -3
- package/src/util/throttle.js +21 -9
- package/src/util/to-data-columns.js +4 -15
- package/src/util/void-logger.js +6 -5
- package/tsconfig.json +11 -0
- package/src/DataCubeIndexer.js +0 -320
- package/src/util/convert-arrow.js +0 -145
- package/src/util/index-columns.js +0 -540
|
@@ -0,0 +1,439 @@
|
|
|
1
|
+
import { AggregateNode, and, argmax, argmin, count, div, ExprNode, isNotNull, max, min, mul, pow, regrAvgX, regrAvgY, regrCount, sql, sqrt, sub, sum } from '@uwdata/mosaic-sql';
|
|
2
|
+
import { fnv_hash } from '../util/hash.js';
|
|
3
|
+
|
|
4
|
+
/**
 * Determine sufficient statistics to preaggregate the given node. This
 * method populates the *preagg* argument with the column expressions that
 * must be included in the preaggregation table, and returns an output
 * expression that reconstructs the aggregate from those statistics.
 * @param {AggregateNode} node An aggregate function.
 * @param {Record<string, ExprNode>} preagg Map of column names to
 *  expressions to include in the preaggregation table.
 * @param {(field: any) => ExprNode} avg Global average query generator,
 *  used to mean-center data for variance and regression statistics.
 * @returns {ExprNode} Output aggregate expression that uses preaggregated
 *  sufficient statistics to service updates, or null if the aggregate
 *  function is unsupported.
 */
export function sufficientStatistics(node, preagg, avg) {
  switch (node.name) {
    case 'count':
    case 'sum':
      return sumExpr(preagg, node);
    case 'avg':
      return avgExpr(preagg, node);
    case 'arg_max':
      return argmaxExpr(preagg, node);
    case 'arg_min':
      return argminExpr(preagg, node);

    // variance statistics drop the original aggregate operation
    // in favor of tracking sufficient statistics
    case 'variance':
    case 'var_samp':
      return varianceExpr(preagg, node, avg);
    case 'var_pop':
      return varianceExpr(preagg, node, avg, false);
    case 'stddev':
    case 'stddev_samp':
      return sqrt(varianceExpr(preagg, node, avg));
    case 'stddev_pop':
      return sqrt(varianceExpr(preagg, node, avg, false));
    case 'covar_samp':
      return covarianceExpr(preagg, node, avg);
    case 'covar_pop':
      return covarianceExpr(preagg, node, avg, false);
    case 'corr':
      return corrExpr(preagg, node, avg);

    // regression statistics
    case 'regr_count':
      return regrCountExpr(preagg, node).expr;
    case 'regr_avgx':
      return regrAvgXExpr(preagg, node);
    case 'regr_avgy':
      return regrAvgYExpr(preagg, node);
    case 'regr_syy':
      return regrVarExpr(preagg, 0, node, avg);
    case 'regr_sxx':
      return regrVarExpr(preagg, 1, node, avg);
    case 'regr_sxy':
      return covarianceExpr(preagg, node, avg, null);
    case 'regr_slope':
      return regrSlopeExpr(preagg, node, avg);
    case 'regr_intercept':
      return regrInterceptExpr(preagg, node, avg);
    case 'regr_r2':
      return pow(corrExpr(preagg, node, avg), 2);

    // aggregates that commute directly
    case 'max':
    case 'min':
    case 'bit_and':
    case 'bit_or':
    case 'bit_xor':
    case 'bool_and':
    case 'bool_or':
    case 'product': {
      const name = colName(node);
      preagg[name] = node;
      return sql`${node.name}("${name}")`;
    }

    // unsupported aggregate, return null to indicate failure
    default: return null;
  }
}
|
|
83
|
+
|
|
84
|
+
/**
 * Generate a column name for the given aggregate node. The name is
 * derived from an FNV hash of the node's string-serialized SQL form.
 * @param {AggregateNode} node The aggregate node to name.
 * @returns {string} The generated column name.
 */
function colName(node) {
  const hash = fnv_hash(String(node)).toString(16);
  return `pre_${hash}`;
}
|
|
93
|
+
|
|
94
|
+
/**
 * Add a sufficient statistic to the preaggregation column set.
 * Generates a unique column name for the statistic and propagates
 * a FILTER clause if one exists on the original aggregate node.
 * @param {Record<string, ExprNode>} preagg A map of columns (such as
 *  sufficient statistics) to pre-aggregate.
 * @param {AggregateNode} expr The aggregate statistic to add.
 * @param {AggregateNode} [node] The originating aggregate function call.
 * @returns {string} The name of the statistic column.
 */
function addStat(preagg, expr, node) {
  const outer = node?.filter;
  let stat = expr;
  if (outer) {
    // propagate the originating node's filter to the statistic,
    // conjoining it with any filter the statistic already carries
    const combined = stat.filter ? and(outer, stat.filter) : outer;
    stat = stat.where(combined);
  }
  const name = colName(stat);
  preagg[name] = stat;
  return name;
}
|
|
116
|
+
|
|
117
|
+
/**
 * Generate an expression for calculating counts over data dimensions.
 * As a side effect, this method adds a column to the input *preagg* object
 * to track the count of non-null values per-partition.
 * @param {Record<string, ExprNode>} preagg A map of columns (such as
 *  sufficient statistics) to pre-aggregate.
 * @param {AggregateNode} node The originating aggregate function call.
 * @returns {{ expr: ExprNode, name: string }} An aggregate expression over
 *  pre-aggregated dimensions and associated column name.
 */
function countExpr(preagg, node) {
  const [arg] = node.args;
  const name = addStat(preagg, count(arg), node);
  return { name, expr: sum(name) };
}
|
|
131
|
+
|
|
132
|
+
/**
 * Generate an expression for calculating counts or sums over data dimensions.
 * Adds the original aggregate as a pre-aggregation column and re-sums it.
 * @param {Record<string, ExprNode>} preagg A map of columns (such as
 *  sufficient statistics) to pre-aggregate.
 * @param {AggregateNode} node The originating aggregate function call.
 * @returns {ExprNode} An aggregate expression over pre-aggregated dimensions.
 */
function sumExpr(preagg, node) {
  const name = addStat(preagg, node);
  return sum(name);
}
|
|
142
|
+
|
|
143
|
+
/**
 * Generate an expression for calculating averages over data dimensions.
 * As a side effect, this method adds a column to the input *preagg* object
 * to track the count of non-null values per-partition.
 * @param {Record<string, ExprNode>} preagg A map of columns (such as
 *  sufficient statistics) to pre-aggregate.
 * @param {AggregateNode} [node] The originating aggregate function call.
 * @returns {ExprNode} An aggregate expression over pre-aggregated dimensions.
 */
function avgExpr(preagg, node) {
  const avgCol = addStat(preagg, node);
  const { expr: total, name: countCol } = countExpr(preagg, node);
  // weighted average of partition averages: sum(avg * count) / total count
  return div(sum(mul(avgCol, countCol)), total);
}
|
|
157
|
+
|
|
158
|
+
/**
 * Generate an expression for calculating argmax over data dimensions.
 * As a side effect, this method adds a column to the input *preagg* object
 * to track a maximum value per-partition.
 * @param {Record<string, ExprNode>} preagg A map of columns (such as
 *  sufficient statistics) to pre-aggregate.
 * @param {AggregateNode} node The originating aggregate function call.
 * @returns {ExprNode} An aggregate expression over pre-aggregated dimensions.
 */
function argmaxExpr(preagg, node) {
  const valCol = addStat(preagg, node);
  const maxCol = addStat(preagg, max(node.args[1]), node);
  return argmax(valCol, maxCol);
}
|
|
172
|
+
|
|
173
|
+
/**
 * Generate an expression for calculating argmin over data dimensions.
 * As a side effect, this method adds a column to the input *preagg* object
 * to track a minimum value per-partition.
 * @param {Record<string, ExprNode>} preagg A map of columns (such as
 *  sufficient statistics) to pre-aggregate.
 * @param {AggregateNode} node The originating aggregate function call.
 * @returns {ExprNode} An aggregate expression over pre-aggregated dimensions.
 */
function argminExpr(preagg, node) {
  const expr = addStat(preagg, node);
  const miny = addStat(preagg, min(node.args[1]), node);
  return argmin(expr, miny);
}
|
|
188
|
+
|
|
189
|
+
/**
 * Generate an expression for calculating variance over data dimensions.
 * This method uses the "textbook" definition of variance (E[X^2] - E[X]^2),
 * but on mean-centered data to reduce floating point error. The variance
 * calculation uses three sufficient statistics: the count of non-null values,
 * the residual sum of squares and the sum of residual (mean-centered) values.
 * As a side effect, this method adds columns for these statistics to the
 * input *preagg* object.
 * @param {Record<string, ExprNode>} preagg A map of columns (such as
 *  sufficient statistics) to pre-aggregate.
 * @param {AggregateNode} node The originating aggregate function call.
 * @param {(field: any) => ExprNode} avg Global average query generator.
 * @param {boolean} [correction=true] A flag for whether a Bessel
 *  correction should be applied to compute the sample variance
 *  rather than the population variance.
 * @returns {ExprNode} An aggregate expression over pre-aggregated dimensions.
 */
function varianceExpr(preagg, node, avg, correction = true) {
  const x = node.args[0];
  const { expr: n } = countExpr(preagg, node);
  const delta = sub(x, avg(x));
  const rssq = addStat(preagg, sum(pow(delta, 2)), node); // residual sum of squares
  const rsum = addStat(preagg, sum(delta), node); // residual sum
  const denom = correction ? sub(n, 1) : n; // Bessel correction
  return div(sub(sum(rssq), div(pow(sum(rsum), 2), n)), denom);
}
|
|
215
|
+
|
|
216
|
+
/**
 * Generate an expression for calculating covariance over data dimensions.
 * This method uses mean-centered data to reduce floating point error. The
 * covariance calculation uses four sufficient statistics: the count of
 * non-null value pairs, the sum of residual products, and residual sums
 * (of mean-centered values) for x and y. As a side effect, this method
 * adds columns for these statistics to the input *preagg* object.
 * @param {Record<string, ExprNode>} preagg A map of columns (such as
 *  sufficient statistics) to pre-aggregate.
 * @param {AggregateNode} node The originating aggregate function call.
 * @param {(field: any) => ExprNode} avg Global average query generator.
 * @param {boolean|null} [correction=true] A flag for whether a Bessel
 *  correction should be applied to compute the sample covariance rather
 *  than the population covariance. If null, an expression for the
 *  unnormalized covariance (no division by sample count) is returned.
 * @returns {ExprNode} An aggregate expression over pre-aggregated dimensions.
 */
function covarianceExpr(preagg, node, avg, correction = true) {
  const { expr: n } = regrCountExpr(preagg, node);
  const sxy = regrSumXYExpr(preagg, node, avg);
  const sx = regrSumExpr(preagg, 1, node, avg);
  const sy = regrSumExpr(preagg, 0, node, avg);
  const num = sub(sxy, div(mul(sx, sy), n));
  return correction === null ? num // do not divide by count
    : correction ? div(num, sub(n, 1)) // Bessel correction (sample)
    : div(num, n); // no correction (population)
}
|
|
243
|
+
|
|
244
|
+
/**
 * Generate an expression for calculating Pearson product-moment correlation
 * coefficients over data dimensions. This method uses mean-centered data
 * to reduce floating point error. The correlation calculation uses six
 * sufficient statistics: the count of non-null value pairs, the sum of
 * residual products, and both residual sums and sums of squares for x and y.
 * As a side effect, this method adds columns for these statistics to the
 * input *preagg* object.
 * @param {Record<string, ExprNode>} preagg A map of columns (such as
 *  sufficient statistics) to pre-aggregate.
 * @param {AggregateNode} node The originating aggregate function call.
 * @param {(field: any) => ExprNode} avg Global average query generator.
 * @returns {ExprNode} An aggregate expression over pre-aggregated dimensions.
 */
function corrExpr(preagg, node, avg) {
  const { expr: pairs } = regrCountExpr(preagg, node);
  const sumXY = regrSumXYExpr(preagg, node, avg);
  const sumXX = regrSumSqExpr(preagg, 1, node, avg);
  const sumYY = regrSumSqExpr(preagg, 0, node, avg);
  const sumX = regrSumExpr(preagg, 1, node, avg);
  const sumY = regrSumExpr(preagg, 0, node, avg);
  // corr = cov(x, y) / sqrt(var(x) * var(y)), all unnormalized
  const varX = sub(sumXX, div(pow(sumX, 2), pairs));
  const varY = sub(sumYY, div(pow(sumY, 2), pairs));
  const covXY = sub(sumXY, div(mul(sumX, sumY), pairs));
  return div(covXY, sqrt(mul(varX, varY)));
}
|
|
272
|
+
|
|
273
|
+
/**
 * Generate an expression for the count of non-null (x, y) pairs. As a side
 * effect, this method adds a column to the input *preagg* object for the
 * partition-level count of non-null pairs.
 * @param {Record<string, ExprNode>} preagg A map of columns (such as
 *  sufficient statistics) to pre-aggregate.
 * @param {AggregateNode} node The originating aggregate function call.
 * @returns {{ expr: ExprNode, name: string }} An aggregate expression over
 *  pre-aggregated dimensions and associated column name.
 */
function regrCountExpr(preagg, node) {
  const [x, y] = node.args;
  const name = addStat(preagg, regrCount(x, y), node);
  return { name, expr: sum(name) };
}
|
|
288
|
+
|
|
289
|
+
/**
 * Generate an expression for calculating sums of residual values for use in
 * covariance and regression queries. Only values corresponding to non-null
 * (x, y) pairs are included. This method uses mean-centered data to reduce
 * floating point error. As a side effect, this method adds a column for
 * partition-level sums to the input *preagg* object.
 * @param {Record<string, ExprNode>} preagg A map of columns (such as
 *  sufficient statistics) to pre-aggregate.
 * @param {number} i An index indicating which argument column to sum.
 * @param {AggregateNode} node The originating aggregate function call.
 * @param {(field: any) => ExprNode} avg Global average query generator.
 * @returns {ExprNode} An aggregate expression over pre-aggregated dimensions.
 */
function regrSumExpr(preagg, i, node, avg) {
  const value = node.args[i];
  const other = node.args[1 - i];
  // sum of mean-centered values, restricted to rows where the paired value is non-null
  const residualSum = sum(sub(value, avg(value))).where(isNotNull(other));
  const name = addStat(preagg, residualSum, node);
  return sum(name);
}
|
|
309
|
+
|
|
310
|
+
/**
 * Generate an expression for calculating sums of squared residual values for
 * use in covariance and regression queries. Only values corresponding to
 * non-null (x, y) pairs are included. This method uses mean-centered data to
 * reduce floating point error. As a side effect, this method adds a column
 * for partition-level sums to the input *preagg* object.
 * @param {Record<string, ExprNode>} preagg A map of columns (such as
 *  sufficient statistics) to pre-aggregate.
 * @param {number} i An index indicating which argument column to sum.
 * @param {AggregateNode} node The originating aggregate function call.
 * @param {(field: any) => ExprNode} avg Global average query generator.
 * @returns {ExprNode} An aggregate expression over pre-aggregated dimensions.
 */
function regrSumSqExpr(preagg, i, node, avg) {
  const args = node.args;
  const v = args[i];
  const u = args[1 - i];
  const ssq = sum(pow(sub(v, avg(v)), 2)).where(isNotNull(u));
  return sum(addStat(preagg, ssq, node));
}
|
|
330
|
+
|
|
331
|
+
/**
 * Generate an expression for calculating sums of residual product values for
 * use in covariance and regression queries. Only values corresponding to
 * non-null (x, y) pairs are included. This method uses mean-centered data to
 * reduce floating point error. As a side effect, this method adds a column
 * for partition-level sums to the input *preagg* object.
 * @param {Record<string, ExprNode>} preagg A map of columns (such as
 *  sufficient statistics) to pre-aggregate.
 * @param {AggregateNode} node The originating aggregate function call.
 * @param {(field: any) => ExprNode} avg Global average query generator.
 * @returns {ExprNode} An aggregate expression over pre-aggregated dimensions.
 */
function regrSumXYExpr(preagg, node, avg) {
  const [y, x] = node.args;
  const dx = sub(x, avg(x));
  const dy = sub(y, avg(y));
  const name = addStat(preagg, sum(mul(dx, dy)), node);
  return sum(name);
}
|
|
348
|
+
|
|
349
|
+
/**
 * Generate an expression for the average x value in a regression context.
 * Only values corresponding to non-null (x, y) pairs are included. As a side
 * effect, this method adds columns to the input *preagg* object to track both
 * the count of non-null pairs and partition-level averages.
 * @param {Record<string, ExprNode>} preagg A map of columns (such as
 *  sufficient statistics) to pre-aggregate.
 * @param {AggregateNode} node The originating aggregate function call.
 * @returns {ExprNode} An aggregate expression over pre-aggregated dimensions.
 */
function regrAvgXExpr(preagg, node) {
  const [y, x] = node.args;
  const { expr: total, name: countCol } = regrCountExpr(preagg, node);
  const avgCol = addStat(preagg, regrAvgX(y, x), node);
  // weighted average of partition averages: sum(avg * count) / total count
  return div(sum(mul(avgCol, countCol)), total);
}
|
|
365
|
+
|
|
366
|
+
/**
 * Generate an expression for the average y value in a regression context.
 * Only values corresponding to non-null (x, y) pairs are included. As a side
 * effect, this method adds columns to the input *preagg* object to track both
 * the count of non-null pairs and partition-level averages.
 * @param {Record<string, ExprNode>} preagg A map of columns (such as
 *  sufficient statistics) to pre-aggregate.
 * @param {AggregateNode} node The originating aggregate function call.
 * @returns {ExprNode} An aggregate expression over pre-aggregated dimensions.
 */
function regrAvgYExpr(preagg, node) {
  const [y, x] = node.args;
  const { expr: total, name: countCol } = regrCountExpr(preagg, node);
  const avgCol = addStat(preagg, regrAvgY(y, x), node);
  // weighted average of partition averages: sum(avg * count) / total count
  return div(sum(mul(avgCol, countCol)), total);
}
|
|
382
|
+
|
|
383
|
+
/**
 * Generate an expression for calculating variance over data dimensions for
 * use in covariance and regression queries. Only values corresponding to
 * non-null (x, y) pairs are included. This method uses mean-centered data to
 * reduce floating point error. As a side effect, this method adds columns
 * for partition-level count and sums to the input *preagg* object.
 * @param {Record<string, ExprNode>} preagg A map of columns (such as
 *  sufficient statistics) to pre-aggregate.
 * @param {number} i The index of the argument to compute the variance for.
 * @param {AggregateNode} node The originating aggregate function call.
 * @param {(field: any) => ExprNode} avg Global average query generator.
 * @returns {ExprNode} An aggregate expression for calculating variance
 *  over pre-aggregated data dimensions.
 */
function regrVarExpr(preagg, i, node, avg) {
  const { expr: n } = regrCountExpr(preagg, node);
  // name the residual sum `rsum` to avoid shadowing the imported sum()
  const rsum = regrSumExpr(preagg, i, node, avg);
  const ssq = regrSumSqExpr(preagg, i, node, avg);
  return sub(ssq, div(pow(rsum, 2), n));
}
|
|
403
|
+
|
|
404
|
+
/**
 * Generate an expression for calculating a regression slope. The slope is
 * computed as the covariance divided by the variance of the x variable. As a
 * side effect, this method adds columns for sufficient statistics to the
 * input *preagg* object.
 * @param {Record<string, ExprNode>} preagg A map of columns (such as
 *  sufficient statistics) to pre-aggregate.
 * @param {AggregateNode} node The originating aggregate function call.
 * @param {(field: any) => ExprNode} avg Global average query generator.
 * @returns {ExprNode} An aggregate expression for calculating regression
 *  slopes over pre-aggregated data dimensions.
 */
function regrSlopeExpr(preagg, node, avg) {
  // unnormalized covariance divided by unnormalized variance of x
  const covariance = covarianceExpr(preagg, node, avg, null);
  const varianceX = regrVarExpr(preagg, 1, node, avg);
  return div(covariance, varianceX);
}
|
|
421
|
+
|
|
422
|
+
/**
 * Generate an expression for calculating a regression intercept. The intercept
 * is derived from the regression slope and average x and y values. As a
 * side effect, this method adds columns for sufficient statistics to the
 * input *preagg* object.
 * @param {Record<string, ExprNode>} preagg A map of columns (such as
 *  sufficient statistics) to pre-aggregate.
 * @param {AggregateNode} node The originating aggregate function call.
 * @param {(field: any) => ExprNode} avg Global average query generator.
 * @returns {ExprNode} An aggregate expression for calculating regression
 *  intercepts over pre-aggregated data dimensions.
 */
function regrInterceptExpr(preagg, node, avg) {
  const meanX = regrAvgXExpr(preagg, node);
  const meanY = regrAvgYExpr(preagg, node);
  const slope = regrSlopeExpr(preagg, node, avg);
  // intercept = mean(y) - slope * mean(x)
  return sub(meanY, mul(slope, meanX));
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import { tableFromIPC } from '@uwdata/flechette';
|
|
2
|
+
|
|
3
|
+
/**
 * Decode Arrow IPC bytes to a table instance, with an option to map date and
 * timestamp values to JS Date objects.
 * @param {ArrayBuffer | Uint8Array} data Arrow IPC bytes.
 * @returns {import('@uwdata/flechette').Table} A table instance.
 */
export function decodeIPC(data) {
  const options = { useDate: true };
  return tableFromIPC(data, options);
}
|
package/src/util/field-info.js
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
|
-
import { Query,
|
|
1
|
+
import { AggregateNode, Query, asTableRef, count, isNull, max, min, sql } from '@uwdata/mosaic-sql';
|
|
2
2
|
import { jsType } from './js-type.js';
|
|
3
|
-
import { convertArrowValue } from './convert-arrow.js';
|
|
4
3
|
|
|
5
4
|
export const Count = 'count';
|
|
6
5
|
export const Nulls = 'nulls';
|
|
@@ -9,6 +8,9 @@ export const Min = 'min';
|
|
|
9
8
|
export const Distinct = 'distinct';
|
|
10
9
|
export const Stats = { Count, Nulls, Max, Min, Distinct };
|
|
11
10
|
|
|
11
|
+
/**
|
|
12
|
+
* @type {Record<string, (column: string) => AggregateNode>}
|
|
13
|
+
*/
|
|
12
14
|
const statMap = {
|
|
13
15
|
[Count]: count,
|
|
14
16
|
[Distinct]: column => count(column).distinct(),
|
|
@@ -17,14 +19,21 @@ const statMap = {
|
|
|
17
19
|
[Nulls]: column => count().where(isNull(column))
|
|
18
20
|
};
|
|
19
21
|
|
|
22
|
+
/**
|
|
23
|
+
*
|
|
24
|
+
* @param {string} table
|
|
25
|
+
* @param {string} column
|
|
26
|
+
* @param {string[]|Set<string>} stats
|
|
27
|
+
* @returns
|
|
28
|
+
*/
|
|
20
29
|
function summarize(table, column, stats) {
|
|
21
30
|
return Query
|
|
22
31
|
.from(table)
|
|
23
|
-
.select(Array.from(stats, s => [s
|
|
32
|
+
.select(Array.from(stats, s => ({[s]: statMap[s](column)})));
|
|
24
33
|
}
|
|
25
34
|
|
|
26
35
|
export async function queryFieldInfo(mc, fields) {
|
|
27
|
-
if (fields.length === 1 &&
|
|
36
|
+
if (fields.length === 1 && fields[0].column === '*') {
|
|
28
37
|
return getTableInfo(mc, fields[0].table);
|
|
29
38
|
} else {
|
|
30
39
|
return (await Promise
|
|
@@ -36,7 +45,8 @@ export async function queryFieldInfo(mc, fields) {
|
|
|
36
45
|
async function getFieldInfo(mc, { table, column, stats }) {
|
|
37
46
|
// generate and issue a query for field metadata info
|
|
38
47
|
// use GROUP BY ALL to differentiate & consolidate aggregates
|
|
39
|
-
const q = Query
|
|
48
|
+
const q = Query
|
|
49
|
+
.from({ source: table })
|
|
40
50
|
.select({ column })
|
|
41
51
|
.groupby(column.aggregate ? sql`ALL` : []);
|
|
42
52
|
const [desc] = Array.from(await mc.query(Query.describe(q)));
|
|
@@ -52,24 +62,17 @@ async function getFieldInfo(mc, { table, column, stats }) {
|
|
|
52
62
|
if (!(stats?.length || stats?.size)) return info;
|
|
53
63
|
|
|
54
64
|
// query for summary stats
|
|
55
|
-
const result = await mc.query(
|
|
65
|
+
const [result] = await mc.query(
|
|
56
66
|
summarize(table, column, stats),
|
|
57
67
|
{ persist: true }
|
|
58
68
|
);
|
|
59
69
|
|
|
60
|
-
// extract summary stats, copy to field info
|
|
61
|
-
|
|
62
|
-
const { name } = result.schema.fields[i];
|
|
63
|
-
const child = result.getChildAt(i);
|
|
64
|
-
const convert = convertArrowValue(child.type);
|
|
65
|
-
info[name] = convert(child.get(0));
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
return info;
|
|
70
|
+
// extract summary stats, copy to field info, and return
|
|
71
|
+
return Object.assign(info, result);
|
|
69
72
|
}
|
|
70
73
|
|
|
71
74
|
async function getTableInfo(mc, table) {
|
|
72
|
-
const result = await mc.query(`DESCRIBE ${
|
|
75
|
+
const result = await mc.query(`DESCRIBE ${asTableRef(table)}`);
|
|
73
76
|
return Array.from(result).map(desc => ({
|
|
74
77
|
table,
|
|
75
78
|
column: desc.column_name,
|
package/src/util/hash.js
CHANGED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
 * Test if a value is a Flechette Arrow table.
 * We use a "duck typing" approach and check for a getChild function.
 * @param {*} values The value to test.
 * @returns {values is import('@uwdata/flechette').Table}
 *  true if the value duck types as Arrow data
 */
export function isArrowTable(values) {
  // optional chaining handles null/undefined without throwing
  const getChild = values?.getChild;
  return typeof getChild === 'function';
}
|