@uwdata/mosaic-core 0.7.1 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/mosaic-core.js +2126 -1655
- package/dist/mosaic-core.min.js +8 -8
- package/package.json +9 -6
- package/src/Coordinator.js +11 -5
- package/src/DataCubeIndexer.js +78 -108
- package/src/FilterGroup.js +27 -10
- package/src/MosaicClient.js +13 -5
- package/src/Param.js +4 -2
- package/src/QueryConsolidator.js +6 -0
- package/src/QueryManager.js +101 -93
- package/src/Selection.js +33 -17
- package/src/SelectionClause.js +147 -0
- package/src/connectors/rest.js +7 -0
- package/src/connectors/socket.js +7 -0
- package/src/connectors/wasm.js +4 -4
- package/src/index.js +1 -0
- package/src/util/AsyncDispatch.js +15 -12
- package/src/util/convert-arrow.js +32 -35
- package/src/util/index-columns.js +538 -0
- package/src/util/js-type.js +1 -0
- package/src/util/query-result.js +4 -3
- package/src/util/selection-types.ts +137 -0
|
@@ -1,34 +1,29 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
1
|
+
import { DataType } from 'apache-arrow';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* @typedef {import('apache-arrow').Vector} Vector
|
|
5
|
+
*/
|
|
6
6
|
|
|
7
7
|
/**
 * Check whether a value looks like an Apache Arrow table.
 * Multiple Arrow versions may be in use at the same time, so the value
 * is "duck typed": it qualifies if it exposes a getChild function.
 * @param {*} values The value to test
 * @returns {values is import('apache-arrow').Table} true if the value
 *  duck types as Apache Arrow data
 */
export function isArrowTable(values) {
  const getChild = values?.getChild;
  return typeof getChild === 'function';
}
|
|
17
17
|
|
|
18
18
|
/**
 * Select the JavaScript array constructor used to hold the values of
 * an Apache Arrow column type.
 * @param {DataType} type an Apache Arrow column type
 * @returns a JavaScript array constructor: Float64Array for integer,
 *  floating point, and decimal types, otherwise Array
 */
export function convertArrowArrayType(type) {
  // numeric column types are stored as double precision values
  if (DataType.isInt(type)) return Float64Array;
  if (DataType.isFloat(type)) return Float64Array;
  if (DataType.isDecimal(type)) return Float64Array;

  // everything else uses a standard array
  return Array;
}
|
|
33
28
|
|
|
34
29
|
/**
|
|
@@ -37,24 +32,22 @@ export function convertArrowArrayType(type) {
|
|
|
37
32
|
* Large integers (BigInt) are converted to Float64 numbers.
|
|
38
33
|
* Fixed-point decimal values are converted to Float64 numbers.
|
|
39
34
|
* Otherwise, the default Arrow values are used.
|
|
40
|
-
* @param {
|
|
35
|
+
* @param {DataType} type an Apache Arrow column type
|
|
41
36
|
* @returns a value conversion function
|
|
42
37
|
*/
|
|
43
38
|
export function convertArrowValue(type) {
|
|
44
|
-
const { typeId } = type;
|
|
45
|
-
|
|
46
39
|
// map timestamp numbers to date objects
|
|
47
|
-
if (
|
|
40
|
+
if (DataType.isTimestamp(type)) {
|
|
48
41
|
return v => v == null ? v : new Date(v);
|
|
49
42
|
}
|
|
50
43
|
|
|
51
44
|
// map bigint to number
|
|
52
|
-
if (
|
|
45
|
+
if (DataType.isInt(type) && type.bitWidth >= 64) {
|
|
53
46
|
return v => v == null ? v : Number(v);
|
|
54
47
|
}
|
|
55
48
|
|
|
56
49
|
// map decimal to number
|
|
57
|
-
if (
|
|
50
|
+
if (DataType.isDecimal(type)) {
|
|
58
51
|
const scale = 1 / Math.pow(10, type.scale);
|
|
59
52
|
return v => v == null ? v : decimalToNumber(v, scale);
|
|
60
53
|
}
|
|
@@ -69,15 +62,14 @@ export function convertArrowValue(type) {
|
|
|
69
62
|
* Large integers (BigInt) are converted to Float64 numbers.
|
|
70
63
|
* Fixed-point decimal values are converted to Float64 numbers.
|
|
71
64
|
* Otherwise, the default Arrow values are used.
|
|
72
|
-
* @param {
|
|
65
|
+
* @param {Vector} column An Apache Arrow column
|
|
73
66
|
* @returns an array of values
|
|
74
67
|
*/
|
|
75
68
|
export function convertArrowColumn(column) {
|
|
76
69
|
const { type } = column;
|
|
77
|
-
const { typeId } = type;
|
|
78
70
|
|
|
79
71
|
// map timestamp numbers to date objects
|
|
80
|
-
if (
|
|
72
|
+
if (DataType.isTimestamp(type)) {
|
|
81
73
|
const size = column.length;
|
|
82
74
|
const array = new Array(size);
|
|
83
75
|
for (let row = 0; row < size; ++row) {
|
|
@@ -88,28 +80,33 @@ export function convertArrowColumn(column) {
|
|
|
88
80
|
}
|
|
89
81
|
|
|
90
82
|
// map bigint to number
|
|
91
|
-
if (
|
|
83
|
+
if (DataType.isInt(type) && type.bitWidth >= 64) {
|
|
92
84
|
const size = column.length;
|
|
93
|
-
const array = new Float64Array(size);
|
|
85
|
+
const array = column.nullCount ? new Array(size) : new Float64Array(size);
|
|
94
86
|
for (let row = 0; row < size; ++row) {
|
|
95
87
|
const v = column.get(row);
|
|
96
|
-
array[row] = v == null ?
|
|
88
|
+
array[row] = v == null ? null : Number(v);
|
|
97
89
|
}
|
|
98
90
|
return array;
|
|
99
91
|
}
|
|
100
92
|
|
|
101
93
|
// map decimal to number
|
|
102
|
-
if (
|
|
94
|
+
if (DataType.isDecimal(type)) {
|
|
103
95
|
const scale = 1 / Math.pow(10, type.scale);
|
|
104
96
|
const size = column.length;
|
|
105
|
-
const array = new Float64Array(size);
|
|
97
|
+
const array = column.nullCount ? new Array(size) : new Float64Array(size);
|
|
106
98
|
for (let row = 0; row < size; ++row) {
|
|
107
99
|
const v = column.get(row);
|
|
108
|
-
array[row] = v == null ?
|
|
100
|
+
array[row] = v == null ? null : decimalToNumber(v, scale);
|
|
109
101
|
}
|
|
110
102
|
return array;
|
|
111
103
|
}
|
|
112
104
|
|
|
105
|
+
// if there are null values, use a standard array
|
|
106
|
+
if (column.nullCount) {
|
|
107
|
+
return Array.from(column);
|
|
108
|
+
}
|
|
109
|
+
|
|
113
110
|
// otherwise use Arrow JS defaults
|
|
114
111
|
return column.toArray();
|
|
115
112
|
}
|
|
@@ -124,10 +121,10 @@ const BASE32 = Array.from(
|
|
|
124
121
|
/**
|
|
125
122
|
* Convert a fixed point decimal value to a double precision number.
|
|
126
123
|
* Note: if the value is sufficiently large the conversion may be lossy!
|
|
127
|
-
* @param {Uint32Array} v a fixed decimal value
|
|
124
|
+
* @param {Uint32Array & { signed: boolean }} v a fixed decimal value
|
|
128
125
|
* @param {number} scale a scale factor, corresponding to the
|
|
129
126
|
* number of fractional decimal digits in the fixed point value
|
|
130
|
-
* @returns the resulting number
|
|
127
|
+
* @returns {number} the resulting number
|
|
131
128
|
*/
|
|
132
129
|
function decimalToNumber(v, scale) {
|
|
133
130
|
const n = v.length;
|
|
@@ -0,0 +1,538 @@
|
|
|
1
|
+
import { Query, agg, sql } from '@uwdata/mosaic-sql';
|
|
2
|
+
import { MosaicClient } from '../MosaicClient.js';
|
|
3
|
+
|
|
4
|
+
export const NO_INDEX = { from: NaN };
|
|
5
|
+
|
|
6
|
+
/**
 * Determine data cube index columns for a given Mosaic client.
 * Every entry in the select list of the client's query is either
 * rewritten as an aggregate over pre-aggregated data, or recorded as a
 * grouping dimension if its output name is a group by column.
 * @param {MosaicClient} client The Mosaic client.
 * @returns An object `{ from, dims, aggr, aux }` with the column data
 *  needed to generate data cube index columns, null if a select entry
 *  is neither a supported aggregate nor a group by column, or NO_INDEX
 *  if the client is not indexable.
 */
export function indexColumns(client) {
  if (!client.filterIndexable) return NO_INDEX;

  const query = client.query();
  const from = getBaseTable(query);
  if (!(typeof from === 'string' && query.groupby)) return NO_INDEX;

  const groupCols = new Set(query.groupby().map(c => c.column));

  const aggr = []; // output aggregate columns
  const dims = []; // grouping dimension columns
  const aux = {};  // auxiliary columns needed by the aggregates

  // Aggregates that drop the original aggregate operation in favor of
  // tracking auxiliary sufficient statistics (variance, covariance,
  // correlation, and regression statistics).
  const derived = new Map([
    ['VARIANCE', args => varianceExpr(aux, args[0], from)],
    ['VAR_SAMP', args => varianceExpr(aux, args[0], from)],
    ['VAR_POP', args => varianceExpr(aux, args[0], from, false)],
    ['STDDEV', args => agg`SQRT(${varianceExpr(aux, args[0], from)})`],
    ['STDDEV_SAMP', args => agg`SQRT(${varianceExpr(aux, args[0], from)})`],
    ['STDDEV_POP', args => agg`SQRT(${varianceExpr(aux, args[0], from, false)})`],
    ['COVAR_SAMP', args => covarianceExpr(aux, args, from)],
    ['COVAR_POP', args => covarianceExpr(aux, args, from, false)],
    ['CORR', args => corrExpr(aux, args, from)],
    ['REGR_COUNT', args => agg`${regrCountExpr(aux, args)}::DOUBLE`],
    ['REGR_AVGX', args => regrAvgXExpr(aux, args)],
    ['REGR_AVGY', args => regrAvgYExpr(aux, args)],
    ['REGR_SYY', args => regrVarExpr(aux, 0, args, from)],
    ['REGR_SXX', args => regrVarExpr(aux, 1, args, from)],
    ['REGR_SXY', args => covarianceExpr(aux, args, from, null)],
    ['REGR_SLOPE', args => regrSlopeExpr(aux, args, from)],
    ['REGR_INTERCEPT', args => regrInterceptExpr(aux, args, from)],
    ['REGR_R2', args => agg`(${corrExpr(aux, args, from)}) ** 2`]
  ]);

  // Aggregates that commute directly: the same operation is re-applied
  // to the pre-aggregated output column.
  const commutative = new Set([
    'MAX', 'MIN', 'BIT_AND', 'BIT_OR', 'BIT_XOR',
    'BOOL_AND', 'BOOL_OR', 'PRODUCT'
  ]);

  for (const { as, expr: { aggregate, args } } of query.select()) {
    const op = aggregate?.toUpperCase?.();
    const derive = derived.get(op);

    if (op === 'COUNT' || op === 'SUM') {
      // TODO: revisit this DOUBLE cast in the future
      // for now, this sidesteps client-side conversions
      // of bignum and fixed decimal types to JS numbers
      aggr.push({ [as]: agg`SUM("${as}")::DOUBLE` });
    } else if (op === 'AVG') {
      aggr.push({ [as]: avgExpr(aux, as, args[0]) });
    } else if (op === 'ARG_MAX') {
      aggr.push({ [as]: argmaxExpr(aux, as, args) });
    } else if (op === 'ARG_MIN') {
      aggr.push({ [as]: argminExpr(aux, as, args) });
    } else if (derive) {
      // register the output name in aux before the statistics columns
      aux[as] = null;
      aggr.push({ [as]: derive(args) });
    } else if (commutative.has(op)) {
      aggr.push({ [as]: agg`${op}("${as}")` });
    } else if (groupCols.has(as)) {
      // not a recognized aggregate: accept only grouping dimensions
      dims.push(as);
    } else {
      return null; // unsupported aggregate
    }
  }

  return { from, dims, aggr, aux };
}
|
|
137
|
+
|
|
138
|
+
/**
 * Build the output column name for an auxiliary column (for example,
 * a sufficient statistic) within a data cube index. The name has the
 * form `__type__`, or `__type_col1_col2__` when columns are given.
 * @param {string} type The operation type.
 * @param {...any} args The input column arguments.
 * @returns {string} A sanitized auxiliary column name.
 */
function auxName(type, ...args) {
  if (args.length === 0) return `__${type}__`;
  const cols = args.map(sanitize).join('_');
  return `__${type}_${cols}__`;
}
|
|
149
|
+
|
|
150
|
+
/**
 * Turn a table column reference into a "safe" string for use as part
 * of a derived column name: double quotes are removed and spaces are
 * replaced with underscores.
 * @param {*} col The source data table column. This may be a string,
 *  column reference, SQL expression, or other string-coercible value.
 * @returns {string} The sanitized column name.
 */
function sanitize(col) {
  const text = `${col}`;
  const unquoted = text.replaceAll('"', '');
  return unquoted.replaceAll(' ', '_');
}
|
|
162
|
+
|
|
163
|
+
/**
 * Find the single base (source) table that a query draws from.
 * Set operations and queries with subqueries are resolved recursively
 * through their subqueries.
 * @param {Query} query The input query.
 * @returns {string | undefined | NaN} the base table name, or
 *  `undefined` if there is no source table, or `NaN` if the
 *  query operates over multiple source tables.
 */
function getBaseTable(query) {
  const { subqueries } = query;

  // plain select query: read the table from the from clause
  if (query.select) {
    const sources = query.from();
    // @ts-ignore
    if (!sources.length) return undefined;
    if (subqueries.length === 0) return sources[0].from.table;
  }

  // set operations / subqueries: all subqueries that have a
  // source table must agree with the first subquery
  const base = getBaseTable(subqueries[0]);
  const conflict = subqueries.slice(1).some(sub => {
    const table = getBaseTable(sub);
    return table !== undefined && table !== base;
  });
  return conflict ? NaN : base;
}
|
|
190
|
+
|
|
191
|
+
/**
 * Build an expression that computes counts over data partitions.
 * As a side effect, a column is added to the input *aux* object
 * to track the count of non-null values per-partition.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {any} arg Source data table column. This value may be a string,
 *  column reference, SQL expression, or other string-coercible value.
 * @returns An aggregate expression for calculating counts over
 *  pre-aggregated data partitions, annotated with the auxiliary
 *  column name as its `name` property.
 */
function countExpr(aux, arg) {
  const name = auxName('count', arg);
  aux[name] = agg`COUNT(${arg})`;
  const total = agg`SUM(${name})`;
  return total.annotate({ name });
}
|
|
207
|
+
|
|
208
|
+
/**
 * Build an expression that computes averages over data partitions, as
 * the count-weighted sum of partition averages divided by the total count.
 * As a side effect, a column is added to the input *aux* object
 * to track the count of non-null values per-partition.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {string} as The output column for the original aggregate.
 * @param {any} arg Source data table column. This value may be a string,
 *  column reference, SQL expression, or other string-coercible value.
 * @returns An aggregate expression for calculating averages over
 *  pre-aggregated data partitions.
 */
function avgExpr(aux, as, arg) {
  const count = countExpr(aux, arg);
  const weight = count.name; // per-partition count column
  return agg`(SUM("${as}" * ${weight}) / ${count})`;
}
|
|
224
|
+
|
|
225
|
+
/**
 * Build a scalar subquery that computes a global average over the
 * source table. The result can be used to mean-center data.
 * @param {*} x Source data table column.
 * @param {string} from The source data table name.
 * @returns A scalar aggregate query
 */
function avg(x, from) {
  const subquery = sql`(SELECT AVG(${x}) FROM "${from}")`;
  return subquery;
}
|
|
235
|
+
|
|
236
|
+
/**
 * Build an expression that computes argmax over data partitions.
 * As a side effect, a column is added to the input *aux* object
 * to track a maximum value per-partition.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {string} as The output column for the original aggregate.
 * @param {any[]} args Source data table columns. The entries may be strings,
 *  column references, SQL expressions, or other string-coercible values.
 *  Only the second entry (the value being maximized) is used.
 * @returns An aggregate expression for calculating argmax over
 *  pre-aggregated data partitions.
 */
function argmaxExpr(aux, as, args) {
  const [, y] = args;
  const maxCol = auxName('max', y);
  aux[maxCol] = agg`MAX(${y})`;
  return agg`ARG_MAX("${as}", ${maxCol})`;
}
|
|
253
|
+
|
|
254
|
+
/**
 * Build an expression that computes argmin over data partitions.
 * As a side effect, a column is added to the input *aux* object
 * to track a minimum value per-partition.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {string} as The output column for the original aggregate.
 * @param {any[]} args Source data table columns. The entries may be strings,
 *  column references, SQL expressions, or other string-coercible values.
 *  Only the second entry (the value being minimized) is used.
 * @returns An aggregate expression for calculating argmin over
 *  pre-aggregated data partitions.
 */
function argminExpr(aux, as, args) {
  const [, y] = args;
  const minCol = auxName('min', y);
  aux[minCol] = agg`MIN(${y})`;
  return agg`ARG_MIN("${as}", ${minCol})`;
}
|
|
271
|
+
|
|
272
|
+
/**
 * Build an expression that computes variance over data partitions.
 * The "textbook" definition of variance (E[X^2] - E[X]^2) is used, but
 * on mean-centered data to reduce floating point error. Three sufficient
 * statistics are needed: the count of non-null values, the residual sum
 * of squares, and the sum of residual (mean-centered) values. As a side
 * effect, columns for these statistics are added to the input *aux* object.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {*} x The source data table column. This may be a string,
 *  column reference, SQL expression, or other string-coercible value.
 * @param {string} from The source data table name.
 * @param {boolean} [correction=true] A flag for whether a Bessel
 *  correction should be applied to compute the sample variance
 *  rather than the population variance.
 * @returns An aggregate expression for calculating variance over
 *  pre-aggregated data partitions.
 */
function varianceExpr(aux, x, from, correction = true) {
  const count = countExpr(aux, x);
  const rssq = auxName('rssq', x); // residual sum of squares
  const rsum = auxName('rsum', x); // residual sum
  const residual = sql`${x} - ${avg(x, from)}`;
  aux[rssq] = agg`SUM((${residual}) ** 2)`;
  aux[rsum] = agg`SUM(${residual})`;

  // Bessel correction: divide by (n - 1) instead of n
  let adj = '';
  if (correction) adj = ` - 1`;

  return agg`(SUM(${rssq}) - (SUM(${rsum}) ** 2 / ${count})) / (${count}${adj})`;
}
|
|
301
|
+
|
|
302
|
+
/**
 * Build an expression that computes covariance over data partitions.
 * Mean-centered data is used to reduce floating point error. Four
 * sufficient statistics are needed: the count of non-null value pairs,
 * the sum of residual products, and the residual sums (of mean-centered
 * values) for x and y. As a side effect, columns for these statistics
 * are added to the input *aux* object.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {any[]} args Source data table columns. The entries may be strings,
 *  column references, SQL expressions, or other string-coercible values.
 * @param {string} from The source data table name.
 * @param {boolean|null} [correction=true] A flag for whether a Bessel
 *  correction should be applied to compute the sample covariance rather
 *  than the population covariance. If null, an expression for the
 *  unnormalized covariance (no division by sample count) is returned.
 * @returns An aggregate expression for calculating covariance over
 *  pre-aggregated data partitions.
 */
function covarianceExpr(aux, args, from, correction = true) {
  const count = regrCountExpr(aux, args);
  const sumXY = regrSumXYExpr(aux, args, from);
  const sumX = regrSumExpr(aux, 1, args, from);
  const sumY = regrSumExpr(aux, 0, args, from);

  let divisor;
  if (correction === null) {
    divisor = ''; // do not divide by count
  } else if (correction) {
    divisor = ` / (${count} - 1)`; // Bessel correction (sample)
  } else {
    divisor = ` / ${count}`; // no correction (population)
  }

  return agg`(${sumXY} - ${sumX} * ${sumY} / ${count})${divisor}`;
}
|
|
331
|
+
|
|
332
|
+
/**
 * Build an expression that computes Pearson product-moment correlation
 * coefficients over data partitions. Mean-centered data is used to
 * reduce floating point error. Six sufficient statistics are needed:
 * the count of non-null value pairs, the sum of residual products, and
 * both residual sums and sums of squares for x and y. As a side effect,
 * columns for these statistics are added to the input *aux* object.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {any[]} args Source data table columns. The entries may be strings,
 *  column references, SQL expressions, or other string-coercible values.
 * @param {string} from The source data table name.
 * @returns An aggregate expression for calculating correlation over
 *  pre-aggregated data partitions.
 */
function corrExpr(aux, args, from) {
  const count = regrCountExpr(aux, args);
  const sumXY = regrSumXYExpr(aux, args, from);
  const sumXX = regrSumSqExpr(aux, 1, args, from);
  const sumYY = regrSumSqExpr(aux, 0, args, from);
  const sumX = regrSumExpr(aux, 1, args, from);
  const sumY = regrSumExpr(aux, 0, args, from);

  // unnormalized variances of x and y
  const varX = agg`(${sumXX} - (${sumX} ** 2) / ${count})`;
  const varY = agg`(${sumYY} - (${sumY} ** 2) / ${count})`;

  return agg`(${sumXY} - ${sumX} * ${sumY} / ${count}) / SQRT(${varX} * ${varY})`;
}
|
|
359
|
+
|
|
360
|
+
/**
 * Build an expression for the count of non-null (x, y) pairs. As a side
 * effect, a column is added to the input *aux* object to track the
 * partition-level count of non-null pairs.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {any[]} args Source data table columns, in (y, x) order. The
 *  entries may be strings, column references, SQL expressions, or other
 *  string-coercible values.
 * @returns An aggregate expression for calculating regression pair counts
 *  over pre-aggregated data partitions, annotated with the auxiliary
 *  column name as its `name` property.
 */
function regrCountExpr(aux, args) {
  const [y, x] = args;
  const name = auxName('count', y, x);
  aux[name] = agg`REGR_COUNT(${y}, ${x})`;
  const total = agg`SUM(${name})`;
  return total.annotate({ name });
}
|
|
376
|
+
|
|
377
|
+
/**
 * Build an expression that computes sums of residual values for use in
 * covariance and regression queries. Only values corresponding to non-null
 * (x, y) pairs are included. Mean-centered data is used to reduce floating
 * point error. As a side effect, a column for partition-level sums is
 * added to the input *aux* object.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {number} i An index (0 or 1) indicating which argument column
 *  to sum.
 * @param {any[]} args Source data table columns. The entries may be strings,
 *  column references, SQL expressions, or other string-coercible values.
 * @param {string} from The source data table name.
 * @returns An aggregate expression over pre-aggregated data partitions.
 */
function regrSumExpr(aux, i, args, from) {
  const value = args[i];
  const other = args[1 - i]; // the paired column, which must be non-null
  const name = auxName('rs', value);
  aux[name] = agg`SUM(${value} - ${avg(value, from)}) FILTER (${other} IS NOT NULL)`;
  return agg`SUM(${name})`;
}
|
|
398
|
+
|
|
399
|
+
/**
 * Build an expression that computes sums of squared residual values for
 * use in covariance and regression queries. Only values corresponding to
 * non-null (x, y) pairs are included. Mean-centered data is used to
 * reduce floating point error. As a side effect, a column for
 * partition-level sums is added to the input *aux* object.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {number} i An index (0 or 1) indicating which argument column
 *  to sum.
 * @param {any[]} args Source data table columns. The entries may be strings,
 *  column references, SQL expressions, or other string-coercible values.
 * @param {string} from The source data table name.
 * @returns An aggregate expression over pre-aggregated data partitions.
 */
function regrSumSqExpr(aux, i, args, from) {
  const value = args[i];
  const other = args[1 - i]; // the paired column, which must be non-null
  const name = auxName('rss', value);
  aux[name] = agg`SUM((${value} - ${avg(value, from)}) ** 2) FILTER (${other} IS NOT NULL)`;
  return agg`SUM(${name})`;
}
|
|
420
|
+
|
|
421
|
+
/**
 * Build an expression that computes sums of residual product values for
 * use in covariance and regression queries. Only values corresponding to
 * non-null (x, y) pairs are included. Mean-centered data is used to
 * reduce floating point error. As a side effect, a column for
 * partition-level sums is added to the input *aux* object.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {any[]} args Source data table columns, in (y, x) order. The
 *  entries may be strings, column references, SQL expressions, or other
 *  string-coercible values.
 * @param {string} from The source data table name.
 * @returns An aggregate expression over pre-aggregated data partitions.
 */
function regrSumXYExpr(aux, args, from) {
  const [y, x] = args;
  const name = auxName('sxy', y, x);
  aux[name] = agg`SUM((${x} - ${avg(x, from)}) * (${y} - ${avg(y, from)}))`;
  return agg`SUM(${name})`;
}
|
|
440
|
+
|
|
441
|
+
/**
 * Build an expression for the average x value in a regression context.
 * Only values corresponding to non-null (x, y) pairs are included. As a
 * side effect, columns are added to the input *aux* object to track both
 * the count of non-null pairs and partition-level averages.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {any[]} args Source data table columns, in (y, x) order. The
 *  entries may be strings, column references, SQL expressions, or other
 *  string-coercible values.
 * @returns An aggregate expression over pre-aggregated data partitions.
 */
function regrAvgXExpr(aux, args) {
  const [y, x] = args;
  const count = regrCountExpr(aux, args);
  const avgCol = auxName('avg', x, y);
  aux[avgCol] = agg`REGR_AVGX(${y}, ${x})`;
  const weight = count.name; // per-partition pair count column
  return agg`(SUM(${avgCol} * ${weight}) / ${count})`;
}
|
|
459
|
+
|
|
460
|
+
/**
 * Build an expression for the average y value in a regression context.
 * Only values corresponding to non-null (x, y) pairs are included. As a
 * side effect, columns are added to the input *aux* object to track both
 * the count of non-null pairs and partition-level averages.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {any[]} args Source data table columns, in (y, x) order. The
 *  entries may be strings, column references, SQL expressions, or other
 *  string-coercible values.
 * @returns An aggregate expression over pre-aggregated data partitions.
 */
function regrAvgYExpr(aux, args) {
  const [y, x] = args;
  const count = regrCountExpr(aux, args);
  const avgCol = auxName('avg', y, x);
  aux[avgCol] = agg`REGR_AVGY(${y}, ${x})`;
  const weight = count.name; // per-partition pair count column
  return agg`(SUM(${avgCol} * ${weight}) / ${count})`;
}
|
|
478
|
+
|
|
479
|
+
/**
 * Build an expression that computes variance over data partitions for
 * use in covariance and regression queries. Only values corresponding to
 * non-null (x, y) pairs are included. Mean-centered data is used to
 * reduce floating point error. As a side effect, columns for
 * partition-level count and sums are added to the input *aux* object.
 * The returned expression is not divided by the pair count.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {number} i The index of the argument to compute the variance for.
 * @param {any[]} args Source data table columns. The entries may be strings,
 *  column references, SQL expressions, or other string-coercible values.
 * @param {string} from The source data table name.
 * @returns An aggregate expression for calculating variance over
 *  pre-aggregated data partitions.
 */
function regrVarExpr(aux, i, args, from) {
  const count = regrCountExpr(aux, args);
  const residualSum = regrSumExpr(aux, i, args, from);
  const residualSumSq = regrSumSqExpr(aux, i, args, from);
  return agg`(${residualSumSq} - (${residualSum} ** 2 / ${count}))`;
}
|
|
500
|
+
|
|
501
|
+
/**
 * Build an expression that computes a regression slope, as the
 * (unnormalized) covariance divided by the (unnormalized) variance of
 * the x variable. As a side effect, columns for sufficient statistics
 * are added to the input *aux* object.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {any[]} args Source data table columns. The entries may be strings,
 *  column references, SQL expressions, or other string-coercible values.
 * @param {string} from The source data table name.
 * @returns An aggregate expression for calculating regression slopes over
 *  pre-aggregated data partitions.
 */
function regrSlopeExpr(aux, args, from) {
  const covariance = covarianceExpr(aux, args, from, null);
  const varianceX = regrVarExpr(aux, 1, args, from);
  return agg`(${covariance}) / ${varianceX}`;
}
|
|
519
|
+
|
|
520
|
+
/**
 * Build an expression that computes a regression intercept, derived
 * from the regression slope and the average x and y values. As a side
 * effect, columns for sufficient statistics are added to the input
 * *aux* object.
 * @param {object} aux An object for auxiliary columns (such as
 *  sufficient statistics) to include in the data cube aggregation.
 * @param {any[]} args Source data table columns. The entries may be strings,
 *  column references, SQL expressions, or other string-coercible values.
 * @param {string} from The source data table name.
 * @returns An aggregate expression for calculating regression intercepts over
 *  pre-aggregated data partitions.
 */
function regrInterceptExpr(aux, args, from) {
  const avgX = regrAvgXExpr(aux, args);
  const avgY = regrAvgYExpr(aux, args);
  const slope = regrSlopeExpr(aux, args, from);
  return agg`${avgY} - (${slope}) * ${avgX}`;
}
|