@datagrok/eda 1.1.2 → 1.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +19 -0
- package/LICENSE.txt +202 -0
- package/README.md +13 -1
- package/dist/221.js +2 -0
- package/dist/694.js +2 -0
- package/dist/729.js +2 -0
- package/dist/80.js +2 -0
- package/dist/package-test.js +2 -2
- package/dist/package.js +2 -2
- package/package.json +13 -9
- package/src/eda-tools.ts +185 -0
- package/src/{EDAui.ts → eda-ui.ts} +14 -1
- package/src/package.ts +77 -10
- package/src/stat-tools.ts +266 -0
- package/src/utils.ts +130 -3
- package/src/workers/tsne-worker.ts +20 -0
- package/src/workers/umap-worker.ts +9 -0
- package/src/EDAtools.ts +0 -46
- /package/src/{dataGenerators.ts → data-generators.ts} +0 -0
package/package.json
CHANGED
|
@@ -1,24 +1,28 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@datagrok/eda",
|
|
3
3
|
"friendlyName": "EDA",
|
|
4
|
-
"version": "1.1.
|
|
4
|
+
"version": "1.1.4",
|
|
5
5
|
"description": "Exploratory Data Analysis Tools",
|
|
6
6
|
"dependencies": {
|
|
7
|
-
"datagrok-
|
|
8
|
-
"
|
|
9
|
-
"
|
|
10
|
-
"@
|
|
11
|
-
"
|
|
7
|
+
"@datagrok-libraries/ml": "^6.3.39",
|
|
8
|
+
"@datagrok-libraries/tutorials": "^1.3.6",
|
|
9
|
+
"@datagrok-libraries/utils": "^4.1.4",
|
|
10
|
+
"@keckelt/tsne": "^1.0.2",
|
|
11
|
+
"cash-dom": "^8.1.1",
|
|
12
|
+
"datagrok-api": "^1.16.0",
|
|
13
|
+
"dayjs": "^1.11.9",
|
|
14
|
+
"jstat": "^1.9.6",
|
|
15
|
+
"umap-js": "^1.3.3"
|
|
12
16
|
},
|
|
13
17
|
"author": {
|
|
14
18
|
"name": "Viktor Makarichev",
|
|
15
19
|
"email": "vmakarichev@datagrok.ai"
|
|
16
20
|
},
|
|
17
21
|
"devDependencies": {
|
|
18
|
-
"webpack": "latest",
|
|
19
|
-
"webpack-cli": "latest",
|
|
20
22
|
"ts-loader": "latest",
|
|
21
|
-
"typescript": "latest"
|
|
23
|
+
"typescript": "latest",
|
|
24
|
+
"webpack": "latest",
|
|
25
|
+
"webpack-cli": "latest"
|
|
22
26
|
},
|
|
23
27
|
"scripts": {
|
|
24
28
|
"link-all": "npm link datagrok-api @datagrok-libraries/utils @datagrok-libraries/tutorials",
|
package/src/eda-tools.ts
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
// Exploratory data analysis (EDA) tools
|
|
2
|
+
|
|
3
|
+
import * as grok from 'datagrok-api/grok';
|
|
4
|
+
import * as ui from 'datagrok-api/ui';
|
|
5
|
+
import * as DG from 'datagrok-api/dg';
|
|
6
|
+
|
|
7
|
+
import {DimensionalityReducer} from '@datagrok-libraries/ml/src/reduce-dimensionality';
|
|
8
|
+
import {VectorMetricsNames} from '@datagrok-libraries/ml/src/typed-metrics';
|
|
9
|
+
|
|
10
|
+
import {_principalComponentAnalysisInWebWorker,
|
|
11
|
+
_partialLeastSquareRegressionInWebWorker} from '../wasm/EDAAPI';
|
|
12
|
+
|
|
13
|
+
import {checkWasmDimensionReducerInputs, checkUMAPinputs, checkTSNEinputs, checkSPEinputs,
|
|
14
|
+
getRowsOfNumericalColumnns} from './utils';
|
|
15
|
+
|
|
16
|
+
// Principal components analysis (PCA)
|
|
17
|
+
export async function computePCA(table: DG.DataFrame, features: DG.ColumnList, components: number,
|
|
18
|
+
center: boolean, scale: boolean): Promise<DG.DataFrame>
|
|
19
|
+
{
|
|
20
|
+
checkWasmDimensionReducerInputs(features, components);
|
|
21
|
+
|
|
22
|
+
const centerNum = center ? 1 : 0;
|
|
23
|
+
const scaleNum = scale ? 1 : 0;
|
|
24
|
+
|
|
25
|
+
return await _principalComponentAnalysisInWebWorker(table, features, components, centerNum, scaleNum);
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
// Partial least square regression (PLS)
|
|
29
|
+
export async function computePLS(table: DG.DataFrame, features: DG.ColumnList, predict: DG.Column, components: number): Promise<any>
|
|
30
|
+
{
|
|
31
|
+
// Inputs are checked in the same manner as in PCA, since the same computations are applied.
|
|
32
|
+
checkWasmDimensionReducerInputs(features, components);
|
|
33
|
+
|
|
34
|
+
return await _partialLeastSquareRegressionInWebWorker(table, features, predict, components);
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
// Uniform Manifold Approximation and Projection (UMAP)
|
|
38
|
+
export async function computeUMAP(features: DG.ColumnList, components: number, epochs: number,
|
|
39
|
+
neighbors: number, minDist: number, spread: number): Promise<DG.DataFrame>
|
|
40
|
+
{
|
|
41
|
+
// check inputs
|
|
42
|
+
checkUMAPinputs(features, components, epochs, neighbors, minDist, spread);
|
|
43
|
+
|
|
44
|
+
// get row-by-row data
|
|
45
|
+
const data = getRowsOfNumericalColumnns(features);
|
|
46
|
+
|
|
47
|
+
let workerOutput: any;
|
|
48
|
+
|
|
49
|
+
// UMAP in webworker
|
|
50
|
+
let promise = new Promise((resolve, reject) => {
|
|
51
|
+
const worker = new Worker(new URL('workers/umap-worker.ts', import.meta.url));
|
|
52
|
+
|
|
53
|
+
worker.postMessage({
|
|
54
|
+
data: data,
|
|
55
|
+
options: {
|
|
56
|
+
nComponents: components,
|
|
57
|
+
nEpochs: epochs,
|
|
58
|
+
nNeighbors: neighbors,
|
|
59
|
+
minDist: minDist,
|
|
60
|
+
spread: spread
|
|
61
|
+
}});
|
|
62
|
+
|
|
63
|
+
worker.onmessage = function(e) {
|
|
64
|
+
worker.terminate();
|
|
65
|
+
resolve(e.data.embeddings);
|
|
66
|
+
}});
|
|
67
|
+
|
|
68
|
+
await promise.then(
|
|
69
|
+
result => { workerOutput = result; },
|
|
70
|
+
error => { throw new Error ('applying UMAP fails.'); }
|
|
71
|
+
);
|
|
72
|
+
|
|
73
|
+
const embeddings = workerOutput as number[][];
|
|
74
|
+
const rowCount = embeddings.length;
|
|
75
|
+
const range = [...Array(components).keys()];
|
|
76
|
+
|
|
77
|
+
// Create output
|
|
78
|
+
|
|
79
|
+
// columns data
|
|
80
|
+
const umapColumnsData = range.map(_ => new Float32Array(rowCount));
|
|
81
|
+
|
|
82
|
+
// perform transposition
|
|
83
|
+
for (let i = 0; i < rowCount; ++i)
|
|
84
|
+
for (let j = 0; j < components; ++j)
|
|
85
|
+
umapColumnsData[j][i] = embeddings[i][j];
|
|
86
|
+
|
|
87
|
+
return DG.DataFrame.fromColumns(range.map(i =>
|
|
88
|
+
DG.Column.fromFloat32Array('UMAP' + i.toString(), umapColumnsData[i])
|
|
89
|
+
));
|
|
90
|
+
} // computeUMAP
|
|
91
|
+
|
|
92
|
+
// t-distributed stochastic neighbor embedding (t-SNE)
|
|
93
|
+
export async function computeTSNE(features: DG.ColumnList, components: number,
|
|
94
|
+
learningRate: number, perplexity: number, iterations: number): Promise<DG.DataFrame>
|
|
95
|
+
{
|
|
96
|
+
// check inputs
|
|
97
|
+
checkTSNEinputs(features, components, learningRate, perplexity, iterations);
|
|
98
|
+
|
|
99
|
+
// get row-by-row data
|
|
100
|
+
const data = getRowsOfNumericalColumnns(features);
|
|
101
|
+
|
|
102
|
+
let workerOutput: any;
|
|
103
|
+
|
|
104
|
+
// t-SNE in webworker
|
|
105
|
+
let promise = new Promise((resolve, reject) => {
|
|
106
|
+
const worker = new Worker(new URL('workers/tsne-worker.ts', import.meta.url));
|
|
107
|
+
|
|
108
|
+
worker.postMessage({
|
|
109
|
+
data: data,
|
|
110
|
+
options: {
|
|
111
|
+
learningRate: learningRate,
|
|
112
|
+
perplexity: perplexity,
|
|
113
|
+
components: components,
|
|
114
|
+
iterations: iterations
|
|
115
|
+
}});
|
|
116
|
+
|
|
117
|
+
worker.onmessage = function(e) {
|
|
118
|
+
worker.terminate();
|
|
119
|
+
resolve(e.data.embeddings);
|
|
120
|
+
}});
|
|
121
|
+
|
|
122
|
+
await promise.then(
|
|
123
|
+
result => { workerOutput = result; },
|
|
124
|
+
error => { throw new Error ('applying t-SNE fails.'); }
|
|
125
|
+
);
|
|
126
|
+
|
|
127
|
+
const embeddings = workerOutput as any[];
|
|
128
|
+
|
|
129
|
+
const rowCount = embeddings.length;
|
|
130
|
+
const range = [...Array(components).keys()];
|
|
131
|
+
|
|
132
|
+
// Create output
|
|
133
|
+
|
|
134
|
+
// columns data
|
|
135
|
+
const umapColumnsData = range.map(_ => new Float32Array(rowCount));
|
|
136
|
+
|
|
137
|
+
// perform transposition
|
|
138
|
+
for (let i = 0; i < rowCount; ++i)
|
|
139
|
+
for (let j = 0; j < components; ++j)
|
|
140
|
+
umapColumnsData[j][i] = embeddings[i][j];
|
|
141
|
+
|
|
142
|
+
return DG.DataFrame.fromColumns(range.map(i =>
|
|
143
|
+
DG.Column.fromFloat32Array('tSNE' + i.toString(), umapColumnsData[i])
|
|
144
|
+
));
|
|
145
|
+
} // computeTSNE
|
|
146
|
+
|
|
147
|
+
// Stochastic proximity embedding (SPE)
|
|
148
|
+
export async function computeSPE(features: DG.ColumnList, dimension: number,
|
|
149
|
+
steps: number, cycles: number, cutoff: number, lambda: number): Promise<DG.DataFrame>
|
|
150
|
+
{
|
|
151
|
+
// check inputs
|
|
152
|
+
checkSPEinputs(features, dimension, steps, cycles, cutoff, lambda);
|
|
153
|
+
|
|
154
|
+
// get row-by-row data
|
|
155
|
+
const data = getRowsOfNumericalColumnns(features);
|
|
156
|
+
|
|
157
|
+
// SPE reducer
|
|
158
|
+
const spe = new DimensionalityReducer(data, 'SPE', VectorMetricsNames.Euclidean, {
|
|
159
|
+
dimension: dimension,
|
|
160
|
+
steps: steps,
|
|
161
|
+
cycles: cycles,
|
|
162
|
+
cutoff: cutoff,
|
|
163
|
+
lambda: lambda
|
|
164
|
+
});
|
|
165
|
+
|
|
166
|
+
// compute embeddings
|
|
167
|
+
const embeddings = (await spe.transform(false, false)).embedding;
|
|
168
|
+
|
|
169
|
+
const rowCount = embeddings.length;
|
|
170
|
+
const range = [...Array(dimension).keys()];
|
|
171
|
+
|
|
172
|
+
// Create output
|
|
173
|
+
|
|
174
|
+
// columns data
|
|
175
|
+
const umapColumnsData = range.map(_ => new Float32Array(rowCount));
|
|
176
|
+
|
|
177
|
+
// perform transposition
|
|
178
|
+
for (let i = 0; i < rowCount; ++i)
|
|
179
|
+
for (let j = 0; j < dimension; ++j)
|
|
180
|
+
umapColumnsData[j][i] = embeddings[i][j];
|
|
181
|
+
|
|
182
|
+
return DG.DataFrame.fromColumns(range.map(i =>
|
|
183
|
+
DG.Column.fromFloat32Array('SPE' + i.toString(), umapColumnsData[i])
|
|
184
|
+
));
|
|
185
|
+
} // computeSPE
|
|
@@ -12,6 +12,12 @@ export function renamePCAcolumns(pcaTable: DG.DataFrame): DG.DataFrame {
|
|
|
12
12
|
return pcaTable;
|
|
13
13
|
}
|
|
14
14
|
|
|
15
|
+
// Adds prefix to each column name
|
|
16
|
+
export function addPrefixToEachColumnName(prefix: string, columns: DG.ColumnList): void {
|
|
17
|
+
for (const col of columns.toList())
|
|
18
|
+
col.name = prefix + col.name;
|
|
19
|
+
}
|
|
20
|
+
|
|
15
21
|
// Predicted vs Reference scatter plot
|
|
16
22
|
export function predictedVersusReferenceScatterPlot(samplesNames: DG.Column, reference: DG.Column, prediction: DG.Column): DG.Viewer {
|
|
17
23
|
prediction.name = reference.name + '(predicted)';
|
|
@@ -106,7 +112,7 @@ export function loadingScatterPlot(features: DG.ColumnList, xLoadings: Array<DG.
|
|
|
106
112
|
// Add PLS visualization
|
|
107
113
|
export function addPLSvisualization(table: DG.DataFrame, samplesNames: DG.Column, features: DG.ColumnList, predict: DG.Column, plsOutput: any): void {
|
|
108
114
|
|
|
109
|
-
|
|
115
|
+
const view = grok.shell.getTableView(table.name);
|
|
110
116
|
|
|
111
117
|
// 1. Predicted vs Reference scatter plot
|
|
112
118
|
view.addViewer(predictedVersusReferenceScatterPlot(samplesNames, predict, plsOutput[0]));
|
|
@@ -120,3 +126,10 @@ export function addPLSvisualization(table: DG.DataFrame, samplesNames: DG.Column
|
|
|
120
126
|
// 4. Scores Scatter Plot
|
|
121
127
|
view.addViewer(scoresScatterPlot(samplesNames, plsOutput[2], plsOutput[3]));
|
|
122
128
|
}
|
|
129
|
+
|
|
130
|
+
// Add one-way ANOVA results
|
|
131
|
+
export function addOneWayAnovaVizualization(table: DG.DataFrame, factors: DG.Column, values: DG.Column, anova: DG.DataFrame) {
|
|
132
|
+
const view = grok.shell.getTableView(table.name);
|
|
133
|
+
view.addViewer(DG.Viewer.boxPlot(DG.DataFrame.fromColumns([factors, values])));
|
|
134
|
+
view.addViewer(DG.Viewer.grid(anova));
|
|
135
|
+
}
|
package/src/package.ts
CHANGED
|
@@ -6,13 +6,15 @@ import * as DG from 'datagrok-api/dg';
|
|
|
6
6
|
import {DemoScript} from '@datagrok-libraries/tutorials/src/demo-script';
|
|
7
7
|
|
|
8
8
|
import {_initEDAAPI} from '../wasm/EDAAPI';
|
|
9
|
-
import {computePCA, computePLS} from './
|
|
10
|
-
import {
|
|
11
|
-
scoresScatterPlot, predictedVersusReferenceScatterPlot} from './
|
|
12
|
-
import {carsDataframe, testDataForBinaryClassification} from './
|
|
9
|
+
import {computePCA, computePLS, computeUMAP, computeTSNE, computeSPE} from './eda-tools';
|
|
10
|
+
import {addPrefixToEachColumnName, addPLSvisualization, regressionCoefficientsBarChart,
|
|
11
|
+
scoresScatterPlot, predictedVersusReferenceScatterPlot, addOneWayAnovaVizualization} from './eda-ui';
|
|
12
|
+
import {carsDataframe, testDataForBinaryClassification} from './data-generators';
|
|
13
13
|
import {LINEAR, RBF, POLYNOMIAL, SIGMOID,
|
|
14
14
|
getTrainedModel, getPrediction, showTrainReport, getPackedModel} from './svm';
|
|
15
15
|
|
|
16
|
+
import {oneWayAnova} from './stat-tools';
|
|
17
|
+
|
|
16
18
|
export const _package = new DG.Package();
|
|
17
19
|
|
|
18
20
|
//name: info
|
|
@@ -25,22 +27,74 @@ export async function init(): Promise<void> {
|
|
|
25
27
|
await _initEDAAPI();
|
|
26
28
|
}
|
|
27
29
|
|
|
28
|
-
//top-menu:
|
|
30
|
+
//top-menu: ML | Dimensionality Reduction | PCA...
|
|
29
31
|
//name: PCA
|
|
30
32
|
//description: Principal component analysis (PCA).
|
|
31
33
|
//input: dataframe table
|
|
32
34
|
//input: column_list features {type: numerical}
|
|
33
|
-
//input: int components = 2
|
|
34
|
-
//input: bool center =
|
|
35
|
-
//input: bool scale =
|
|
35
|
+
//input: int components = 2 {caption: Components} [Number of components.]
|
|
36
|
+
//input: bool center = false [Indicating whether the variables should be shifted to be zero centered.]
|
|
37
|
+
//input: bool scale = false [Indicating whether the variables should be scaled to have unit variance.]
|
|
36
38
|
//output: dataframe result {action:join(table)}
|
|
37
39
|
export async function PCA(table: DG.DataFrame, features: DG.ColumnList, components: number,
|
|
38
40
|
center: boolean, scale: boolean): Promise<DG.DataFrame>
|
|
39
41
|
{
|
|
40
|
-
|
|
42
|
+
const pcaTable = await computePCA(table, features, components, center, scale);
|
|
43
|
+
addPrefixToEachColumnName('PCA', pcaTable.columns);
|
|
44
|
+
return pcaTable;
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
//top-menu: ML | Dimensionality Reduction | UMAP...
|
|
48
|
+
//name: UMAP
|
|
49
|
+
//description: Uniform Manifold Approximation and Projection (UMAP).
|
|
50
|
+
//input: dataframe table {category: Data}
|
|
51
|
+
//input: column_list features {type: numerical; category: Data}
|
|
52
|
+
//input: int components = 2 {caption: Components; category: Hyperparameters} [The number of components (dimensions) to project the data to.]
|
|
53
|
+
//input: int epochs = 100 {caption: Epochs; category: Hyperparameters} [The number of epochs to optimize embeddings.]
|
|
54
|
+
//input: int neighbors = 15 {caption: Neighbors; category: Hyperparameters} [The number of nearest neighbors to construct the fuzzy manifold.]
|
|
55
|
+
//input: double minDist = 0.1 {caption: Minimum distance; category: Hyperparameters} [The effective minimum distance between embedded points.]
|
|
56
|
+
//input: double spread = 1.0 {caption: Spread; category: Hyperparameters} [The effective scale of embedded points.]
|
|
57
|
+
//output: dataframe result {action:join(table)}
|
|
58
|
+
export async function UMAP(table: DG.DataFrame, features: DG.ColumnList, components: number,
|
|
59
|
+
epochs: number, neighbors: number, minDist: number, spread: number): Promise<DG.DataFrame>
|
|
60
|
+
{
|
|
61
|
+
return await computeUMAP(features, components, epochs, neighbors, minDist, spread);
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
//top-menu: ML | Dimensionality Reduction | t-SNE...
|
|
65
|
+
//name: t-SNE
|
|
66
|
+
//description: t-distributed stochastic neighbor embedding (t-SNE).
|
|
67
|
+
//input: dataframe table {category: Data}
|
|
68
|
+
//input: column_list features {type: numerical; category: Data}
|
|
69
|
+
//input: int components = 2 {caption: Components; category: Hyperparameters} [Dimension of the embedded space.]
|
|
70
|
+
//input: double learningRate = 10 {caption: Learning rate; category: Hyperparameters} [Optimization tuning parameter. Should be in the range 10...1000.]
|
|
71
|
+
//input: int perplexity = 30 {caption: Perplexity; category: Hyperparameters} [The number of nearest neighbors. Should be less than the number of samples.]
|
|
72
|
+
//input: int iterations = 500 {caption: Iterations; category: Hyperparameters} [Maximum number of iterations for the optimization. Should be at least 250.]
|
|
73
|
+
//output: dataframe result {action:join(table)}
|
|
74
|
+
export async function tSNE(table: DG.DataFrame, features: DG.ColumnList, components: number,
|
|
75
|
+
learningRate: number, perplexity: number, iterations: number): Promise<DG.DataFrame>
|
|
76
|
+
{
|
|
77
|
+
return await computeTSNE(features, components, learningRate, perplexity, iterations);
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
//top-menu: ML | Dimensionality Reduction | SPE...
|
|
81
|
+
//name: SPE
|
|
82
|
+
//description: Stochastic proximity embedding (SPE).
|
|
83
|
+
//input: dataframe table {category: Data}
|
|
84
|
+
//input: column_list features {type: numerical; category: Data}
|
|
85
|
+
//input: int dimension = 2 {caption: Dimension; category: Hyperparameters} [Dimension of the embedded space.]
|
|
86
|
+
//input: int steps = 0 {caption: Steps; category: Hyperparameters} [Number of random selections of point pairs and distance computations between them.]
|
|
87
|
+
//input: int cycles = 1000000 {caption: Cycles; category: Hyperparameters} [Number of the method cycles.]
|
|
88
|
+
//input: double cutoff = 0.0 {caption: Cutoff; category: Hyperparameters} [Cutoff distance between points.]
|
|
89
|
+
//input: double lambda = 2.0 {caption: Learning rate; category: Hyperparameters} [Optimization tuning parameter.]
|
|
90
|
+
//output: dataframe result {action:join(table)}
|
|
91
|
+
export async function SPE(table: DG.DataFrame, features: DG.ColumnList, dimension: number,
|
|
92
|
+
steps: number, cycles: number, cutoff: number, lambda: number): Promise<DG.DataFrame>
|
|
93
|
+
{
|
|
94
|
+
return await computeSPE(features, dimension, steps, cycles, cutoff, lambda);
|
|
41
95
|
}
|
|
42
96
|
|
|
43
|
-
//top-menu: ML | Multivariate Analysis
|
|
97
|
+
//top-menu: ML | Analyze | Multivariate Analysis...
|
|
44
98
|
//name: Multivariate Analysis (PLS)
|
|
45
99
|
//description: Multidimensional data analysis using partial least squares (PLS) regression. It reduces the predictors to a smaller set of uncorrelated components and performs least squares regression on them.
|
|
46
100
|
//input: dataframe table
|
|
@@ -266,3 +320,16 @@ export async function trainSigmoidKernelSVM(df: DG.DataFrame, predict_column: st
|
|
|
266
320
|
export async function applySigmoidKernelSVM(df: DG.DataFrame, model: any): Promise<DG.DataFrame> {
|
|
267
321
|
return await getPrediction(df, model);
|
|
268
322
|
}
|
|
323
|
+
|
|
324
|
+
//top-menu: ML | Analysis of Variances (ANOVA)...
|
|
325
|
+
//name: One-way ANOVA
|
|
326
|
+
//description: One-way analysis of variances (ANOVA) determines whether the examined factor has a significant impact on the studied feature.
|
|
327
|
+
//input: dataframe table
|
|
328
|
+
//input: column factor {type: categorical}
|
|
329
|
+
//input: column feature {type: numerical}
|
|
330
|
+
//input: double significance = 0.05 [The significance level is a value from the interval (0, 1) specifying the criterion used for rejecting the null hypothesis.]
|
|
331
|
+
//input: bool validate = false [Indicates whether the normality of distribution and an equality of variances should be checked.]
|
|
332
|
+
export function anova(table: DG.DataFrame, factor: DG.Column, feature: DG.Column, significance: number, validate: boolean) {
|
|
333
|
+
const res = oneWayAnova(factor, feature, significance, validate);
|
|
334
|
+
addOneWayAnovaVizualization(table, factor, feature, res);
|
|
335
|
+
}
|
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
// Statistic tools
|
|
2
|
+
|
|
3
|
+
/* REFERENCES
|
|
4
|
+
|
|
5
|
+
[1] One-way analysis of variance, https://en.wikipedia.org/wiki/One-way_analysis_of_variance
|
|
6
|
+
|
|
7
|
+
[2] G.W. Heiman. Basic Statistics for the Behavioral Sciences, 6th ed. Wadsworth Publishing, 2010
|
|
8
|
+
|
|
9
|
+
[3] F-test of equality of variances, https://en.wikipedia.org/wiki/F-test_of_equality_of_variances
|
|
10
|
+
|
|
11
|
+
[4] S. McKillup. Statistics Explained, Cambridge University Press, 2005
|
|
12
|
+
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
import * as grok from 'datagrok-api/grok';
|
|
16
|
+
import * as ui from 'datagrok-api/ui';
|
|
17
|
+
import * as DG from 'datagrok-api/dg';
|
|
18
|
+
|
|
19
|
+
//@ts-ignore: no types
|
|
20
|
+
import * as jStat from 'jstat';
|
|
21
|
+
|
|
22
|
+
enum ERROR_MSG {
|
|
23
|
+
NON_EQUAL_FACTORS_VALUES_SIZE = 'non-equal sizes of factor and values arrays. INPUT ERROR.',
|
|
24
|
+
INCORRECT_SIGNIFICANCE_LEVEL = 'incorrect significance level. It must be from the interval (0, 1). INPUT ERROR.',
|
|
25
|
+
INCORRECT_SAMPLE_SIZE = 'incorrect size of sample. DATA FACTORIZAING ERROR.',
|
|
26
|
+
NON_EQUAL_VARIANCES = 'variances are not equal.',
|
|
27
|
+
NON_NORMAL_DISTRIB = 'non-normal distribution.',
|
|
28
|
+
UNSUPPORTED_COLUMN_TYPE = 'unsupported column type.',
|
|
29
|
+
INCORRECT_CATEGORIES_COL_TYPE = 'incorrect categories column type.',
|
|
30
|
+
ANOVA_FAILED_JUST_ONE_CAT = 'ANOVA filed: there should be at least 2 categories.'
|
|
31
|
+
};
|
|
32
|
+
|
|
33
|
+
type SampleData = {
|
|
34
|
+
sum: number,
|
|
35
|
+
sumOfSquares: number,
|
|
36
|
+
size: number,
|
|
37
|
+
};
|
|
38
|
+
|
|
39
|
+
/** One-way ANOVA computation results. The classic notations are used (see [2], p. 290). */
|
|
40
|
+
type OneWayAnova = {
|
|
41
|
+
/** sum of squares between groups, SSbn */
|
|
42
|
+
ssBn: number,
|
|
43
|
+
/** sum of squares within groups, SSnn */
|
|
44
|
+
ssWn: number,
|
|
45
|
+
/** total sum of squares, SStot */
|
|
46
|
+
ssTot: number,
|
|
47
|
+
/** degrees of freedom between groups, DFbn */
|
|
48
|
+
dfBn: number,
|
|
49
|
+
/** degrees of freedom within groups, DFwn */
|
|
50
|
+
dfWn: number,
|
|
51
|
+
/** total degrees of freedom, DFtot */
|
|
52
|
+
dfTot: number,
|
|
53
|
+
/** mean square between groups, MSbn */
|
|
54
|
+
msBn: number,
|
|
55
|
+
/** mean square within groups, MSwn */
|
|
56
|
+
msWn: number,
|
|
57
|
+
/** Fobt, value of F-statistics, Fstat */
|
|
58
|
+
fStat: number,
|
|
59
|
+
/** p-value corresponding to F-statistics, pValue */
|
|
60
|
+
pValue: number,
|
|
61
|
+
};
|
|
62
|
+
|
|
63
|
+
/** Categorical column */
|
|
64
|
+
type CatCol = DG.Column<DG.COLUMN_TYPE.STRING>;
|
|
65
|
+
|
|
66
|
+
/** Numerical column */
|
|
67
|
+
type NumCol = DG.Column<DG.COLUMN_TYPE.FLOAT> | DG.Column<DG.COLUMN_TYPE.INT>;
|
|
68
|
+
|
|
69
|
+
/** Create dataframe with one-way ANOVA results. */
|
|
70
|
+
export function getOneWayAnovaDF(anova: OneWayAnova, alpha: number, fCritical: number, hypothesis: string, testResult: string): DG.DataFrame {
|
|
71
|
+
return DG.DataFrame.fromColumns([
|
|
72
|
+
DG.Column.fromStrings('Source of variance', ['Between groups', 'Within groups', 'Total', '', hypothesis, '', testResult]),
|
|
73
|
+
DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, 'Sum of squares', [anova.ssBn, anova.ssWn, anova.ssTot, null, null, null, null]),
|
|
74
|
+
DG.Column.fromList(DG.COLUMN_TYPE.INT, 'Degrees of freedom', [anova.dfBn, anova.dfWn, anova.dfTot, null, null, null, null]),
|
|
75
|
+
DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, 'Mean square', [anova.msBn, anova.msWn, null, null, null, null, null]),
|
|
76
|
+
DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, 'F-statistics', [anova.fStat, null, null, null, null, null, null]),
|
|
77
|
+
DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, 'p-value', [anova.pValue, null, null, null, null, null, null]),
|
|
78
|
+
DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, `${alpha}-critical value`, [fCritical, null, null, null, null, null, null]),
|
|
79
|
+
]);
|
|
80
|
+
} // getOneWayAnovaDF
|
|
81
|
+
|
|
82
|
+
/** Check correctness of significance level. */
|
|
83
|
+
export function checkSignificanceLevel(alpha: number) {
|
|
84
|
+
if ((alpha <= 0) || (alpha >= 1))
|
|
85
|
+
throw new Error(ERROR_MSG.INCORRECT_SIGNIFICANCE_LEVEL);
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
/** Compute unbiased variance.*/
|
|
89
|
+
export function getVariance(data: SampleData): number {
|
|
90
|
+
// The applied formulas can be found in [4] (see p. 63)
|
|
91
|
+
const size = data.size;
|
|
92
|
+
|
|
93
|
+
if (size <= 0)
|
|
94
|
+
throw new Error(ERROR_MSG.INCORRECT_SAMPLE_SIZE);
|
|
95
|
+
|
|
96
|
+
if (size === 1)
|
|
97
|
+
return 0;
|
|
98
|
+
|
|
99
|
+
return (data.sumOfSquares - (data.sum) ** 2 / size) / (size - 1);
|
|
100
|
+
} // getVariance
|
|
101
|
+
|
|
102
|
+
/** Check equality of variances of 2 samples. F-test is performed.*/
|
|
103
|
+
function areVarsEqual(xData: SampleData, yData: SampleData, alpha: number = 0.05): boolean {
|
|
104
|
+
// The applied approach can be found in [3]
|
|
105
|
+
checkSignificanceLevel(alpha);
|
|
106
|
+
|
|
107
|
+
const xVar = getVariance(xData);
|
|
108
|
+
const yVar = getVariance(yData);
|
|
109
|
+
|
|
110
|
+
if (yVar === 0)
|
|
111
|
+
return (xVar === yVar);
|
|
112
|
+
|
|
113
|
+
const fStat = xVar / yVar;
|
|
114
|
+
const fCrit = jStat.centralF.inv(1 - alpha, xData.size - 1, yData.size - 1);
|
|
115
|
+
|
|
116
|
+
return (fStat < fCrit);
|
|
117
|
+
} // areVarsEqual
|
|
118
|
+
|
|
119
|
+
export class FactorizedData {
|
|
120
|
+
private isNormDistrib: boolean | undefined = undefined;
|
|
121
|
+
private categories: string[] = [];
|
|
122
|
+
private sums!: Float64Array;
|
|
123
|
+
private sumsOfSquares!: Float64Array;
|
|
124
|
+
private subSampleSizes!: Int32Array;
|
|
125
|
+
private size!: number;
|
|
126
|
+
private catCount!: number;
|
|
127
|
+
|
|
128
|
+
constructor(categories: CatCol, values: NumCol, checkNormality: boolean = false, alpha: number = 0.05) {
|
|
129
|
+
if (categories.type !== DG.COLUMN_TYPE.STRING)
|
|
130
|
+
throw new Error();
|
|
131
|
+
|
|
132
|
+
if (categories.length !== values.length)
|
|
133
|
+
throw new Error(ERROR_MSG.NON_EQUAL_FACTORS_VALUES_SIZE);
|
|
134
|
+
|
|
135
|
+
this.setStats(categories, values, checkNormality, alpha);
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
public isNormal(): boolean | undefined {
|
|
139
|
+
return true;
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
/** Check equality of variances of factorized data. */
|
|
143
|
+
public areVarsEqual(alpha: number = 0.05): boolean {
|
|
144
|
+
const K = this.catCount;
|
|
145
|
+
|
|
146
|
+
if (K === 1)
|
|
147
|
+
return true;
|
|
148
|
+
|
|
149
|
+
const first: SampleData = {sum: this.sums[0], sumOfSquares: this.sumsOfSquares[0], size: this.subSampleSizes[0]};
|
|
150
|
+
|
|
151
|
+
for (let i = 1; i < K; ++i)
|
|
152
|
+
if(!areVarsEqual(first, {sum: this.sums[i], sumOfSquares: this.sumsOfSquares[i], size: this.subSampleSizes[i]}, alpha))
|
|
153
|
+
return false;
|
|
154
|
+
|
|
155
|
+
return true;
|
|
156
|
+
} // areVarsEqual
|
|
157
|
+
|
|
158
|
+
/** Perform one-way ANOVA computations. */
|
|
159
|
+
public getOneWayAnova(): OneWayAnova {
|
|
160
|
+
// Further, notations and formulas from (see [2], p. 290) are used.
|
|
161
|
+
|
|
162
|
+
const K = this.catCount;
|
|
163
|
+
|
|
164
|
+
if (K === 1)
|
|
165
|
+
throw new Error(ERROR_MSG.ANOVA_FAILED_JUST_ONE_CAT);
|
|
166
|
+
|
|
167
|
+
let sum = 0;
|
|
168
|
+
let sumOfSquares = 0;
|
|
169
|
+
let N = this.size;
|
|
170
|
+
let buf = 0;
|
|
171
|
+
|
|
172
|
+
for (let i = 0; i < K; ++i) {
|
|
173
|
+
sum += this.sums[i];
|
|
174
|
+
sumOfSquares += this.sumsOfSquares[i];
|
|
175
|
+
buf += this.sums[i] ** 2 / this.subSampleSizes[i];
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
const ssTot = sumOfSquares - sum ** 2 / N;
|
|
179
|
+
const ssBn = buf - sum ** 2 / N;
|
|
180
|
+
const ssWn = ssTot - ssBn;
|
|
181
|
+
|
|
182
|
+
const dfBn = K - 1;
|
|
183
|
+
const dfWn = N - K;
|
|
184
|
+
const dfTot = N - 1;
|
|
185
|
+
|
|
186
|
+
const msBn = ssBn / dfBn;
|
|
187
|
+
const msWn = ssWn / dfWn;
|
|
188
|
+
|
|
189
|
+
const fStat = msBn / msWn;
|
|
190
|
+
|
|
191
|
+
return {
|
|
192
|
+
ssBn: ssBn,
|
|
193
|
+
ssWn: ssWn,
|
|
194
|
+
ssTot: ssTot,
|
|
195
|
+
dfBn: dfBn,
|
|
196
|
+
dfWn: dfWn,
|
|
197
|
+
dfTot: dfTot,
|
|
198
|
+
msBn: msBn,
|
|
199
|
+
msWn: msWn,
|
|
200
|
+
fStat: fStat,
|
|
201
|
+
pValue: 1 - jStat.centralF.cdf(fStat, dfBn, dfWn)
|
|
202
|
+
};
|
|
203
|
+
} // getOneWayAnova
|
|
204
|
+
|
|
205
|
+
/** Compute sum & sums of squares with respect to factor levels. */
|
|
206
|
+
private setStats(categories: CatCol, values: NumCol, checkNormality: boolean = false, alpha: number = 0.05): void {
|
|
207
|
+
// TODO: provide check normality feature
|
|
208
|
+
const type = values.type;
|
|
209
|
+
const size = values.length;
|
|
210
|
+
|
|
211
|
+
switch (type) {
|
|
212
|
+
case DG.COLUMN_TYPE.INT:
|
|
213
|
+
case DG.COLUMN_TYPE.FLOAT:
|
|
214
|
+
this.categories = categories.categories;
|
|
215
|
+
const catCount = this.categories.length;
|
|
216
|
+
this.catCount = catCount;
|
|
217
|
+
this.size = size;
|
|
218
|
+
|
|
219
|
+
const vals = values.getRawData();
|
|
220
|
+
const cats = categories.getRawData();
|
|
221
|
+
|
|
222
|
+
const sums = new Float64Array(catCount).fill(0);
|
|
223
|
+
const sumsOfSquares = new Float64Array(catCount).fill(0);
|
|
224
|
+
const subSampleSizes = new Int32Array(catCount).fill(0);
|
|
225
|
+
|
|
226
|
+
for (let i = 0; i < size; ++i) {
|
|
227
|
+
const c = cats[i];
|
|
228
|
+
sums[c] += vals[i];
|
|
229
|
+
sumsOfSquares[c] += vals[i] ** 2;
|
|
230
|
+
++subSampleSizes[c];
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
this.sums = sums;
|
|
234
|
+
this.sumsOfSquares = sumsOfSquares;
|
|
235
|
+
this.subSampleSizes = subSampleSizes;
|
|
236
|
+
|
|
237
|
+
break;
|
|
238
|
+
|
|
239
|
+
default:
|
|
240
|
+
throw new Error(ERROR_MSG.UNSUPPORTED_COLUMN_TYPE);
|
|
241
|
+
}
|
|
242
|
+
} // setStats
|
|
243
|
+
} // FactorizedData
|
|
244
|
+
|
|
245
|
+
/** Perform one-way analysis of variances. */
|
|
246
|
+
export function oneWayAnova(categores: CatCol, values: NumCol, alpha: number = 0.05, validate: boolean = false): DG.DataFrame {
|
|
247
|
+
checkSignificanceLevel(alpha);
|
|
248
|
+
|
|
249
|
+
const factorized = new FactorizedData(categores, values, validate, alpha);
|
|
250
|
+
|
|
251
|
+
if (validate) {
|
|
252
|
+
if(!factorized.areVarsEqual(alpha))
|
|
253
|
+
throw new Error(ERROR_MSG.NON_EQUAL_VARIANCES);
|
|
254
|
+
|
|
255
|
+
if (!factorized.isNormal())
|
|
256
|
+
throw new Error(ERROR_MSG.NON_NORMAL_DISTRIB);
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
const anova = factorized.getOneWayAnova();
|
|
260
|
+
const fCrit = jStat.centralF.inv(1 - alpha, anova.dfBn, anova.dfWn);
|
|
261
|
+
|
|
262
|
+
const hypothesis = `THE NULL HYPOTHESIS: the "${categores.name}" factor does not produce a significant difference in the "${values.name}" feature.`;
|
|
263
|
+
const testResult = `Test result: ${(anova.fStat > fCrit) ? 'REJECTED.' : 'FAILED TO REJECT.'}`;
|
|
264
|
+
|
|
265
|
+
return getOneWayAnovaDF(anova, alpha, fCrit, hypothesis, testResult);
|
|
266
|
+
} // oneWayAnova
|