@datagrok/eda 1.1.2 → 1.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,24 +1,28 @@
1
1
  {
2
2
  "name": "@datagrok/eda",
3
3
  "friendlyName": "EDA",
4
- "version": "1.1.2",
4
+ "version": "1.1.4",
5
5
  "description": "Exploratory Data Analysis Tools",
6
6
  "dependencies": {
7
- "datagrok-api": "latest",
8
- "cash-dom": "latest",
9
- "dayjs": "latest",
10
- "@datagrok-libraries/utils": "latest",
11
- "@datagrok-libraries/tutorials": "^1.3.6"
7
+ "@datagrok-libraries/ml": "^6.3.39",
8
+ "@datagrok-libraries/tutorials": "^1.3.6",
9
+ "@datagrok-libraries/utils": "^4.1.4",
10
+ "@keckelt/tsne": "^1.0.2",
11
+ "cash-dom": "^8.1.1",
12
+ "datagrok-api": "^1.16.0",
13
+ "dayjs": "^1.11.9",
14
+ "jstat": "^1.9.6",
15
+ "umap-js": "^1.3.3"
12
16
  },
13
17
  "author": {
14
18
  "name": "Viktor Makarichev",
15
19
  "email": "vmakarichev@datagrok.ai"
16
20
  },
17
21
  "devDependencies": {
18
- "webpack": "latest",
19
- "webpack-cli": "latest",
20
22
  "ts-loader": "latest",
21
- "typescript": "latest"
23
+ "typescript": "latest",
24
+ "webpack": "latest",
25
+ "webpack-cli": "latest"
22
26
  },
23
27
  "scripts": {
24
28
  "link-all": "npm link datagrok-api @datagrok-libraries/utils @datagrok-libraries/tutorials",
@@ -0,0 +1,185 @@
1
+ // Exploratory data analysis (EDA) tools
2
+
3
+ import * as grok from 'datagrok-api/grok';
4
+ import * as ui from 'datagrok-api/ui';
5
+ import * as DG from 'datagrok-api/dg';
6
+
7
+ import {DimensionalityReducer} from '@datagrok-libraries/ml/src/reduce-dimensionality';
8
+ import {VectorMetricsNames} from '@datagrok-libraries/ml/src/typed-metrics';
9
+
10
+ import {_principalComponentAnalysisInWebWorker,
11
+ _partialLeastSquareRegressionInWebWorker} from '../wasm/EDAAPI';
12
+
13
+ import {checkWasmDimensionReducerInputs, checkUMAPinputs, checkTSNEinputs, checkSPEinputs,
14
+ getRowsOfNumericalColumnns} from './utils';
15
+
16
/**
 * Principal components analysis (PCA).
 *
 * Checks the inputs and hands the computation over to the wasm web-worker wrapper.
 * The `center` and `scale` switches are forwarded as 1 (on) / 0 (off).
 */
export async function computePCA(table: DG.DataFrame, features: DG.ColumnList, components: number,
  center: boolean, scale: boolean): Promise<DG.DataFrame> {
  checkWasmDimensionReducerInputs(features, components);

  const asFlag = (enabled: boolean): number => (enabled ? 1 : 0);

  const pcaResult = await _principalComponentAnalysisInWebWorker(
    table, features, components, asFlag(center), asFlag(scale));

  return pcaResult;
}
27
+
28
/**
 * Partial least square regression (PLS).
 *
 * The inputs are validated with the same checker as PCA, then the computation is
 * delegated to the wasm web-worker wrapper, whose output is returned unchanged.
 */
export async function computePLS(table: DG.DataFrame, features: DG.ColumnList, predict: DG.Column, components: number): Promise<any> {
  checkWasmDimensionReducerInputs(features, components);

  const plsOutput = await _partialLeastSquareRegressionInWebWorker(table, features, predict, components);

  return plsOutput;
}
36
+
37
/**
 * Uniform Manifold Approximation and Projection (UMAP).
 *
 * Runs UMAP in a dedicated web worker and returns the embedding as a dataframe with one
 * float column per component, named `UMAP0`, `UMAP1`, ...
 *
 * @param features numerical columns to be embedded
 * @param components the number of components (dimensions) to project the data to
 * @param epochs the number of epochs to optimize embeddings
 * @param neighbors the number of nearest neighbors to construct the fuzzy manifold
 * @param minDist the effective minimum distance between embedded points
 * @param spread the effective scale of embedded points
 * @returns dataframe with `components` columns
 * @throws Error 'applying UMAP fails.' if the worker reports an error
 */
export async function computeUMAP(features: DG.ColumnList, components: number, epochs: number,
  neighbors: number, minDist: number, spread: number): Promise<DG.DataFrame> {
  // check inputs
  checkUMAPinputs(features, components, epochs, neighbors, minDist, spread);

  // get row-by-row data
  const data = getRowsOfNumericalColumnns(features);

  let workerOutput: any;

  // UMAP in webworker
  try {
    workerOutput = await new Promise((resolve, reject) => {
      const worker = new Worker(new URL('workers/umap-worker.ts', import.meta.url));

      // Handlers are attached before the job is posted.
      worker.onmessage = function(e) {
        worker.terminate();
        resolve(e.data.embeddings);
      };

      // Without this handler a failed worker would leave the promise pending forever.
      worker.onerror = function(e) {
        worker.terminate();
        reject(e);
      };

      worker.postMessage({
        data: data,
        options: {
          nComponents: components,
          nEpochs: epochs,
          nNeighbors: neighbors,
          minDist: minDist,
          spread: spread,
        }});
    });
  } catch (error) {
    throw new Error('applying UMAP fails.');
  }

  const embeddings = workerOutput as number[][];
  const rowCount = embeddings.length;
  const range = [...Array(components).keys()];

  // Create output

  // columns data
  const umapColumnsData = range.map((_) => new Float32Array(rowCount));

  // transpose the row-wise embeddings into column-wise arrays
  for (let i = 0; i < rowCount; ++i) {
    for (let j = 0; j < components; ++j)
      umapColumnsData[j][i] = embeddings[i][j];
  }

  return DG.DataFrame.fromColumns(range.map((i) =>
    DG.Column.fromFloat32Array('UMAP' + i.toString(), umapColumnsData[i]),
  ));
} // computeUMAP
91
+
92
/**
 * t-distributed stochastic neighbor embedding (t-SNE).
 *
 * Runs t-SNE in a dedicated web worker and returns the embedding as a dataframe with one
 * float column per component, named `tSNE0`, `tSNE1`, ...
 *
 * @param features numerical columns to be embedded
 * @param components dimension of the embedded space
 * @param learningRate optimization tuning parameter
 * @param perplexity the number of nearest neighbors
 * @param iterations maximum number of iterations for the optimization
 * @returns dataframe with `components` columns
 * @throws Error 'applying t-SNE fails.' if the worker reports an error
 */
export async function computeTSNE(features: DG.ColumnList, components: number,
  learningRate: number, perplexity: number, iterations: number): Promise<DG.DataFrame> {
  // check inputs
  checkTSNEinputs(features, components, learningRate, perplexity, iterations);

  // get row-by-row data
  const data = getRowsOfNumericalColumnns(features);

  let workerOutput: any;

  // t-SNE in webworker
  try {
    workerOutput = await new Promise((resolve, reject) => {
      const worker = new Worker(new URL('workers/tsne-worker.ts', import.meta.url));

      // Handlers are attached before the job is posted.
      worker.onmessage = function(e) {
        worker.terminate();
        resolve(e.data.embeddings);
      };

      // Without this handler a failed worker would leave the promise pending forever.
      worker.onerror = function(e) {
        worker.terminate();
        reject(e);
      };

      worker.postMessage({
        data: data,
        options: {
          learningRate: learningRate,
          perplexity: perplexity,
          components: components,
          iterations: iterations,
        }});
    });
  } catch (error) {
    throw new Error('applying t-SNE fails.');
  }

  const embeddings = workerOutput as any[];

  const rowCount = embeddings.length;
  const range = [...Array(components).keys()];

  // Create output

  // columns data
  const tsneColumnsData = range.map((_) => new Float32Array(rowCount));

  // transpose the row-wise embeddings into column-wise arrays
  for (let i = 0; i < rowCount; ++i) {
    for (let j = 0; j < components; ++j)
      tsneColumnsData[j][i] = embeddings[i][j];
  }

  return DG.DataFrame.fromColumns(range.map((i) =>
    DG.Column.fromFloat32Array('tSNE' + i.toString(), tsneColumnsData[i]),
  ));
} // computeTSNE
146
+
147
/**
 * Stochastic proximity embedding (SPE).
 *
 * Builds an SPE reducer with the Euclidean metric over the row-wise data and returns the
 * embedding as a dataframe with one float column per dimension, named `SPE0`, `SPE1`, ...
 */
export async function computeSPE(features: DG.ColumnList, dimension: number,
  steps: number, cycles: number, cutoff: number, lambda: number): Promise<DG.DataFrame> {
  checkSPEinputs(features, dimension, steps, cycles, cutoff, lambda);

  // the reducer consumes the data row by row
  const rows = getRowsOfNumericalColumnns(features);

  const reducer = new DimensionalityReducer(rows, 'SPE', VectorMetricsNames.Euclidean, {
    dimension: dimension,
    steps: steps,
    cycles: cycles,
    cutoff: cutoff,
    lambda: lambda,
  });

  const reduced = await reducer.transform(false, false);
  const embeddings = reduced.embedding;

  const rowCount = embeddings.length;
  const axes = Array.from(Array(dimension).keys());

  // one buffer per output column
  const columnBuffers = axes.map(() => new Float32Array(rowCount));

  // row-wise embeddings -> column-wise buffers
  for (let row = 0; row < rowCount; ++row) {
    for (let axis = 0; axis < dimension; ++axis)
      columnBuffers[axis][row] = embeddings[row][axis];
  }

  const columns = axes.map((axis) => DG.Column.fromFloat32Array('SPE' + axis.toString(), columnBuffers[axis]));

  return DG.DataFrame.fromColumns(columns);
} // computeSPE
@@ -12,6 +12,12 @@ export function renamePCAcolumns(pcaTable: DG.DataFrame): DG.DataFrame {
12
12
  return pcaTable;
13
13
  }
14
14
 
15
/** Renames every column of the list in place by prepending `prefix` to its current name. */
export function addPrefixToEachColumnName(prefix: string, columns: DG.ColumnList): void {
  columns.toList().forEach((column) => {
    column.name = prefix + column.name;
  });
}
20
+
15
21
  // Predicted vs Reference scatter plot
16
22
  export function predictedVersusReferenceScatterPlot(samplesNames: DG.Column, reference: DG.Column, prediction: DG.Column): DG.Viewer {
17
23
  prediction.name = reference.name + '(predicted)';
@@ -106,7 +112,7 @@ export function loadingScatterPlot(features: DG.ColumnList, xLoadings: Array<DG.
106
112
  // Add PLS visualization
107
113
  export function addPLSvisualization(table: DG.DataFrame, samplesNames: DG.Column, features: DG.ColumnList, predict: DG.Column, plsOutput: any): void {
108
114
 
109
- let view = grok.shell.getTableView(table.name);
115
+ const view = grok.shell.getTableView(table.name);
110
116
 
111
117
  // 1. Predicted vs Reference scatter plot
112
118
  view.addViewer(predictedVersusReferenceScatterPlot(samplesNames, predict, plsOutput[0]));
@@ -120,3 +126,10 @@ export function addPLSvisualization(table: DG.DataFrame, samplesNames: DG.Column
120
126
  // 4. Scores Scatter Plot
121
127
  view.addViewer(scoresScatterPlot(samplesNames, plsOutput[2], plsOutput[3]));
122
128
  }
129
+
130
/**
 * Add one-way ANOVA results to the table view of `table`:
 * a box plot built from the factor and value columns, followed by a grid with the ANOVA dataframe.
 */
export function addOneWayAnovaVizualization(table: DG.DataFrame, factors: DG.Column, values: DG.Column, anova: DG.DataFrame) {
  const tableView = grok.shell.getTableView(table.name);

  const boxPlot = DG.Viewer.boxPlot(DG.DataFrame.fromColumns([factors, values]));
  tableView.addViewer(boxPlot);

  const anovaGrid = DG.Viewer.grid(anova);
  tableView.addViewer(anovaGrid);
}
package/src/package.ts CHANGED
@@ -6,13 +6,15 @@ import * as DG from 'datagrok-api/dg';
6
6
  import {DemoScript} from '@datagrok-libraries/tutorials/src/demo-script';
7
7
 
8
8
  import {_initEDAAPI} from '../wasm/EDAAPI';
9
- import {computePCA, computePLS} from './EDAtools';
10
- import {renamePCAcolumns, addPLSvisualization, regressionCoefficientsBarChart,
11
- scoresScatterPlot, predictedVersusReferenceScatterPlot} from './EDAui';
12
- import {carsDataframe, testDataForBinaryClassification} from './dataGenerators';
9
+ import {computePCA, computePLS, computeUMAP, computeTSNE, computeSPE} from './eda-tools';
10
+ import {addPrefixToEachColumnName, addPLSvisualization, regressionCoefficientsBarChart,
11
+ scoresScatterPlot, predictedVersusReferenceScatterPlot, addOneWayAnovaVizualization} from './eda-ui';
12
+ import {carsDataframe, testDataForBinaryClassification} from './data-generators';
13
13
  import {LINEAR, RBF, POLYNOMIAL, SIGMOID,
14
14
  getTrainedModel, getPrediction, showTrainReport, getPackedModel} from './svm';
15
15
 
16
+ import {oneWayAnova} from './stat-tools';
17
+
16
18
  export const _package = new DG.Package();
17
19
 
18
20
  //name: info
@@ -25,22 +27,74 @@ export async function init(): Promise<void> {
25
27
  await _initEDAAPI();
26
28
  }
27
29
 
28
- //top-menu: Tools | Data Science | Principal Component Analysis...
30
+ //top-menu: ML | Dimensionality Reduction | PCA...
29
31
  //name: PCA
30
32
  //description: Principal component analysis (PCA).
31
33
  //input: dataframe table
32
34
  //input: column_list features {type: numerical}
33
- //input: int components = 2
34
- //input: bool center = true
35
- //input: bool scale = true
35
+ //input: int components = 2 {caption: Components} [Number of components.]
36
+ //input: bool center = false [Indicating whether the variables should be shifted to be zero centered.]
37
+ //input: bool scale = false [Indicating whether the variables should be scaled to have unit variance.]
36
38
  //output: dataframe result {action:join(table)}
37
39
  export async function PCA(table: DG.DataFrame, features: DG.ColumnList, components: number,
38
40
  center: boolean, scale: boolean): Promise<DG.DataFrame>
39
41
  {
40
- return renamePCAcolumns(await computePCA(table, features, components, center, scale));
42
+ const pcaTable = await computePCA(table, features, components, center, scale);
43
+ addPrefixToEachColumnName('PCA', pcaTable.columns);
44
+ return pcaTable;
45
+ }
46
+
47
//top-menu: ML | Dimensionality Reduction | UMAP...
//name: UMAP
//description: Uniform Manifold Approximation and Projection (UMAP).
//input: dataframe table {category: Data}
//input: column_list features {type: numerical; category: Data}
//input: int components = 2 {caption: Components; category: Hyperparameters} [The number of components (dimensions) to project the data to.]
//input: int epochs = 100 {caption: Epochs; category: Hyperparameters} [The number of epochs to optimize embeddings.]
//input: int neighbors = 15 {caption: Neighbors; category: Hyperparameters} [The number of nearest neighbors to construct the fuzzy manifold.]
//input: double minDist = 0.1 {caption: Minimum distance; category: Hyperparameters} [The effective minimum distance between embedded points.]
//input: double spread = 1.0 {caption: Spread; category: Hyperparameters} [The effective scale of embedded points.]
//output: dataframe result {action:join(table)}
export async function UMAP(table: DG.DataFrame, features: DG.ColumnList, components: number,
  epochs: number, neighbors: number, minDist: number, spread: number): Promise<DG.DataFrame> {
  // `table` is not read here; it is referenced by the `join(table)` output action in the header.
  const embedding = await computeUMAP(features, components, epochs, neighbors, minDist, spread);
  return embedding;
}
63
+
64
//top-menu: ML | Dimensionality Reduction | t-SNE...
//name: t-SNE
//description: t-distributed stochastic neighbor embedding (t-SNE).
//input: dataframe table {category: Data}
//input: column_list features {type: numerical; category: Data}
//input: int components = 2 {caption: Components; category: Hyperparameters} [Dimension of the embedded space.]
//input: double learningRate = 10 {caption: Learning rate; category: Hyperparameters} [Optimization tuning parameter. Should be in the range 10...1000.]
//input: int perplexity = 30 {caption: Perplexity; category: Hyperparameters} [The number of nearest neighbors. Should be less than the number of samples.]
//input: int iterations = 500 {caption: Iterations; category: Hyperparameters} [Maximum number of iterations for the optimization. Should be at least 250.]
//output: dataframe result {action:join(table)}
export async function tSNE(table: DG.DataFrame, features: DG.ColumnList, components: number,
  learningRate: number, perplexity: number, iterations: number): Promise<DG.DataFrame> {
  // `table` is not read here; it is referenced by the `join(table)` output action in the header.
  const embedding = await computeTSNE(features, components, learningRate, perplexity, iterations);
  return embedding;
}
79
+
80
//top-menu: ML | Dimensionality Reduction | SPE...
//name: SPE
//description: Stochastic proximity embedding (SPE).
//input: dataframe table {category: Data}
//input: column_list features {type: numerical; category: Data}
//input: int dimension = 2 {caption: Dimension; category: Hyperparameters} [Dimension of the embedded space.]
//input: int steps = 0 {caption: Steps; category: Hyperparameters} [Number of random selections of point pairs and distance computations between them.]
//input: int cycles = 1000000 {caption: Cycles; category: Hyperparameters} [Number of the method cycles.]
//input: double cutoff = 0.0 {caption: Cutoff; category: Hyperparameters} [Cutoff distance between points.]
//input: double lambda = 2.0 {caption: Learning rate; category: Hyperparameters} [Optimization tuning parameter.]
//output: dataframe result {action:join(table)}
export async function SPE(table: DG.DataFrame, features: DG.ColumnList, dimension: number,
  steps: number, cycles: number, cutoff: number, lambda: number): Promise<DG.DataFrame> {
  // `table` is not read here; it is referenced by the `join(table)` output action in the header.
  const embedding = await computeSPE(features, dimension, steps, cycles, cutoff, lambda);
  return embedding;
}
42
96
 
43
- //top-menu: ML | Multivariate Analysis (PLS)...
97
+ //top-menu: ML | Analyze | Multivariate Analysis...
44
98
  //name: Multivariate Analysis (PLS)
45
99
  //description: Multidimensional data analysis using partial least squares (PLS) regression. It reduces the predictors to a smaller set of uncorrelated components and performs least squares regression on them.
46
100
  //input: dataframe table
@@ -266,3 +320,16 @@ export async function trainSigmoidKernelSVM(df: DG.DataFrame, predict_column: st
266
320
  export async function applySigmoidKernelSVM(df: DG.DataFrame, model: any): Promise<DG.DataFrame> {
267
321
  return await getPrediction(df, model);
268
322
  }
323
+
324
//top-menu: ML | Analysis of Variances (ANOVA)...
//name: One-way ANOVA
//description: One-way analysis of variances (ANOVA) determines whether the examined factor has a significant impact on the studied feature.
//input: dataframe table
//input: column factor {type: categorical}
//input: column feature {type: numerical}
//input: double significance = 0.05 [The significance level is a value from the interval (0, 1) specifying the criterion used for rejecting the null hypothesis.]
//input: bool validate = false [Indicates whether the normality of distribution and an equality of variances should be checked.]
export function anova(table: DG.DataFrame, factor: DG.Column, feature: DG.Column, significance: number, validate: boolean) {
  // Compute the ANOVA table first; errors thrown by `oneWayAnova` propagate to the caller,
  // so no viewers are added when the computation fails.
  const res = oneWayAnova(factor, feature, significance, validate);

  // Show the results in the table view of `table`.
  addOneWayAnovaVizualization(table, factor, feature, res);
}
@@ -0,0 +1,266 @@
1
+ // Statistic tools
2
+
3
+ /* REFERENCES
4
+
5
+ [1] One-way analysis of variance, https://en.wikipedia.org/wiki/One-way_analysis_of_variance
6
+
7
+ [2] G.W. Heiman. Basic Statistics for the Behavioral Sciences, 6th ed. Wadsworth Publishing, 2010
8
+
9
+ [3] F-test of equality of variances, https://en.wikipedia.org/wiki/F-test_of_equality_of_variances
10
+
11
+ [4] S. McKillup. Statistics Explained, Cambridge University Press, 2005
12
+
13
+ */
14
+
15
+ import * as grok from 'datagrok-api/grok';
16
+ import * as ui from 'datagrok-api/ui';
17
+ import * as DG from 'datagrok-api/dg';
18
+
19
+ //@ts-ignore: no types
20
+ import * as jStat from 'jstat';
21
+
22
/** Messages of the errors thrown by the statistic tools. */
enum ERROR_MSG {
  NON_EQUAL_FACTORS_VALUES_SIZE = 'non-equal sizes of factor and values arrays. INPUT ERROR.',
  INCORRECT_SIGNIFICANCE_LEVEL = 'incorrect significance level. It must be from the interval (0, 1). INPUT ERROR.',
  INCORRECT_SAMPLE_SIZE = 'incorrect size of sample. DATA FACTORIZING ERROR.',
  NON_EQUAL_VARIANCES = 'variances are not equal.',
  NON_NORMAL_DISTRIB = 'non-normal distribution.',
  UNSUPPORTED_COLUMN_TYPE = 'unsupported column type.',
  INCORRECT_CATEGORIES_COL_TYPE = 'incorrect categories column type.',
  ANOVA_FAILED_JUST_ONE_CAT = 'ANOVA failed: there should be at least 2 categories.',
}
32
+
33
+ type SampleData = {
34
+ sum: number,
35
+ sumOfSquares: number,
36
+ size: number,
37
+ };
38
+
39
+ /** One-way ANOVA computation results. The classic notations are used (see [2], p. 290). */
40
+ type OneWayAnova = {
41
+ /** sum of squares between groups, SSbn */
42
+ ssBn: number,
43
+ /** sum of squares within groups, SSwn */
44
+ ssWn: number,
45
+ /** total sum of squares, SStot */
46
+ ssTot: number,
47
+ /** degrees of freedom between groups, DFbn */
48
+ dfBn: number,
49
+ /** degrees of freedom within groups, DFwn */
50
+ dfWn: number,
51
+ /** total degrees of freedom, DFtot */
52
+ dfTot: number,
53
+ /** mean square between groups, MSbn */
54
+ msBn: number,
55
+ /** mean square within groups, MSwn */
56
+ msWn: number,
57
+ /** Fobt, value of F-statistics, Fstat */
58
+ fStat: number,
59
+ /** p-value corresponding to F-statistics, pValue */
60
+ pValue: number,
61
+ };
62
+
63
+ /** Categorical column */
64
+ type CatCol = DG.Column<DG.COLUMN_TYPE.STRING>;
65
+
66
+ /** Numerical column */
67
+ type NumCol = DG.Column<DG.COLUMN_TYPE.FLOAT> | DG.Column<DG.COLUMN_TYPE.INT>;
68
+
69
/**
 * Create dataframe with one-way ANOVA results.
 *
 * The first column holds the row labels (sources of variance, then the hypothesis and the test
 * result); the numeric columns are filled only in their leading rows and are null elsewhere.
 */
export function getOneWayAnovaDF(anova: OneWayAnova, alpha: number, fCritical: number, hypothesis: string, testResult: string): DG.DataFrame {
  const rowLabels = ['Between groups', 'Within groups', 'Total', '', hypothesis, '', testResult];
  const rowCount = rowLabels.length;

  // Pads the given leading values with nulls up to the number of rows.
  const padWithNulls = (leading: number[]): (number | null)[] => {
    const cells: (number | null)[] = new Array(rowCount).fill(null);
    leading.forEach((value, idx) => {
      cells[idx] = value;
    });
    return cells;
  };

  const floatColumn = (name: string, leading: number[]) =>
    DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, name, padWithNulls(leading));

  const columns = [
    DG.Column.fromStrings('Source of variance', rowLabels),
    floatColumn('Sum of squares', [anova.ssBn, anova.ssWn, anova.ssTot]),
    DG.Column.fromList(DG.COLUMN_TYPE.INT, 'Degrees of freedom', padWithNulls([anova.dfBn, anova.dfWn, anova.dfTot])),
    floatColumn('Mean square', [anova.msBn, anova.msWn]),
    floatColumn('F-statistics', [anova.fStat]),
    floatColumn('p-value', [anova.pValue]),
    floatColumn(`${alpha}-critical value`, [fCritical]),
  ];

  return DG.DataFrame.fromColumns(columns);
} // getOneWayAnovaDF
81
+
82
/**
 * Check correctness of significance level.
 *
 * @param alpha significance level; must lie strictly inside the interval (0, 1)
 * @throws Error if `alpha` is outside (0, 1) or is NaN
 */
export function checkSignificanceLevel(alpha: number): void {
  // The positive form of the condition also rejects NaN, for which both
  // `alpha <= 0` and `alpha >= 1` are false.
  if (!((alpha > 0) && (alpha < 1)))
    throw new Error(ERROR_MSG.INCORRECT_SIGNIFICANCE_LEVEL);
}
87
+
88
/**
 * Compute unbiased variance from the aggregated sample data.
 *
 * The formulas can be found in [4] (see p. 63). A sample of a single value has zero variance;
 * a non-positive sample size is an error.
 */
export function getVariance(data: SampleData): number {
  const {sum, sumOfSquares, size} = data;

  if (size <= 0)
    throw new Error(ERROR_MSG.INCORRECT_SAMPLE_SIZE);

  if (size === 1)
    return 0;

  // sum of squared deviations from the mean, expressed via the raw sums
  const squaredDeviations = sumOfSquares - sum ** 2 / size;

  return squaredDeviations / (size - 1);
} // getVariance
101
+
102
/**
 * Check equality of variances of 2 samples. A two-sided F-test is performed (see [3]).
 *
 * The larger variance is always placed in the numerator, so the result does not depend on
 * the order of the samples, and the critical value is taken at `1 - alpha / 2`.
 *
 * @param xData aggregated data of the first sample
 * @param yData aggregated data of the second sample
 * @param alpha significance level from the interval (0, 1)
 * @returns `true` if the hypothesis of equal variances is not rejected
 */
function areVarsEqual(xData: SampleData, yData: SampleData, alpha: number = 0.05): boolean {
  checkSignificanceLevel(alpha);

  const xVar = getVariance(xData);
  const yVar = getVariance(yData);

  // The F-ratio is undefined when a variance is zero:
  // the variances are then equal only if both are zero.
  if ((xVar === 0) || (yVar === 0))
    return (xVar === yVar);

  const xIsLarger = xVar >= yVar;

  const fStat = xIsLarger ? xVar / yVar : yVar / xVar;
  const dfNum = (xIsLarger ? xData.size : yData.size) - 1;
  const dfDen = (xIsLarger ? yData.size : xData.size) - 1;

  // With the larger variance in the numerator, only the upper tail has to be
  // checked, at half of the significance level.
  const fCrit = jStat.centralF.inv(1 - alpha / 2, dfNum, dfDen);

  return (fStat < fCrit);
} // areVarsEqual
118
+
119
/**
 * Numerical values grouped by the levels of a categorical factor.
 *
 * For each category the constructor accumulates the sum, the sum of squares and the number of
 * values; the variance check and the one-way ANOVA are computed from these aggregates only.
 */
export class FactorizedData {
  /** Reserved for the result of the normality check; it is not assigned anywhere in this class. */
  private isNormDistrib: boolean | undefined = undefined;
  /** Names of the factor levels. */
  private categories: string[] = [];
  /** Sum of values, per category. */
  private sums!: Float64Array;
  /** Sum of squared values, per category. */
  private sumsOfSquares!: Float64Array;
  /** Number of values, per category. */
  private subSampleSizes!: Int32Array;
  /** Total number of values. */
  private size!: number;
  /** Number of categories. */
  private catCount!: number;

  /**
   * @param categories string column with the factor levels
   * @param values numerical (int or float) column of the same length
   * @param checkNormality currently has no effect (the normality check is not implemented)
   * @param alpha significance level; currently has no effect here
   * @throws Error if `categories` is not a string column, the lengths differ,
   * or `values` has an unsupported type
   */
  constructor(categories: CatCol, values: NumCol, checkNormality: boolean = false, alpha: number = 0.05) {
    if (categories.type !== DG.COLUMN_TYPE.STRING)
      throw new Error(ERROR_MSG.INCORRECT_CATEGORIES_COL_TYPE);

    if (categories.length !== values.length)
      throw new Error(ERROR_MSG.NON_EQUAL_FACTORS_VALUES_SIZE);

    this.setStats(categories, values, checkNormality, alpha);
  }

  /** Placeholder: the normality check is not implemented yet, so `true` is always returned. */
  public isNormal(): boolean | undefined {
    return true;
  }

  /**
   * Check equality of variances of factorized data.
   * The first category is compared with each of the others using the F-test.
   */
  public areVarsEqual(alpha: number = 0.05): boolean {
    const K = this.catCount;

    if (K === 1)
      return true;

    const first: SampleData = {sum: this.sums[0], sumOfSquares: this.sumsOfSquares[0], size: this.subSampleSizes[0]};

    for (let i = 1; i < K; ++i) {
      const other: SampleData = {sum: this.sums[i], sumOfSquares: this.sumsOfSquares[i], size: this.subSampleSizes[i]};

      // the bare name refers to the module-level function, not to this method
      if (!areVarsEqual(first, other, alpha))
        return false;
    }

    return true;
  } // areVarsEqual

  /**
   * Perform one-way ANOVA computations.
   * Notations and formulas from [2] (see p. 290) are used.
   *
   * @throws Error if there are fewer than 2 categories
   */
  public getOneWayAnova(): OneWayAnova {
    const K = this.catCount;

    // ANOVA needs at least two groups; this also covers the case of no categories at all.
    if (K < 2)
      throw new Error(ERROR_MSG.ANOVA_FAILED_JUST_ONE_CAT);

    const N = this.size;
    let sum = 0;
    let sumOfSquares = 0;
    let buf = 0; // sum over categories of (category sum)^2 / (category size)

    // NOTE(review): a category without rows would contribute 0 / 0 = NaN here —
    // confirm that `categories` never lists unused levels.
    for (let i = 0; i < K; ++i) {
      sum += this.sums[i];
      sumOfSquares += this.sumsOfSquares[i];
      buf += this.sums[i] ** 2 / this.subSampleSizes[i];
    }

    const ssTot = sumOfSquares - sum ** 2 / N;
    const ssBn = buf - sum ** 2 / N;
    const ssWn = ssTot - ssBn;

    const dfBn = K - 1;
    const dfWn = N - K;
    const dfTot = N - 1;

    const msBn = ssBn / dfBn;
    const msWn = ssWn / dfWn;

    const fStat = msBn / msWn;

    return {
      ssBn: ssBn,
      ssWn: ssWn,
      ssTot: ssTot,
      dfBn: dfBn,
      dfWn: dfWn,
      dfTot: dfTot,
      msBn: msBn,
      msWn: msWn,
      fStat: fStat,
      pValue: 1 - jStat.centralF.cdf(fStat, dfBn, dfWn),
    };
  } // getOneWayAnova

  /** Compute sum & sums of squares with respect to factor levels. */
  private setStats(categories: CatCol, values: NumCol, checkNormality: boolean = false, alpha: number = 0.05): void {
    // TODO: provide check normality feature
    const type = values.type;

    if ((type !== DG.COLUMN_TYPE.INT) && (type !== DG.COLUMN_TYPE.FLOAT))
      throw new Error(ERROR_MSG.UNSUPPORTED_COLUMN_TYPE);

    const size = values.length;

    this.categories = categories.categories;
    const catCount = this.categories.length;
    this.catCount = catCount;
    this.size = size;

    const vals = values.getRawData();
    // raw data of the categorical column is used as the category index of each row
    const cats = categories.getRawData();

    const sums = new Float64Array(catCount).fill(0);
    const sumsOfSquares = new Float64Array(catCount).fill(0);
    const subSampleSizes = new Int32Array(catCount).fill(0);

    for (let i = 0; i < size; ++i) {
      const c = cats[i];
      sums[c] += vals[i];
      sumsOfSquares[c] += vals[i] ** 2;
      ++subSampleSizes[c];
    }

    this.sums = sums;
    this.sumsOfSquares = sumsOfSquares;
    this.subSampleSizes = subSampleSizes;
  } // setStats
} // FactorizedData
244
+
245
/**
 * Perform one-way analysis of variances.
 *
 * The values are grouped by the factor levels. When `validate` is set, the equality of
 * variances and the normality are checked first, and an error is thrown if a check fails.
 * The result is a dataframe with the ANOVA table, the null hypothesis and the test result.
 */
export function oneWayAnova(categores: CatCol, values: NumCol, alpha: number = 0.05, validate: boolean = false): DG.DataFrame {
  checkSignificanceLevel(alpha);

  const grouped = new FactorizedData(categores, values, validate, alpha);

  if (validate && !grouped.areVarsEqual(alpha))
    throw new Error(ERROR_MSG.NON_EQUAL_VARIANCES);

  if (validate && !grouped.isNormal())
    throw new Error(ERROR_MSG.NON_NORMAL_DISTRIB);

  const stats = grouped.getOneWayAnova();
  const criticalValue = jStat.centralF.inv(1 - alpha, stats.dfBn, stats.dfWn);

  // the null hypothesis is rejected when the F-statistics exceeds the critical value
  const verdict = (stats.fStat > criticalValue) ? 'REJECTED.' : 'FAILED TO REJECT.';

  const hypothesis = `THE NULL HYPOTHESIS: the "${categores.name}" factor does not produce a significant difference in the "${values.name}" feature.`;
  const testResult = `Test result: ${verdict}`;

  return getOneWayAnovaDF(stats, alpha, criticalValue, hypothesis, testResult);
} // oneWayAnova