@datagrok/eda 1.1.9 → 1.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/stat-tools.ts CHANGED
@@ -1,11 +1,11 @@
1
1
  // Statistic tools
2
2
 
3
3
  /* REFERENCES
4
-
4
+
5
5
  [1] One-way analysis of variance, https://en.wikipedia.org/wiki/One-way_analysis_of_variance
6
6
 
7
7
  [2] G.W. Heiman. Basic Statistics for the Behavioral Sciences, 6th ed. Wadsworth Publishing, 2010
8
-
8
+
9
9
  [3] F-test of equality of variances, https://en.wikipedia.org/wiki/F-test_of_equality_of_variances
10
10
 
11
11
  [4] S. McKillup. Statistics Explained, Cambridge University Press, 2005
@@ -40,9 +40,9 @@ type SampleData = {
40
40
  type OneWayAnova = {
41
41
  /** sum of squares between groups, SSbn */
42
42
  ssBn: number,
43
- /** sum of squares within groups, SSnn */
43
+ /** sum of squares within groups, SSwn */
44
44
  ssWn: number,
45
- /** total sum of squares, SStot */
45
+ /** total sum of squares, SStot */
46
46
  ssTot: number,
47
47
  /** degrees of freedom between groups, DFbn */
48
48
  dfBn: number,
@@ -67,15 +67,21 @@ type CatCol = DG.Column<DG.COLUMN_TYPE.STRING>;
67
67
  type NumCol = DG.Column<DG.COLUMN_TYPE.FLOAT> | DG.Column<DG.COLUMN_TYPE.INT>;
68
68
 
69
69
  /** Create dataframe with one-way ANOVA results. */
70
- export function getOneWayAnovaDF(anova: OneWayAnova, alpha: number, fCritical: number, hypothesis: string, testResult: string): DG.DataFrame {
70
+ export function getOneWayAnovaDF(
71
+ anova: OneWayAnova, alpha: number, fCritical: number, hypothesis: string, testResult: string,
72
+ ): DG.DataFrame {
71
73
  return DG.DataFrame.fromColumns([
72
- DG.Column.fromStrings('Source of variance', ['Between groups', 'Within groups', 'Total', '', hypothesis, '', testResult]),
73
- DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, 'Sum of squares', [anova.ssBn, anova.ssWn, anova.ssTot, null, null, null, null]),
74
- DG.Column.fromList(DG.COLUMN_TYPE.INT, 'Degrees of freedom', [anova.dfBn, anova.dfWn, anova.dfTot, null, null, null, null]),
74
+ DG.Column.fromStrings('Source of variance',
75
+ ['Between groups', 'Within groups', 'Total', '', hypothesis, '', testResult]),
76
+ DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, 'Sum of squares',
77
+ [anova.ssBn, anova.ssWn, anova.ssTot, null, null, null, null]),
78
+ DG.Column.fromList(DG.COLUMN_TYPE.INT, 'Degrees of freedom',
79
+ [anova.dfBn, anova.dfWn, anova.dfTot, null, null, null, null]),
75
80
  DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, 'Mean square', [anova.msBn, anova.msWn, null, null, null, null, null]),
76
81
  DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, 'F-statistics', [anova.fStat, null, null, null, null, null, null]),
77
82
  DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, 'p-value', [anova.pValue, null, null, null, null, null, null]),
78
- DG.Column.fromList(DG.COLUMN_TYPE.FLOAT, `${alpha}-critical value`, [fCritical, null, null, null, null, null, null]),
83
+ DG.Column.fromList(DG.COLUMN_TYPE.FLOAT,
84
+ `${alpha}-critical value`, [fCritical, null, null, null, null, null, null]),
79
85
  ]);
80
86
  } // getOneWayAnovaDF
81
87
 
@@ -95,7 +101,7 @@ export function getVariance(data: SampleData): number {
95
101
 
96
102
  if (size === 1)
97
103
  return 0;
98
-
104
+
99
105
  return (data.sumOfSquares - (data.sum) ** 2 / size) / (size - 1);
100
106
  } // getVariance
101
107
 
@@ -103,7 +109,7 @@ export function getVariance(data: SampleData): number {
103
109
  function areVarsEqual(xData: SampleData, yData: SampleData, alpha: number = 0.05): boolean {
104
110
  // The applied approach can be found in [3]
105
111
  checkSignificanceLevel(alpha);
106
-
112
+
107
113
  const xVar = getVariance(xData);
108
114
  const yVar = getVariance(yData);
109
115
 
@@ -118,7 +124,7 @@ function areVarsEqual(xData: SampleData, yData: SampleData, alpha: number = 0.05
118
124
 
119
125
  export class FactorizedData {
120
126
  private isNormDistrib: boolean | undefined = undefined;
121
- private categories: string[] = [];
127
+ private categories: string[] = [];
122
128
  private sums!: Float64Array;
123
129
  private sumsOfSquares!: Float64Array;
124
130
  private subSampleSizes!: Int32Array;
@@ -130,7 +136,7 @@ export class FactorizedData {
130
136
  throw new Error();
131
137
 
132
138
  if (categories.length !== values.length)
133
- throw new Error(ERROR_MSG.NON_EQUAL_FACTORS_VALUES_SIZE);
139
+ throw new Error(ERROR_MSG.NON_EQUAL_FACTORS_VALUES_SIZE);
134
140
 
135
141
  this.setStats(categories, values, checkNormality, alpha);
136
142
  }
@@ -148,11 +154,13 @@ export class FactorizedData {
148
154
 
149
155
  const first: SampleData = {sum: this.sums[0], sumOfSquares: this.sumsOfSquares[0], size: this.subSampleSizes[0]};
150
156
 
151
- for (let i = 1; i < K; ++i)
152
- if(!areVarsEqual(first, {sum: this.sums[i], sumOfSquares: this.sumsOfSquares[i], size: this.subSampleSizes[i]}, alpha))
157
+ for (let i = 1; i < K; ++i) {
158
+ if (!areVarsEqual(first, {sum: this.sums[i], sumOfSquares: this.sumsOfSquares[i],
159
+ size: this.subSampleSizes[i]}, alpha))
153
160
  return false;
161
+ }
154
162
 
155
- return true;
163
+ return true;
156
164
  } // areVarsEqual
157
165
 
158
166
  /** Perform one-way ANOVA computations. */
@@ -163,18 +171,18 @@ export class FactorizedData {
163
171
 
164
172
  if (K === 1)
165
173
  throw new Error(ERROR_MSG.ANOVA_FAILED_JUST_ONE_CAT);
166
-
174
+
167
175
  let sum = 0;
168
176
  let sumOfSquares = 0;
169
- let N = this.size;
177
+ const N = this.size;
170
178
  let buf = 0;
171
179
 
172
180
  for (let i = 0; i < K; ++i) {
173
181
  sum += this.sums[i];
174
182
  sumOfSquares += this.sumsOfSquares[i];
175
- buf += this.sums[i] ** 2 / this.subSampleSizes[i];
183
+ buf += this.sums[i] ** 2 / this.subSampleSizes[i];
176
184
  }
177
-
185
+
178
186
  const ssTot = sumOfSquares - sum ** 2 / N;
179
187
  const ssBn = buf - sum ** 2 / N;
180
188
  const ssWn = ssTot - ssBn;
@@ -182,12 +190,12 @@ export class FactorizedData {
182
190
  const dfBn = K - 1;
183
191
  const dfWn = N - K;
184
192
  const dfTot = N - 1;
185
-
193
+
186
194
  const msBn = ssBn / dfBn;
187
195
  const msWn = ssWn / dfWn;
188
196
 
189
197
  const fStat = msBn / msWn;
190
-
198
+
191
199
  return {
192
200
  ssBn: ssBn,
193
201
  ssWn: ssWn,
@@ -197,61 +205,63 @@ export class FactorizedData {
197
205
  dfTot: dfTot,
198
206
  msBn: msBn,
199
207
  msWn: msWn,
200
- fStat: fStat,
201
- pValue: 1 - jStat.centralF.cdf(fStat, dfBn, dfWn)
208
+ fStat: fStat,
209
+ pValue: 1 - jStat.centralF.cdf(fStat, dfBn, dfWn),
202
210
  };
203
211
  } // getOneWayAnova
204
212
 
205
213
  /** Compute sum & sums of squares with respect to factor levels. */
206
- private setStats(categories: CatCol, values: NumCol, checkNormality: boolean = false, alpha: number = 0.05): void {
207
- // TODO: provide check normality feature
214
+ private setStats(categories: CatCol, values: NumCol, _checkNormality: boolean = false, _alpha: number = 0.05): void {
215
+ // TODO: provide normality check feature
208
216
  const type = values.type;
209
217
  const size = values.length;
210
218
 
211
219
  switch (type) {
212
- case DG.COLUMN_TYPE.INT:
213
- case DG.COLUMN_TYPE.FLOAT:
214
- this.categories = categories.categories;
215
- const catCount = this.categories.length;
216
- this.catCount = catCount;
217
- this.size = size;
218
-
219
- const vals = values.getRawData();
220
- const cats = categories.getRawData();
221
-
222
- const sums = new Float64Array(catCount).fill(0);
223
- const sumsOfSquares = new Float64Array(catCount).fill(0);
224
- const subSampleSizes = new Int32Array(catCount).fill(0);
225
-
226
- for (let i = 0; i < size; ++i) {
227
- const c = cats[i];
228
- sums[c] += vals[i];
229
- sumsOfSquares[c] += vals[i] ** 2;
230
- ++subSampleSizes[c];
231
- }
232
-
233
- this.sums = sums;
234
- this.sumsOfSquares = sumsOfSquares;
235
- this.subSampleSizes = subSampleSizes;
236
-
237
- break;
238
-
239
- default:
240
- throw new Error(ERROR_MSG.UNSUPPORTED_COLUMN_TYPE);
241
- }
220
+ case DG.COLUMN_TYPE.INT:
221
+ case DG.COLUMN_TYPE.FLOAT:
222
+ this.categories = categories.categories;
223
+ const catCount = this.categories.length;
224
+ this.catCount = catCount;
225
+ this.size = size;
226
+
227
+ const vals = values.getRawData();
228
+ const cats = categories.getRawData();
229
+
230
+ const sums = new Float64Array(catCount).fill(0);
231
+ const sumsOfSquares = new Float64Array(catCount).fill(0);
232
+ const subSampleSizes = new Int32Array(catCount).fill(0);
233
+
234
+ for (let i = 0; i < size; ++i) {
235
+ const c = cats[i];
236
+ sums[c] += vals[i];
237
+ sumsOfSquares[c] += vals[i] ** 2;
238
+ ++subSampleSizes[c];
239
+ }
240
+
241
+ this.sums = sums;
242
+ this.sumsOfSquares = sumsOfSquares;
243
+ this.subSampleSizes = subSampleSizes;
244
+
245
+ break;
246
+
247
+ default:
248
+ throw new Error(ERROR_MSG.UNSUPPORTED_COLUMN_TYPE);
249
+ }
242
250
  } // setStats
243
251
  } // FactorizedData
244
252
 
245
253
  /** Perform one-way analysis of variance (ANOVA). */
246
- export function oneWayAnova(categores: CatCol, values: NumCol, alpha: number = 0.05, validate: boolean = false): DG.DataFrame {
247
- checkSignificanceLevel(alpha);
254
+ export function oneWayAnova(
255
+ categores: CatCol, values: NumCol, alpha: number = 0.05, validate: boolean = false,
256
+ ): DG.DataFrame {
257
+ checkSignificanceLevel(alpha);
248
258
 
249
259
  const factorized = new FactorizedData(categores, values, validate, alpha);
250
260
 
251
261
  if (validate) {
252
- if(!factorized.areVarsEqual(alpha))
262
+ if (!factorized.areVarsEqual(alpha))
253
263
  throw new Error(ERROR_MSG.NON_EQUAL_VARIANCES);
254
-
264
+
255
265
  if (!factorized.isNormal())
256
266
  throw new Error(ERROR_MSG.NON_NORMAL_DISTRIB);
257
267
  }
@@ -259,7 +269,8 @@ export function oneWayAnova(categores: CatCol, values: NumCol, alpha: number = 0
259
269
  const anova = factorized.getOneWayAnova();
260
270
  const fCrit = jStat.centralF.inv(1 - alpha, anova.dfBn, anova.dfWn);
261
271
 
262
- const hypothesis = `THE NULL HYPOTHESIS: the "${categores.name}" factor does not produce a significant difference in the "${values.name}" feature.`;
272
+ const hypothesis = `THE NULL HYPOTHESIS: the "${categores.name}"
273
+ factor does not produce a significant difference in the "${values.name}" feature.`;
263
274
  const testResult = `Test result: ${(anova.fStat > fCrit) ? 'REJECTED.' : 'FAILED TO REJECT.'}`;
264
275
 
265
276
  return getOneWayAnovaDF(anova, alpha, fCrit, hypothesis, testResult);