datly 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/datly.cjs +1 -0
- package/dist/datly.mjs +1 -0
- package/dist/datly.umd.js +1 -1
- package/dist/datly.umd.js.map +1 -0
- package/package.json +24 -11
- package/src/core/dataLoader.js +407 -0
- package/src/core/utils.js +306 -0
- package/src/core/validator.js +205 -0
- package/src/dataviz/index.js +1566 -0
- package/src/descriptive/centralTendency.js +208 -0
- package/src/descriptive/dispersion.js +273 -0
- package/src/descriptive/position.js +268 -0
- package/src/descriptive/shape.js +336 -0
- package/src/index.js +480 -0
- package/src/inferential/confidenceIntervals.js +561 -0
- package/src/inferential/hypothesisTesting.js +527 -0
- package/src/inferential/normalityTests.js +587 -0
- package/src/insights/autoAnalyser.js +685 -0
- package/src/insights/interpreter.js +543 -0
- package/src/insights/patternDetector.js +897 -0
- package/src/insights/reportGenerator.js +1072 -0
- package/src/ml/ClassificationMetrics.js +336 -0
- package/src/ml/DecisionTree.js +412 -0
- package/src/ml/KNearestNeighbors.js +317 -0
- package/src/ml/LinearRegression.js +179 -0
- package/src/ml/LogisticRegression.js +396 -0
- package/src/ml/MachineLearning.js +490 -0
- package/src/ml/NaiveBayes.js +296 -0
- package/src/ml/RandomForest.js +323 -0
- package/src/ml/SupportVectorMachine.js +299 -0
- package/src/ml/baseModel.js +106 -0
- package/src/multivariate/correlation.js +653 -0
- package/src/multivariate/regression.js +660 -0
@@ -0,0 +1,306 @@
|
|
1
|
+
/**
 * Stateless collection of statistical helpers: outlier detection, frequency
 * and contingency tables, grouping, sampling, bootstrapping and the basic
 * descriptive statistics those helpers rely on.
 */
class Utils {
  /**
   * Flags outliers in a numeric column.
   *
   * Supported methods:
   *  - `'iqr'`: values outside `[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`.
   *  - `'zscore'`: values whose absolute z-score exceeds 3.
   *  - `'modified_zscore'`: values whose absolute modified z-score
   *    (`0.6745 * (x - median) / MAD`) exceeds 3.5. When the MAD is zero the
   *    score falls back to `(x - median) / (1.253314 * meanAD)` so that a
   *    column dominated by one repeated value does not flag everything else.
   *
   * @param {number[]} column - Values to inspect.
   * @param {string} [method='iqr'] - One of the methods listed above.
   * @returns {{outliers: number[], indices: number[], count: number, percentage: number}}
   *   Flagged values, their positions in `column`, how many were flagged and
   *   their share of the column in percent (0 for an empty column).
   * @throws {Error} If `method` is not recognised.
   */
  detectOutliers(column, method = 'iqr') {
    const outliers = [];
    const indices = [];
    const flag = (value, index) => {
      outliers.push(value);
      indices.push(index);
    };

    switch (method) {
      case 'iqr': {
        // quantile() sorts its own copy, so the raw column can be passed.
        const q1 = this.quantile(column, 0.25);
        const q3 = this.quantile(column, 0.75);
        const iqr = q3 - q1;
        const lowerBound = q1 - 1.5 * iqr;
        const upperBound = q3 + 1.5 * iqr;

        column.forEach((value, index) => {
          if (value < lowerBound || value > upperBound) {
            flag(value, index);
          }
        });
        break;
      }

      case 'zscore': {
        const mean = this.mean(column);
        const std = this.standardDeviation(column);

        column.forEach((value, index) => {
          // A zero or undefined std yields NaN here, which never exceeds 3.
          if (Math.abs((value - mean) / std) > 3) {
            flag(value, index);
          }
        });
        break;
      }

      case 'modified_zscore': {
        const median = this.median(column);
        const deviations = column.map((x) => Math.abs(x - median));
        const mad = this.median(deviations);
        // Only needed when MAD is zero (more than half the values are equal).
        const meanAbsDeviation = mad > 0 ? 0 : this.mean(deviations);

        column.forEach((value, index) => {
          const modifiedZScore = mad > 0
            ? (0.6745 * (value - median)) / mad
            : (value - median) / (1.253314 * meanAbsDeviation);
          if (Math.abs(modifiedZScore) > 3.5) {
            flag(value, index);
          }
        });
        break;
      }

      default:
        throw new Error(`Unknown outlier detection method: ${method}`);
    }

    return {
      outliers,
      indices,
      count: outliers.length,
      // Avoid 0 / 0 = NaN for an empty column.
      percentage: column.length === 0 ? 0 : (outliers.length / column.length) * 100
    };
  }

  /**
   * Builds a frequency table for a column, most frequent value first.
   *
   * Values are keyed by their string form, so `1` and `'1'` share an entry and
   * the reported `value` is a string. `null` and `undefined` are counted
   * together and reported as `null`; note that the literal string `'null'`
   * lands in that same bucket.
   *
   * @param {Array<*>} column - Values to tabulate.
   * @returns {Array<{value: (string|null), frequency: number, relativeFrequency: number, percentage: number}>}
   */
  frequencyTable(column) {
    // Null-prototype object: keys such as 'constructor' or 'toString' must
    // not pick up inherited members when read back with `|| 0`.
    const frequencies = Object.create(null);
    const total = column.length;

    column.forEach((value) => {
      const key = value === null || value === undefined ? 'null' : String(value);
      frequencies[key] = (frequencies[key] || 0) + 1;
    });

    const result = Object.entries(frequencies).map(([value, count]) => ({
      value: value === 'null' ? null : value,
      frequency: count,
      relativeFrequency: count / total,
      percentage: (count / total) * 100
    }));

    return result.sort((a, b) => b.frequency - a.frequency);
  }

  /**
   * Groups the rows of `dataset.data` by the value found in `column` and
   * optionally aggregates numeric columns per group.
   *
   * @param {{data: Object[]}} dataset - Dataset whose `data` rows are grouped.
   * @param {string} column - Property of each row used as the group key.
   * @param {Object<string, string>} [aggregation] - Maps a target column name
   *   to an aggregation name understood by `applyAggregation`. The result is
   *   stored under `<func>_<targetCol>` and is omitted when the group has no
   *   numeric, non-NaN values for that column.
   * @returns {Object<string, {count: number, data: Object[]}>} One entry per
   *   distinct key.
   * @throws {Error} If an aggregation name is not recognised.
   */
  groupBy(dataset, column, aggregation) {
    // Null-prototype object so a key like 'constructor' is not mistaken for
    // an already-existing group.
    const groups = Object.create(null);

    dataset.data.forEach((row) => {
      const key = row[column];
      if (!groups[key]) {
        groups[key] = [];
      }
      groups[key].push(row);
    });

    const wantsAggregation = Boolean(aggregation) && typeof aggregation === 'object';
    const result = {};

    Object.entries(groups).forEach(([key, rows]) => {
      const entry = { count: rows.length, data: rows };

      if (wantsAggregation) {
        Object.entries(aggregation).forEach(([targetCol, func]) => {
          const values = rows
            .map((row) => row[targetCol])
            .filter((v) => typeof v === 'number' && !Number.isNaN(v));

          if (values.length > 0) {
            entry[`${func}_${targetCol}`] = this.applyAggregation(values, func);
          }
        });
      }

      result[key] = entry;
    });

    return result;
  }

  /**
   * Applies a named aggregation to an array of numbers.
   *
   * @param {number[]} values - Values to aggregate.
   * @param {string} func - `'mean'`, `'median'`, `'sum'`, `'min'`, `'max'`,
   *   `'std'`, `'var'` or `'count'`.
   * @returns {number} The aggregated value.
   * @throws {Error} If `func` is not recognised.
   */
  applyAggregation(values, func) {
    switch (func) {
      case 'mean': return this.mean(values);
      case 'median': return this.median(values);
      case 'sum': return values.reduce((a, b) => a + b, 0);
      // reduce instead of Math.min(...values): spreading a very large array
      // into call arguments can exceed the engine's argument limit.
      case 'min': return values.reduce((lowest, v) => Math.min(lowest, v), Infinity);
      case 'max': return values.reduce((highest, v) => Math.max(highest, v), -Infinity);
      case 'std': return this.standardDeviation(values);
      case 'var': return this.variance(values);
      case 'count': return values.length;
      default: throw new Error(`Unknown aggregation function: ${func}`);
    }
  }

  /**
   * Draws a sample of rows from a dataset.
   *
   * @param {{data: Array<*>, length: number}} dataset - Dataset to sample.
   * @param {number} size - Requested number of rows. Fractions are floored
   *   and negative or NaN sizes are treated as 0.
   * @param {string} [method='random'] - `'random'` (without replacement),
   *   `'systematic'` (every k-th row starting at the first), `'first'` or
   *   `'last'`.
   * @returns {Object} A shallow copy of `dataset` with `data` and `length`
   *   replaced by the sample. When `size` covers the whole dataset the copy is
   *   returned unchanged.
   * @throws {Error} If `method` is not recognised.
   */
  sample(dataset, size, method = 'random') {
    // Prefer the real row count; fall back to the advertised length.
    const populationSize = Array.isArray(dataset.data) ? dataset.data.length : dataset.length;

    if (size >= populationSize) {
      return { ...dataset };
    }

    const count = Math.max(0, Math.floor(size)) || 0;
    let sampledData;

    switch (method) {
      case 'random':
        sampledData = this.randomSample(populationSize, count).map((i) => dataset.data[i]);
        break;

      case 'systematic': {
        const interval = count > 0 ? Math.floor(populationSize / count) : 0;
        sampledData = Array.from({ length: count }, (_, i) => dataset.data[i * interval]);
        break;
      }

      case 'first':
        sampledData = dataset.data.slice(0, count);
        break;

      case 'last':
        // Not slice(-count): slice(-0) would return the whole array.
        sampledData = dataset.data.slice(populationSize - count);
        break;

      default:
        throw new Error(`Unknown sampling method: ${method}`);
    }

    return {
      ...dataset,
      data: sampledData,
      length: sampledData.length
    };
  }

  /**
   * Picks distinct random indices from `0 .. populationSize - 1`.
   *
   * Uses a partial Fisher-Yates shuffle, so the cost is linear in the
   * population size rather than population size times sample size.
   *
   * @param {number} populationSize - Number of available indices.
   * @param {number} sampleSize - Number of indices wanted; capped at
   *   `populationSize`.
   * @returns {number[]} The selected indices in random order.
   */
  randomSample(populationSize, sampleSize) {
    const pool = Array.from({ length: populationSize }, (_, i) => i);
    const count = Math.min(pool.length, Math.max(0, Math.ceil(sampleSize) || 0));

    for (let i = 0; i < count; i++) {
      const j = i + Math.floor(Math.random() * (pool.length - i));
      [pool[i], pool[j]] = [pool[j], pool[i]];
    }

    return pool.slice(0, count);
  }

  /**
   * Estimates the sampling distribution of a statistic by resampling with
   * replacement.
   *
   * @param {Array<*>} sample - Observed sample.
   * @param {(string|function(Array<*>): number)} statistic - Statistic name
   *   accepted by `applyStatistic`, or a function of the resample.
   * @param {number} [iterations=1000] - Number of resamples.
   * @param {number} [confidence=0.95] - Coverage of the percentile interval,
   *   strictly between 0 and 1.
   * @returns {{bootstrapStats: number[], mean: number, standardError: number, confidenceInterval: {lower: number, upper: number}}}
   *   Sorted resample statistics, their mean, their standard deviation and
   *   the percentile confidence interval.
   * @throws {RangeError} If `confidence` is not strictly between 0 and 1.
   * @throws {Error} If `statistic` is an unknown name.
   */
  bootstrap(sample, statistic, iterations = 1000, confidence = 0.95) {
    if (!(confidence > 0 && confidence < 1)) {
      throw new RangeError('Confidence level must be a number between 0 and 1');
    }

    const size = sample.length;
    const bootstrapStats = [];

    for (let i = 0; i < iterations; i++) {
      const resample = Array.from(
        { length: size },
        () => sample[Math.floor(Math.random() * size)]
      );
      bootstrapStats.push(this.applyStatistic(resample, statistic));
    }

    bootstrapStats.sort((a, b) => a - b);

    // Rounding strips floating-point noise (1 - 0.95 is not exactly 0.05) so
    // the default level still queries the 0.025 and 0.975 quantiles.
    const tail = this.round((1 - confidence) / 2, 12);

    return {
      bootstrapStats,
      mean: this.mean(bootstrapStats),
      standardError: this.standardDeviation(bootstrapStats),
      confidenceInterval: {
        lower: this.quantile(bootstrapStats, tail),
        upper: this.quantile(bootstrapStats, this.round(1 - tail, 12))
      }
    };
  }

  /**
   * Evaluates a statistic, given by name or as a function, on a sample.
   *
   * @param {Array<*>} sample - Values to summarise.
   * @param {(string|function(Array<*>): number)} statistic - `'mean'`,
   *   `'median'`, `'std'`, `'var'` or a function receiving the sample.
   * @returns {number} The statistic's value.
   * @throws {Error} If `statistic` is neither a known name nor a function.
   */
  applyStatistic(sample, statistic) {
    switch (statistic) {
      case 'mean': return this.mean(sample);
      case 'median': return this.median(sample);
      case 'std': return this.standardDeviation(sample);
      case 'var': return this.variance(sample);
      default:
        if (typeof statistic === 'function') {
          return statistic(sample);
        }
        throw new Error(`Unknown statistic: ${statistic}`);
    }
  }

  /**
   * Cross-tabulates two paired columns.
   *
   * Every cell is initialised to 0 in the order of `columns`, so each row of
   * the table lists its cells in the same order.
   *
   * @param {Array<*>} col1 - Row variable, one entry per observation.
   * @param {Array<*>} col2 - Column variable, paired with `col1` by index.
   * @returns {{table: Object, totals: {row: Object, col: Object, grand: number}, rows: Array<*>, columns: Array<*>}}
   *   Cell counts (`table[rowValue][colValue]`), marginal and grand totals,
   *   and the distinct row and column values in first-seen order.
   * @throws {Error} If the columns differ in length.
   */
  contingencyTable(col1, col2) {
    if (col1.length !== col2.length) {
      throw new Error('Columns must have the same length');
    }

    const uniqueCol1 = [...new Set(col1)];
    const uniqueCol2 = [...new Set(col2)];

    const table = {};
    const totals = { row: {}, col: {}, grand: 0 };

    uniqueCol2.forEach((val2) => {
      totals.col[val2] = 0;
    });

    // Pre-fill every cell so counting never depends on a truthiness check
    // (which inherited members such as 'constructor' would defeat).
    uniqueCol1.forEach((val1) => {
      const cells = {};
      uniqueCol2.forEach((val2) => {
        cells[val2] = 0;
      });
      table[val1] = cells;
      totals.row[val1] = 0;
    });

    col1.forEach((val1, i) => {
      const val2 = col2[i];
      table[val1][val2] += 1;
      totals.row[val1] += 1;
      totals.col[val2] += 1;
      totals.grand += 1;
    });

    return { table, totals, rows: uniqueCol1, columns: uniqueCol2 };
  }

  /**
   * Arithmetic mean.
   *
   * @param {number[]} arr - Values to average.
   * @returns {number} The mean; NaN for an empty array.
   */
  mean(arr) {
    return arr.reduce((sum, val) => sum + val, 0) / arr.length;
  }

  /**
   * Median of a numeric array (average of the two middle values when the
   * length is even). The input is not modified.
   *
   * @param {number[]} arr - Values to summarise.
   * @returns {number} The median; NaN for an empty array.
   */
  median(arr) {
    const sorted = [...arr].sort((a, b) => a - b);
    const mid = Math.floor(sorted.length / 2);
    return sorted.length % 2 === 0
      ? (sorted[mid - 1] + sorted[mid]) / 2
      : sorted[mid];
  }

  /**
   * Quantile with linear interpolation between the two closest ranks.
   * The input is not modified.
   *
   * @param {number[]} arr - Values to summarise.
   * @param {number} q - Quantile, expected to lie in `[0, 1]`.
   * @returns {number} The interpolated quantile; NaN for an empty array.
   */
  quantile(arr, q) {
    if (arr.length === 0) {
      return NaN;
    }

    const sorted = [...arr].sort((a, b) => a - b);
    const index = (sorted.length - 1) * q;
    const lower = Math.floor(index);
    const upper = Math.ceil(index);

    if (lower === upper) {
      return sorted[lower];
    }

    const weight = index % 1;
    return sorted[lower] * (1 - weight) + sorted[upper] * weight;
  }

  /**
   * Sample standard deviation (square root of `variance`).
   *
   * @param {number[]} arr - Values to summarise.
   * @returns {number} The standard deviation; NaN when `variance` is NaN.
   */
  standardDeviation(arr) {
    return Math.sqrt(this.variance(arr));
  }

  /**
   * Sample variance with the `n - 1` denominator.
   *
   * @param {number[]} arr - Values to summarise.
   * @returns {number} The variance; NaN for fewer than two values.
   */
  variance(arr) {
    const mean = this.mean(arr);
    const squaredError = arr.reduce((sum, val) => sum + Math.pow(val - mean, 2), 0);
    return squaredError / (arr.length - 1);
  }

  /**
   * Rounds a number to a fixed number of decimal places.
   *
   * @param {number} value - Number to round.
   * @param {number} [decimals=4] - Decimal places to keep.
   * @returns {number} The rounded number.
   */
  round(value, decimals = 4) {
    const factor = Math.pow(10, decimals);
    return Math.round(value * factor) / factor;
  }

  /**
   * Rounds numeric input and passes any other value through unchanged.
   *
   * @param {*} value - Value to format.
   * @param {number} [decimals=4] - Decimal places to keep for numbers.
   * @returns {*} The rounded number, or `value` itself when it is not a number.
   */
  formatNumber(value, decimals = 4) {
    if (typeof value !== 'number') return value;
    return this.round(value, decimals);
  }
}
|
305
|
+
|
306
|
+
export default Utils;
|
@@ -0,0 +1,205 @@
|
|
1
|
+
/**
 * Input validation helpers for datasets and for the arguments of the
 * statistical routines (sample sizes, confidence levels, paired columns...).
 *
 * `validateData` reports problems in its return value; the other `validate*`
 * methods throw an `Error` on invalid input and return `true` otherwise.
 */
class Validator {
  /**
   * Checks the overall structure of a dataset without throwing.
   *
   * Errors are raised for a non-object dataset, a missing or non-array `data`
   * or `headers`, duplicate headers, and rows that are not objects. Warnings
   * are raised for an empty dataset and for rows whose keys do not match the
   * headers.
   *
   * @param {{data: Object[], headers: string[]}} dataset - Dataset to check.
   * @returns {{valid: boolean, errors: string[], warnings: string[]}}
   *   `valid` is true when no errors were recorded.
   */
  validateData(dataset) {
    const errors = [];
    const warnings = [];

    if (!dataset || typeof dataset !== 'object') {
      errors.push('Dataset must be an object');
      return { valid: false, errors, warnings };
    }

    // Everything below is guarded on these flags so that a truthy non-array
    // `data`/`headers` is reported instead of crashing on `.forEach`/`.filter`.
    const hasData = Array.isArray(dataset.data);
    const hasHeaders = Array.isArray(dataset.headers);

    if (!hasData) {
      errors.push('Dataset must contain a data array');
    }

    if (!hasHeaders) {
      errors.push('Dataset must contain a headers array');
    }

    if (hasData && dataset.data.length === 0) {
      warnings.push('Dataset is empty');
    }

    if (hasHeaders) {
      const headerSet = new Set(dataset.headers);
      if (headerSet.size !== dataset.headers.length) {
        errors.push('Duplicate column headers found');
      }

      if (hasData) {
        dataset.data.forEach((row, index) => {
          if (row === null || typeof row !== 'object') {
            errors.push(`Row ${index}: Row must be an object`);
            return;
          }

          const rowKeys = Object.keys(row);
          const rowKeySet = new Set(rowKeys);
          const missingHeaders = dataset.headers.filter((h) => !rowKeySet.has(h));
          const extraKeys = rowKeys.filter((k) => !headerSet.has(k));

          if (missingHeaders.length > 0) {
            warnings.push(`Row ${index}: Missing columns: ${missingHeaders.join(', ')}`);
          }

          if (extraKeys.length > 0) {
            warnings.push(`Row ${index}: Extra columns: ${extraKeys.join(', ')}`);
          }
        });
      }
    }

    return {
      valid: errors.length === 0,
      errors,
      warnings
    };
  }

  /**
   * Ensures a column is an array holding at least one finite number.
   *
   * @param {Array<*>} column - Column to check.
   * @returns {{valid: boolean, validCount: number, invalidCount: number, cleanData: number[]}}
   *   Counts of finite-number and other entries, plus the finite numbers.
   * @throws {Error} If `column` is not an array or has no finite numbers.
   */
  validateNumericColumn(column) {
    if (!Array.isArray(column)) {
      throw new Error('Column must be an array');
    }

    // Number.isFinite rejects non-numbers, NaN and +/-Infinity in one test.
    const numericValues = column.filter((val) => Number.isFinite(val));

    if (numericValues.length === 0) {
      throw new Error('Column contains no valid numeric values');
    }

    return {
      valid: true,
      validCount: numericValues.length,
      invalidCount: column.length - numericValues.length,
      cleanData: numericValues
    };
  }

  /**
   * Ensures a sample is an array with at least `minSize` entries.
   *
   * @param {Array<*>} sample - Sample to check.
   * @param {number} [minSize=2] - Minimum number of entries.
   * @returns {boolean} `true` when valid.
   * @throws {Error} If `sample` is not an array or is too short.
   */
  validateSampleSize(sample, minSize = 2) {
    if (!Array.isArray(sample)) {
      throw new Error('Sample must be an array');
    }

    if (sample.length < minSize) {
      throw new Error(`Sample size (${sample.length}) must be at least ${minSize}`);
    }

    return true;
  }

  /**
   * Ensures a confidence level is a number strictly between 0 and 1.
   *
   * @param {number} confidence - Confidence level to check.
   * @returns {boolean} `true` when valid.
   * @throws {Error} If `confidence` is not a number in the open interval
   *   (0, 1); NaN is rejected.
   */
  validateConfidenceLevel(confidence) {
    // Written as a positive range test so that NaN (for which every
    // comparison is false) is rejected as well.
    const inRange = typeof confidence === 'number' && confidence > 0 && confidence < 1;
    if (!inRange) {
      throw new Error('Confidence level must be a number between 0 and 1');
    }
    return true;
  }

  /**
   * Validates two paired columns for a correlation analysis.
   *
   * @param {Array<*>} col1 - First column.
   * @param {Array<*>} col2 - Second column.
   * @returns {boolean} `true` when valid.
   * @throws {Error} If either column fails `validateNumericColumn`, the
   *   lengths differ, or there are fewer than 3 observations.
   */
  validateCorrelationInputs(col1, col2) {
    this.validateNumericColumn(col1);
    this.validateNumericColumn(col2);

    if (col1.length !== col2.length) {
      throw new Error('Columns must have the same length');
    }

    if (col1.length < 3) {
      throw new Error('Need at least 3 paired observations for correlation');
    }

    return true;
  }

  /**
   * Validates predictor and response arrays for a regression.
   *
   * @param {Array<*>} x - Predictor values.
   * @param {Array<*>} y - Response values.
   * @returns {boolean} `true` when valid.
   * @throws {Error} If either array fails `validateNumericColumn`, the
   *   lengths differ, there are fewer than 3 points, or `x` is constant.
   */
  validateRegressionInputs(x, y) {
    this.validateNumericColumn(x);
    this.validateNumericColumn(y);

    if (x.length !== y.length) {
      throw new Error('X and Y arrays must have the same length');
    }

    if (x.length < 3) {
      throw new Error('Need at least 3 data points for regression');
    }

    // A constant column can yield a tiny non-zero variance because the mean
    // itself is rounded (e.g. [0.1, 0.1, 0.1]), so identical values are
    // detected directly in addition to the exact-zero variance test.
    const isConstant = x.every((value) => value === x[0]);
    if (isConstant || this.calculateVariance(x) === 0) {
      throw new Error('X values must have non-zero variance');
    }

    return true;
  }

  /**
   * Sample variance with the `n - 1` denominator.
   *
   * @param {number[]} arr - Values to summarise.
   * @returns {number} The variance; NaN for fewer than two values.
   */
  calculateVariance(arr) {
    const mean = arr.reduce((sum, val) => sum + val, 0) / arr.length;
    const squaredError = arr.reduce((sum, val) => sum + Math.pow(val - mean, 2), 0);
    return squaredError / (arr.length - 1);
  }

  /**
   * Validates the groups passed to an ANOVA.
   *
   * @param {Array<Array<*>>} groups - One array of observations per group.
   * @returns {boolean} `true` when valid.
   * @throws {Error} If there are fewer than 2 groups, a group is not an
   *   array, a group has fewer than 2 entries, or a group has no finite
   *   numbers.
   */
  validateGroupsForANOVA(groups) {
    if (!Array.isArray(groups) || groups.length < 2) {
      throw new Error('ANOVA requires at least 2 groups');
    }

    groups.forEach((group, index) => {
      if (!Array.isArray(group)) {
        throw new Error(`Group ${index} must be an array`);
      }

      this.validateSampleSize(group, 2);
      this.validateNumericColumn(group);
    });

    return true;
  }

  /**
   * Validates two paired categorical columns for a chi-square test.
   *
   * @param {Array<*>} col1 - First column.
   * @param {Array<*>} col2 - Second column.
   * @returns {boolean} `true` when valid.
   * @throws {Error} If either argument is not an array, the lengths differ,
   *   or there are fewer than 5 observations.
   */
  validateContingencyTable(col1, col2) {
    if (!Array.isArray(col1) || !Array.isArray(col2)) {
      throw new Error('Both columns must be arrays');
    }

    if (col1.length !== col2.length) {
      throw new Error('Columns must have the same length');
    }

    if (col1.length < 5) {
      throw new Error('Need at least 5 observations for chi-square test');
    }

    return true;
  }

  /**
   * @param {*} value - Value to test.
   * @returns {boolean} True when `value` is a number with no fractional part.
   */
  isInteger(value) {
    return typeof value === 'number' && Number.isInteger(value);
  }

  /**
   * @param {*} value - Value to test.
   * @returns {boolean} True when `value` is a number greater than 0.
   */
  isPositive(value) {
    return typeof value === 'number' && value > 0;
  }

  /**
   * @param {*} value - Value to test.
   * @param {number} min - Inclusive lower bound.
   * @param {number} max - Inclusive upper bound.
   * @returns {boolean} True when `value` is a number within `[min, max]`.
   */
  isInRange(value, min, max) {
    return typeof value === 'number' && value >= min && value <= max;
  }

  /**
   * @param {*} data - Value to test.
   * @param {number} minimum - Required number of entries.
   * @returns {boolean} True when `data` is an array with at least `minimum`
   *   entries.
   */
  hasMinimumObservations(data, minimum) {
    return Array.isArray(data) && data.length >= minimum;
  }

  /**
   * @param {Iterable<*>} column - Values to inspect.
   * @returns {boolean} True when the column holds exactly one distinct value
   *   (an empty column therefore yields false).
   */
  checkForConstantValues(column) {
    return new Set(column).size === 1;
  }

  /**
   * Validates the sample(s) passed to a hypothesis test.
   *
   * @param {Array<*>} sample1 - First (or only) sample.
   * @param {Array<*>} [sample2] - Second sample; size-checked for
   *   `'two-sample'` and `'paired'` tests and numerically checked whenever it
   *   is provided.
   * @param {string} [testType] - `'two-sample'` and `'paired'` require
   *   `sample2`; `'paired'` additionally requires equal lengths.
   * @returns {boolean} `true` when valid.
   * @throws {Error} If a sample is too small, is not an array, has no finite
   *   numbers, or paired samples differ in length.
   */
  validateHypothesisTestInputs(sample1, sample2, testType) {
    this.validateSampleSize(sample1, 2);

    const needsSecondSample = testType === 'two-sample' || testType === 'paired';
    if (needsSecondSample) {
      this.validateSampleSize(sample2, 2);

      if (testType === 'paired' && sample1.length !== sample2.length) {
        throw new Error('Paired samples must have the same length');
      }
    }

    this.validateNumericColumn(sample1);
    if (sample2) {
      this.validateNumericColumn(sample2);
    }

    return true;
  }
}
|
204
|
+
|
205
|
+
export default Validator;
|