@datagrok/eda 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. package/README.md +3 -0
  2. package/detectors.js +9 -0
  3. package/dist/111.js +2 -0
  4. package/dist/146.js +2 -0
  5. package/dist/155.js +2 -0
  6. package/dist/355.js +2 -0
  7. package/dist/584.js +2 -0
  8. package/dist/604.js +2 -0
  9. package/dist/632.js +2 -0
  10. package/dist/645.js +2 -0
  11. package/dist/93.js +2 -0
  12. package/dist/d711f70338306e5bddc4.wasm +0 -0
  13. package/dist/package-test.js +2 -0
  14. package/dist/package.js +2 -0
  15. package/package.json +49 -0
  16. package/package.png +0 -0
  17. package/scripts/command.txt +1 -0
  18. package/scripts/exportForTS.py +862 -0
  19. package/scripts/exportForTSConstants.py +93 -0
  20. package/scripts/func.json +1 -0
  21. package/scripts/module.json +11 -0
  22. package/src/EDAtools.ts +46 -0
  23. package/src/EDAui.ts +118 -0
  24. package/src/dataGenerators.ts +74 -0
  25. package/src/demos.ts +38 -0
  26. package/src/package-test.ts +12 -0
  27. package/src/package.ts +248 -0
  28. package/src/svm.ts +485 -0
  29. package/src/utils.ts +51 -0
  30. package/tsconfig.json +71 -0
  31. package/wasm/EDA.js +443 -0
  32. package/wasm/EDA.wasm +0 -0
  33. package/wasm/EDAAPI.js +131 -0
  34. package/wasm/EDAForWebWorker.js +21 -0
  35. package/wasm/PCA/PCA.cpp +151 -0
  36. package/wasm/PCA/PCA.h +48 -0
  37. package/wasm/PLS/PLS.h +64 -0
  38. package/wasm/PLS/pls.cpp +393 -0
  39. package/wasm/callWasm.js +475 -0
  40. package/wasm/callWasmForWebWorker.js +706 -0
  41. package/wasm/dataGenerators.h +169 -0
  42. package/wasm/dataMining.h +116 -0
  43. package/wasm/pcaExport.cpp +64 -0
  44. package/wasm/plsExport.cpp +75 -0
  45. package/wasm/svm.h +608 -0
  46. package/wasm/svmApi.cpp +323 -0
  47. package/wasm/workers/errorWorker.js +13 -0
  48. package/wasm/workers/generateDatasetWorker.js +13 -0
  49. package/wasm/workers/normalizeDatasetWorker.js +13 -0
  50. package/wasm/workers/partialLeastSquareRegressionWorker.js +13 -0
  51. package/wasm/workers/predictByLSSVMWorker.js +13 -0
  52. package/wasm/workers/principalComponentAnalysisWorker.js +13 -0
  53. package/wasm/workers/trainAndAnalyzeLSSVMWorker.js +13 -0
  54. package/wasm/workers/trainLSSVMWorker.js +13 -0
  55. package/webpack.config.js +37 -0
@@ -0,0 +1,169 @@
1
+ // dataGenerators.h
2
+
3
+ // Tools for generating datasets for testing SVM.
4
+
5
+ #ifndef DATA_GENERATORS_H
6
+ #define DATA_GENERATORS_H
7
+
8
+ #include<cstdlib>
9
+ using namespace std;
10
+
11
+ #include "../../../../Eigen/Eigen/Dense"
12
+ using namespace Eigen;
13
+
14
+ #include "svm.h"
15
+
16
+ namespace svm
17
+ {
18
// Random-generation settings shared by the dataset generators below.
// SEED is the base value handed to srand(); RAND_SCALE is the modulus used
// to turn rand() output into fractions with a step of 1 / RAND_SCALE.
constexpr unsigned SEED = 10214313;
constexpr int RAND_SCALE = 1000;
21
+
22
+ /* Change data labels by opposite values.
23
+ Each label value is replaced by the corresponding opposite one
24
+ with the specified probability
25
+ labels - data labels
26
+ samplesCount - number of labels
27
+ changeProbability - probability that each label is changed */
28
+ template<typename Float>
29
+ int changeLabels(Float* labels, int samplesCount, Float changeProbability) noexcept
30
+ {
31
+ using namespace svm;
32
+
33
+ // check probability value
34
+ if ((changeProbability < static_cast<Float>(0)) ||
35
+ (changeProbability > static_cast<Float>(1)))
36
+ return INCORRECT_PROBABILITY;
37
+
38
+ // check size
39
+ if (samplesCount < 1)
40
+ return INCORRECT_SIZE;
41
+
42
+ // randomize
43
+ srand(SEED + samplesCount);
44
+
45
+ // change values in a random manner
46
+ for (int i = 0; i < samplesCount; ++i)
47
+ if (static_cast<Float>(rand() % RAND_SCALE) / RAND_SCALE < changeProbability)
48
+ labels[i] = -labels[i];
49
+
50
+ return NO_ERRORS;
51
+ } // changeLabels
52
+
53
+ /* Generate dataset: separable case. Features are generated randomly using the uniform distribution.
54
+ Each feature belongs to the corresponding segment [min, max].
55
+ kernel - type of kernel
56
+ kernelParams - parameters of kernel
57
+ featuresCount - number of features, i.e. dimension
58
+ samplesCount - number of the generated samples
59
+ minVal - min value
60
+ maxVal - max value
61
+ data - generated data
62
+ labels - generated labels
63
+ WARNING. Memory for data and labels must be allocated outside this function. */
64
+ template<typename Float>
65
+ int generateSeparable(int kernel, float kernelParams[MAX_NUM_OF_KERNEL_PARAM],
66
+ int featuresCount, int samplesCount,
67
+ Float minVal, Float maxVal,
68
+ Float* data, Float* labels) noexcept
69
+ {
70
+ using namespace svm;
71
+
72
+ // check parameters correctness
73
+ if (!areKernelParametersCorrect(kernel, kernelParams))
74
+ return INCORRECT_PARAMETER_OF_KERNEL;
75
+
76
+ // check sizes
77
+ if ((featuresCount < 1) || (samplesCount < 1))
78
+ return INCORRECT_SIZE;
79
+
80
+ // randomize
81
+ srand(SEED + samplesCount + featuresCount);
82
+
83
+ // assign data pointer with a matrix
84
+ Map<Matrix<Float, Dynamic, Dynamic, ColMajor>> X(data, samplesCount, featuresCount);
85
+
86
+ // generate random matrix: values from [-1, 1] are generated
87
+ X = Matrix<Float, Dynamic, Dynamic, ColMajor>::Random(samplesCount, featuresCount);
88
+
89
+ // generate core vector
90
+ RowVector<float, Dynamic> v(featuresCount);
91
+
92
+ // linear transform coefficients
93
+ Float c1 = (maxVal - minVal) / 2;
94
+ Float c2 = (maxVal + minVal) / 2;
95
+
96
+ // rescale data: each feature should belong to the correspondent [min, max] segment
97
+ for (int i = 0; i < featuresCount; ++i)
98
+ {
99
+ // linear [-1,1]-to-[min,max] transform
100
+ X.col(i) = X.col(i) * c1 + c2 * Vector<Float, Dynamic>::Ones(samplesCount);
101
+
102
+ Float randNum = static_cast<Float>(-0.5) + static_cast<Float>(rand() % RAND_SCALE) / RAND_SCALE;
103
+ v(i) = randNum * c1 + c2;
104
+ }
105
+
106
+ // bias value
107
+ Float bias = kernelFunc(kernel, kernelParams, v, v);
108
+
109
+ // This is a heruistics
110
+ if (kernel == RBF)
111
+ bias /= 2;
112
+
113
+ // auxilliry vector
114
+ RowVector<float, Dynamic> w(featuresCount);
115
+
116
+ // compute labels
117
+ for (int i = 0; i < samplesCount; ++i)
118
+ {
119
+ w = X.row(i);
120
+ Float val = kernelFunc(kernel, kernelParams, w, v) - bias;
121
+
122
+ labels[i] = (val > static_cast<Float>(0)) ? static_cast<Float>(1) : static_cast<Float>(-1);
123
+ }
124
+
125
+ return NO_ERRORS;
126
+ } // generateSeparable
127
+
128
+ /* Generate dataset: non-separable case.
129
+ Features are generated randomly using the uniform distribution.
130
+ Each feature belongs to the corresponding segment [min, max].
131
+ kernel - type of kernel
132
+ kernelParams - parameters of kernel
133
+ featuresCount - number of features, i.e. dimension
134
+ samplesCount - number of the generated samples
135
+ minVal - min value
136
+ maxVal - max value
137
+ data - generated data
138
+ labels - generated labels
139
+ violatorsPercentage - percentage of values that violate separability
140
+
141
+ WARNINGS. 1. Memory for data and labels must be allocated outside this function.
142
+ 2. Since violators are generated randomly, actual number of vilators
143
+ may differ from the given percentage. */
144
+ template<typename Float>
145
+ int generateNonSeparable(int kernel, float kernelParams[MAX_NUM_OF_KERNEL_PARAM],
146
+ int featuresCount, int samplesCount,
147
+ Float minVal, Float maxVal,
148
+ Float* data, Float* labels,
149
+ Float violatorsPercentage) noexcept
150
+ {
151
+ using namespace svm;
152
+
153
+ // check percentage
154
+ if ((violatorsPercentage < static_cast<Float>(0)) ||
155
+ (violatorsPercentage > static_cast<Float>(100)))
156
+ return INCORRECT_PERCENTAGE;
157
+
158
+ // generate separable dataset
159
+ int resCode = generateSeparable(kernel, kernelParams, featuresCount, samplesCount,
160
+ minVal, maxVal, data, labels);
161
+ if (resCode != NO_ERRORS)
162
+ return resCode;
163
+
164
+ // create violators
165
+ return changeLabels(labels, samplesCount, violatorsPercentage / 100);
166
+ } // generateNonSeparable
167
+ }; // svm
168
+
169
+ #endif // DATA_GENERATORS_H
@@ -0,0 +1,116 @@
1
+ // dataMining.h
2
+
3
+ // Data mining tools
4
+
5
+ #ifndef DATA_MINING_H
6
+ #define DATA_MINING_H
7
+
8
+ #include "../../../../Eigen/Eigen/Dense"
9
+ using namespace Eigen;
10
+
11
+ // data mining tools
12
+ namespace dmt {
13
+
14
// Result codes returned by the data mining tools (0 means success).
enum ResultCode {
  NO_ERRORS = 0,
  UNKNOWN_PROBLEM = 1,
  INCORRECT_SIZE = 2
};
19
+
20
// Confusion matrix layout: a flat array of CONFUSION_MATR_SIZE counters,
// each of the four outcomes has a fixed position.
constexpr int CONFUSION_MATR_SIZE = 4;
constexpr int TRUE_POSITIVE_INDEX = 0;
constexpr int FALSE_NEGATIVE_INDEX = 1;
constexpr int FALSE_POSITIVE_INDEX = 2;
constexpr int TRUE_NEGATIVE_INDEX = 3;
26
+
27
+ /* Create normalized dataset from columns data.
28
+ Each column of the ouput is centered and normalized.
29
+ columsData - pointer to columns data
30
+ rowCount - number of rows
31
+ colCount - number of columns
32
+ normalizedDataRows - pointer to normalized data rows
33
+ means - mean values of source columns
34
+ stdDevs - standard deviations of source columns
35
+
36
+ REMARKS. 1. In DATAGROK, column-oriented data storage is used,
37
+ but row-oriented approach is preffered in SVM, and
38
+ this function provides it.
39
+ 2. Row-oriented data storage is a result. */
40
+ template<typename Float>
41
+ int getNormalizedDataset(Float* columsData, int rowCount, int colCount,
42
+ Float* normalizedDataRows, Float* means, Float* stdDevs) noexcept
43
+ {
44
+ // check sizes
45
+ if ((rowCount < 1) || (colCount < 1))
46
+ return INCORRECT_SIZE;
47
+
48
+ // pointers-to-matrices assignment
49
+ Map < Matrix<Float, Dynamic, Dynamic, ColMajor>> A(columsData, rowCount, colCount);
50
+ Map < Matrix<Float, Dynamic, Dynamic, RowMajor>> B(normalizedDataRows, rowCount, colCount);
51
+ Map < Vector<Float, Dynamic> > mu(means, colCount);
52
+ Map < Vector<Float, Dynamic> > sigma(stdDevs, colCount);
53
+
54
+ // compute mean values & standard deviations
55
+ for (int i = 0; i < colCount; ++i)
56
+ {
57
+ mu(i) = A.col(i).mean();
58
+ sigma(i) = sqrt(A.col(i).squaredNorm() / rowCount - mu(i) * mu(i));
59
+ }
60
+
61
+ // get A centered
62
+ B = A.rowwise() - mu.transpose();
63
+
64
+ // norm columns of B
65
+ for (int i = 0; i < colCount; ++i)
66
+ {
67
+ Float current = sigma(i);
68
+
69
+ if (current > static_cast<Float>(0))
70
+ B.col(i) /= current;
71
+ }
72
+
73
+ return NO_ERRORS;
74
+ } // createNormalizedDataset
75
+
76
+ /* Compare labels and their prediciotns: BINARY CLASSIFICATION CASE.
77
+ labels - training labels
78
+ predictions - predicted labels
79
+ correctness - array of mistakes (1 - correct prediction, 0 - incorrect prediction)
80
+ samplesCount - number of training samples
81
+ confusionMatrix - confusion matrix */
82
+ template<typename Float>
83
+ int compareLabelsAndTheirPredictions(Float* labels, Float* predictions,
84
+ Float* correctness, int samplesCount,
85
+ int confusionMatrix[CONFUSION_MATR_SIZE])
86
+ {
87
+ Float zero = static_cast<Float>(0);
88
+
89
+ // initialization
90
+ for (int i = 0; i < CONFUSION_MATR_SIZE; ++i)
91
+ confusionMatrix[i] = 0;
92
+
93
+ // labels vs. prediction comparison
94
+ for (int i = 0; i < samplesCount; ++i)
95
+ {
96
+ correctness[i] = labels[i] * predictions[i];
97
+
98
+ if (labels[i] > zero)
99
+ if (predictions[i] > zero)
100
+ ++confusionMatrix[TRUE_POSITIVE_INDEX];
101
+ else
102
+ ++confusionMatrix[FALSE_NEGATIVE_INDEX];
103
+ else
104
+ if (predictions[i] > zero)
105
+ ++confusionMatrix[FALSE_POSITIVE_INDEX];
106
+ else
107
+ ++confusionMatrix[TRUE_NEGATIVE_INDEX];
108
+ }
109
+
110
+ return NO_ERRORS;
111
+ } // compareLabelsAndTheirPredictions
112
+
113
+ } // dmt
114
+
115
+ #endif // DATA_MINING_H
116
+
@@ -0,0 +1,64 @@
1
+ // This file contains C++-functions that are exported to wasm.
2
+
3
+ // The tool Emscripten is applied (the header emscripten.h is included
4
+ // and each exported function is marked by EMSCRIPTEN_KEEPALIVE).
5
+
6
+ // Also, each function has a special DATAGROK annotation for C++-functions.
7
+ // This approach provides further usage of C++-to-wasm export script that
8
+ // performs all routine steps.
9
+
10
+ #include <emscripten.h>
11
+
12
+ // The following provides convenient naming of the exported functions.
13
+ extern "C" {
14
+
15
+ int principalComponentAnalysis(float * data,
16
+ int dataNumOfRows,
17
+ int dataNumOfColumns,
18
+ int numOfPrincipalComponents,
19
+ int centerNum,
20
+ int scaleNum,
21
+ float * principalComponents,
22
+ int principalComponentsNumOfRows,
23
+ int principalComponentsNumOfColumns);
24
+
25
+ float error(float * data1, int data1Length, float * data2, int data2Length);
26
+ }
27
+
28
+ #include "PCA\PCA.h"
29
+
30
+ //name: principalComponentAnalysis
31
+ //input: dataframe table
32
+ //input: column_list columns
33
+ //input: int componentsCount
34
+ //input: int centerNum
35
+ //input: int scaleNum
36
+ //output: column_list components [new(columns.rowCount, componentsCount)]
37
+ //output: dataframe result [components]
38
+ EMSCRIPTEN_KEEPALIVE
39
+ int principalComponentAnalysis(float * data,
40
+ int dataNumOfRows,
41
+ int dataNumOfColumns,
42
+ int numOfPrincipalComponents,
43
+ int centerNum,
44
+ int scaleNum,
45
+ float * principalComponents,
46
+ int principalComponentsNumOfRows,
47
+ int principalComponentsNumOfColumns)
48
+ {
49
+ return pca::pcaUsingCorrelationMatrix(data, dataNumOfColumns, dataNumOfRows,
50
+ numOfPrincipalComponents, centerNum, scaleNum, principalComponents, 0);
51
+ }
52
+
53
+ //name: error
54
+ //input: dataframe df
55
+ //input: column col1
56
+ //input: column col2
57
+ //output: double mad [_callResult]
58
+ EMSCRIPTEN_KEEPALIVE
59
+ float error(float * data1, int data1Length, float * data2, int data2Length)
60
+ {
61
+ return pca::mad(data1, data2, data1Length);
62
+ }
63
+
64
+
@@ -0,0 +1,75 @@
1
+ // This file contains C++-functions that are exported to wasm.
2
+
3
+ // The tool Emscripten is applied (the header emscripten.h is included
4
+ // and each exported function is marked by EMSCRIPTEN_KEEPALIVE).
5
+
6
+ // Also, each function has a special DATAGROK annotation for C++-functions.
7
+ // This approach provides further usage of C++-to-wasm export script that
8
+ // performs all routine steps.
9
+
10
+ #include <emscripten.h>
11
+
12
+ // The following provides convenient naming of the exported functions.
13
+ extern "C" {
14
+
15
+ int partialLeastSquareRegression(float * predictorColumns,
16
+ int rowCount,
17
+ int columnCount,
18
+ float * responseColumn,
19
+ int responceColumnLength,
20
+ int componentsCount,
21
+ float * predictionColumn,
22
+ int predictionColumnLength,
23
+ float * regressionCoefficients,
24
+ int regressionCoefficientsLength,
25
+ float * predictorScoresColumns,
26
+ int predictorScoresColumnsRowCount,
27
+ int predictorScoresColumnsColumnCount,
28
+ float * predictionScoresColumns,
29
+ int predictionScoresColumnsRowCount,
30
+ int predictionScoresColumnsColumnCount,
31
+ float * predictionLoadingsColumns,
32
+ int predictionLoadingsColumnsRowCount,
33
+ int predictionLoadingsColumnsColumnCount);
34
+ }
35
+
36
+ #include "PLS\PLS.h"
37
+
38
+ //name: partialLeastSquareRegression
39
+ //input: dataframe table
40
+ //input: column_list features
41
+ //input: column predict
42
+ //input: int componentsCount
43
+ //output: column prediction [new(predict.rowCount)]
44
+ //output: column regressionCoefficients [new(features.columnCount)]
45
+ //output: column_list tScores [new(predict.rowCount, componentsCount)]
46
+ //output: column_list uScores [new(predict.rowCount, componentsCount)]
47
+ //output: column_list xLoadings [new(features.columnCount, componentsCount)]
48
+ EMSCRIPTEN_KEEPALIVE
49
+ int partialLeastSquareRegression(float * predictorColumns,
50
+ int rowCount,
51
+ int columnCount,
52
+ float * responseColumn,
53
+ int responceColumnLength,
54
+ int componentsCount,
55
+ float * predictionColumn,
56
+ int predictionColumnLength,
57
+ float * regressionCoefficients,
58
+ int regressionCoefficientsLength,
59
+ float * predictorScoresColumns,
60
+ int predictorScoresColumnsRowCount,
61
+ int predictorScoresColumnsColumnCount,
62
+ float * predictionScoresColumns,
63
+ int predictionScoresColumnsRowCount,
64
+ int predictionScoresColumnsColumnCount,
65
+ float * predictionLoadingsColumns,
66
+ int predictionLoadingsColumnsRowCount,
67
+ int predictionLoadingsColumnsColumnCount)
68
+ {
69
+ return pls::partialLeastSquareExtended(predictorColumns, rowCount, columnCount,
70
+ responseColumn, componentsCount, predictionColumn, regressionCoefficients,
71
+ predictorScoresColumns, predictionScoresColumns, predictionLoadingsColumns);
72
+ }
73
+
74
+
75
+