@memlab/core 1.1.5 → 1.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. package/dist/__tests__/parser/HeapParser.test.d.ts +1 -1
  2. package/dist/__tests__/parser/HeapParser.test.js +3 -3
  3. package/dist/__tests__/parser/NodeHeap.test.d.ts +1 -1
  4. package/dist/__tests__/parser/NodeHeap.test.js +6 -6
  5. package/dist/__tests__/parser/StringNode.test.d.ts +1 -1
  6. package/dist/__tests__/parser/StringNode.test.js +2 -2
  7. package/dist/__tests__/parser/traverse/HeapNodeTraverse.test.d.ts +1 -1
  8. package/dist/__tests__/parser/traverse/HeapNodeTraverse.test.js +3 -3
  9. package/dist/__tests__/utils/utils.test.d.ts +1 -1
  10. package/dist/__tests__/utils/utils.test.js +1 -1
  11. package/dist/index.d.ts +4 -2
  12. package/dist/index.js +6 -3
  13. package/dist/lib/BaseOption.d.ts +1 -1
  14. package/dist/lib/BaseOption.js +1 -1
  15. package/dist/lib/BrowserInfo.d.ts +1 -1
  16. package/dist/lib/BrowserInfo.js +1 -1
  17. package/dist/lib/Config.d.ts +11 -2
  18. package/dist/lib/Config.js +25 -5
  19. package/dist/lib/Console.d.ts +1 -1
  20. package/dist/lib/Console.js +1 -1
  21. package/dist/lib/Constant.d.ts +1 -1
  22. package/dist/lib/Constant.js +1 -1
  23. package/dist/lib/FileManager.d.ts +1 -1
  24. package/dist/lib/FileManager.js +5 -3
  25. package/dist/lib/HeapAnalyzer.d.ts +1 -1
  26. package/dist/lib/HeapAnalyzer.js +26 -10
  27. package/dist/lib/HeapParser.d.ts +2 -2
  28. package/dist/lib/HeapParser.js +2 -2
  29. package/dist/lib/InternalValueSetter.d.ts +1 -1
  30. package/dist/lib/InternalValueSetter.js +1 -1
  31. package/dist/lib/NodeHeap.d.ts +53 -10
  32. package/dist/lib/NodeHeap.js +73 -22
  33. package/dist/lib/PackageInfoLoader.js +2 -2
  34. package/dist/lib/ProcessManager.d.ts +1 -1
  35. package/dist/lib/ProcessManager.js +1 -1
  36. package/dist/lib/Serializer.d.ts +1 -1
  37. package/dist/lib/Serializer.js +49 -26
  38. package/dist/lib/StringLoader.d.ts +2 -2
  39. package/dist/lib/StringLoader.js +2 -2
  40. package/dist/lib/Types.d.ts +111 -36
  41. package/dist/lib/Types.js +1 -1
  42. package/dist/lib/Utils.d.ts +1 -1
  43. package/dist/lib/Utils.js +55 -31
  44. package/dist/lib/heap-data/HeapEdge.d.ts +2 -2
  45. package/dist/lib/heap-data/HeapEdge.js +2 -2
  46. package/dist/lib/heap-data/HeapLocation.d.ts +2 -2
  47. package/dist/lib/heap-data/HeapLocation.js +2 -2
  48. package/dist/lib/heap-data/HeapNode.d.ts +3 -2
  49. package/dist/lib/heap-data/HeapNode.js +6 -2
  50. package/dist/lib/heap-data/HeapSnapshot.d.ts +3 -2
  51. package/dist/lib/heap-data/HeapSnapshot.js +6 -33
  52. package/dist/lib/heap-data/HeapStringNode.d.ts +2 -2
  53. package/dist/lib/heap-data/HeapStringNode.js +4 -2
  54. package/dist/lib/heap-data/HeapUtils.d.ts +2 -2
  55. package/dist/lib/heap-data/HeapUtils.js +2 -2
  56. package/dist/lib/heap-data/MemLabTagStore.d.ts +23 -0
  57. package/dist/lib/heap-data/MemLabTagStore.js +110 -0
  58. package/dist/lib/leak-filters/BaseLeakFilter.rule.d.ts +1 -1
  59. package/dist/lib/leak-filters/BaseLeakFilter.rule.js +1 -1
  60. package/dist/lib/leak-filters/LeakFilterRuleList.d.ts +1 -1
  61. package/dist/lib/leak-filters/LeakFilterRuleList.js +1 -1
  62. package/dist/lib/leak-filters/LeakObjectFilter.d.ts +1 -1
  63. package/dist/lib/leak-filters/LeakObjectFilter.js +1 -1
  64. package/dist/lib/leak-filters/rules/FilterByExternalFilter.rule.d.ts +1 -1
  65. package/dist/lib/leak-filters/rules/FilterByExternalFilter.rule.js +1 -1
  66. package/dist/lib/leak-filters/rules/FilterDetachedDOMElement.rule.d.ts +1 -1
  67. package/dist/lib/leak-filters/rules/FilterDetachedDOMElement.rule.js +1 -1
  68. package/dist/lib/leak-filters/rules/FilterHermesNode.rule.d.ts +1 -1
  69. package/dist/lib/leak-filters/rules/FilterHermesNode.rule.js +1 -1
  70. package/dist/lib/leak-filters/rules/FilterOverSizedNodeAsLeak.rule.d.ts +1 -1
  71. package/dist/lib/leak-filters/rules/FilterOverSizedNodeAsLeak.rule.js +1 -1
  72. package/dist/lib/leak-filters/rules/FilterStackTraceFrame.rule.d.ts +1 -1
  73. package/dist/lib/leak-filters/rules/FilterStackTraceFrame.rule.js +1 -1
  74. package/dist/lib/leak-filters/rules/FilterTrivialNode.rule.d.ts +1 -1
  75. package/dist/lib/leak-filters/rules/FilterTrivialNode.rule.js +1 -1
  76. package/dist/lib/leak-filters/rules/FilterUnmountedFiberNode.rule.d.ts +1 -1
  77. package/dist/lib/leak-filters/rules/FilterUnmountedFiberNode.rule.js +1 -1
  78. package/dist/logger/LeakClusterLogger.d.ts +1 -1
  79. package/dist/logger/LeakClusterLogger.js +1 -1
  80. package/dist/logger/LeakTraceDetailsLogger.d.ts +1 -1
  81. package/dist/logger/LeakTraceDetailsLogger.js +1 -1
  82. package/dist/modes/BaseMode.d.ts +1 -1
  83. package/dist/modes/BaseMode.js +1 -1
  84. package/dist/modes/InteractionTestMode.d.ts +1 -1
  85. package/dist/modes/InteractionTestMode.js +1 -1
  86. package/dist/modes/MeasureMode.d.ts +1 -1
  87. package/dist/modes/MeasureMode.js +1 -1
  88. package/dist/modes/RunningModes.d.ts +1 -1
  89. package/dist/modes/RunningModes.js +1 -1
  90. package/dist/paths/TraceFinder.d.ts +1 -1
  91. package/dist/paths/TraceFinder.js +41 -42
  92. package/dist/trace-cluster/ClusterUtils.d.ts +1 -1
  93. package/dist/trace-cluster/ClusterUtils.js +1 -1
  94. package/dist/trace-cluster/ClusterUtilsHelper.d.ts +1 -1
  95. package/dist/trace-cluster/ClusterUtilsHelper.js +1 -1
  96. package/dist/trace-cluster/ClusteringHeuristics.d.ts +1 -1
  97. package/dist/trace-cluster/ClusteringHeuristics.js +1 -1
  98. package/dist/trace-cluster/EvalutationMetric.d.ts +1 -1
  99. package/dist/trace-cluster/EvalutationMetric.js +1 -1
  100. package/dist/trace-cluster/SequentialClustering.d.ts +17 -0
  101. package/dist/trace-cluster/SequentialClustering.js +47 -0
  102. package/dist/trace-cluster/TraceBucket.d.ts +2 -1
  103. package/dist/trace-cluster/TraceBucket.js +10 -2
  104. package/dist/trace-cluster/TraceElement.d.ts +3 -1
  105. package/dist/trace-cluster/TraceElement.js +7 -1
  106. package/dist/trace-cluster/strategies/MLTraceSimilarityStrategy.d.ts +15 -0
  107. package/dist/trace-cluster/strategies/MLTraceSimilarityStrategy.js +61 -0
  108. package/dist/trace-cluster/strategies/TraceAsClusterStrategy.d.ts +1 -1
  109. package/dist/trace-cluster/strategies/TraceAsClusterStrategy.js +1 -1
  110. package/dist/trace-cluster/strategies/TraceSimilarityStrategy.d.ts +1 -1
  111. package/dist/trace-cluster/strategies/TraceSimilarityStrategy.js +1 -1
  112. package/dist/trace-cluster/strategies/machine-learning/DistanceMatrix.d.ts +11 -0
  113. package/dist/trace-cluster/strategies/machine-learning/DistanceMatrix.js +54 -0
  114. package/dist/trace-cluster/strategies/machine-learning/HAC.d.ts +17 -0
  115. package/dist/trace-cluster/strategies/machine-learning/HAC.js +122 -0
  116. package/dist/trace-cluster/strategies/machine-learning/Ngram.d.ts +11 -0
  117. package/dist/trace-cluster/strategies/machine-learning/Ngram.js +22 -0
  118. package/dist/trace-cluster/strategies/machine-learning/TfidfVectorizer.d.ts +38 -0
  119. package/dist/trace-cluster/strategies/machine-learning/TfidfVectorizer.js +144 -0
  120. package/package.json +1 -1
@@ -0,0 +1,11 @@
1
+ /**
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ *
7
+ * @format
8
+ * @oncall ws_labs
9
+ */
10
+ export declare const distance: (tfidfs: Record<string, number>[]) => Float32Array;
11
+ //# sourceMappingURL=DistanceMatrix.d.ts.map
@@ -0,0 +1,54 @@
1
+ "use strict";
2
+ /**
3
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
4
+ *
5
+ * This source code is licensed under the MIT license found in the
6
+ * LICENSE file in the root directory of this source tree.
7
+ *
8
+ * @format
9
+ * @oncall ws_labs
10
+ */
11
+ Object.defineProperty(exports, "__esModule", { value: true });
12
+ exports.distance = void 0;
13
// Memoizes Object.keys() per document index so each key list is computed at
// most once per distance() call. Module-level mutable state: distance()
// clears it on exit, so concurrent/re-entrant use is not supported.
const cache = new Map();
/**
 * Returns the vocabulary keys present (with truthy weight) in both sparse
 * vectors i and j. Iterates the smaller key list and probes the other
 * document's map for membership.
 */
const buildIntersection = (tfidfs, i, j) => {
    const intersection = [];
    if (!cache.has(i)) {
        cache.set(i, Object.keys(tfidfs[i]));
    }
    if (!cache.has(j)) {
        cache.set(j, Object.keys(tfidfs[j]));
    }
    // Walk whichever key list is shorter; look entries up in the other vector.
    const [keys, tfidf] = cache.get(i).length > cache.get(j).length
        ? [cache.get(j), tfidfs[i]]
        : [cache.get(i), tfidfs[j]];
    for (const k of keys) {
        if (tfidf[k]) {
            intersection.push(k);
        }
    }
    return intersection;
};
/**
 * Computes a condensed (upper-triangular, row-major) cosine-distance matrix
 * for the given sparse tf-idf vectors.
 *
 * @param tfidfs array of sparse vectors mapping vocab index -> tf-idf weight
 * @returns Float32Array of length n*(n-1)/2; the entry for pair (i, j),
 *          i < j, is 1 - cos(tfidfs[i], tfidfs[j])
 */
const distance = (tfidfs) => {
    const n = tfidfs.length;
    const distances = new Float32Array((n * (n - 1)) / 2);
    let distIdx = 0;
    // Squared L2 norm of every vector, computed once up front.
    const squaredNorms = tfidfs.map(atfidf => Object.values(atfidf).reduce((sum, v) => sum + v * v, 0));
    for (let i = 0; i < n; i++) {
        const a = tfidfs[i];
        for (let j = i + 1; j < n; j++) {
            const b = tfidfs[j];
            const intersection = buildIntersection(tfidfs, i, j);
            const dotProdOfCommons = intersection.reduce((sum, vidx) => sum + a[vidx] * b[vidx], 0);
            // TODO make it pluggable to use other distance measures like euclidean, manhattan
            // Cosine distance = 1 - dot(a, b) / (||a|| * ||b||).
            // BUGFIX: the denominator previously DIVIDED the two norms
            // (sqrt(|a|^2) / sqrt(|b|^2)) instead of multiplying them, which
            // is not cosine similarity and yields distances far outside the
            // valid [0, 2] range whenever the magnitudes differ.
            const cosineDistance = 1 -
                dotProdOfCommons /
                    (Math.sqrt(squaredNorms[i]) * Math.sqrt(squaredNorms[j]));
            distances[distIdx] = cosineDistance;
            distIdx++;
        }
    }
    cache.clear();
    return distances;
};
54
+ exports.distance = distance;
@@ -0,0 +1,17 @@
1
+ /**
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ *
7
+ * @format
8
+ * @oncall ws_labs
9
+ */
10
+ /**
11
+ *
12
+ * @param {*} nDocs number of docs
13
+  * @param {*} D condensed distance matrix
14
+ * @returns labels - list of doc ids as clusters
15
+ */
16
+ export declare const cluster: (nDocs: number, condensedDistanceMatrix: Float32Array, maxDistanceThreshold: number) => number[] | Uint32Array;
17
+ //# sourceMappingURL=HAC.d.ts.map
@@ -0,0 +1,122 @@
1
+ "use strict";
2
+ /**
3
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
4
+ *
5
+ * This source code is licensed under the MIT license found in the
6
+ * LICENSE file in the root directory of this source tree.
7
+ *
8
+ * @format
9
+ * @oncall ws_labs
10
+ */
11
+ Object.defineProperty(exports, "__esModule", { value: true });
12
+ exports.cluster = void 0;
13
/**
 * Maps an index pair (i, j), i != j, of n observations to its offset in a
 * condensed (upper-triangular, row-major) distance matrix.
 */
const condensedIndex = (n, i, j) => {
    if (i > j) {
        return condensedIndex(n, j, i);
    }
    // to get distance between (i, j) think of this sequence.
    // (n - 1) + (n - 2) + ... + (n - i) + (j - i) - 1
    return n * i - (i * (i + 1)) / 2 + (j - i - 1);
};
// Follows parent links in `array` until a self-referencing root is reached.
function getRootLabel(array, idx) {
    let rootIdx = idx;
    while (array[rootIdx] !== rootIdx) {
        rootIdx = array[rootIdx];
    }
    return rootIdx;
}
/**
 * Hierarchical agglomerative clustering (average linkage) using the
 * nearest-neighbor-chain algorithm over a condensed distance matrix.
 *
 * @param {*} nDocs number of docs
 * @param {*} condensedDistanceMatrix condensed distance matrix (see condensedIndex)
 * @param {*} maxDistanceThreshold clusters farther apart than this are never merged
 * @returns labels - for each doc index, the id of its cluster root
 */
const cluster = (nDocs, condensedDistanceMatrix, maxDistanceThreshold) => {
    // BUGFIX: nDocs === 0 previously fell into the `nDocs <= 1` case and
    // returned [0] — a label for a document that does not exist.
    if (nDocs <= 0)
        return [];
    if (nDocs === 1)
        return [0];
    // Linkage updates are destructive, so work on a private copy.
    const condencedDistanceMatrixCopy = new Float32Array(condensedDistanceMatrix);
    // sizeOfClusters[i] === 0 marks cluster i as merged away (or retired).
    const sizeOfClusters = new Uint32Array(nDocs).fill(1);
    let chainLength = 0;
    let clusterChain = [];
    let traceAIdx = -1;
    let traceBIdx = -1;
    let currentMin = Number.MAX_SAFE_INTEGER;
    let distanceBetweenTraces;
    // Parent-pointer array (union-find style); initially each doc is its own root.
    const labels = new Uint32Array(nDocs).map((_, idx) => idx);
    for (let k = 0; k < nDocs - 1; k++) {
        traceBIdx = -1;
        // Start a new chain from the first still-active cluster.
        if (chainLength === 0) {
            for (let i = 0; i < nDocs; i++) {
                if (sizeOfClusters[i] > 0) {
                    clusterChain[0] = i;
                    chainLength = 1;
                    break;
                }
            }
        }
        // Extend the chain by repeatedly stepping to the nearest neighbor
        // until two clusters are mutual nearest neighbors.
        while (chainLength > 0) {
            traceAIdx = clusterChain[chainLength - 1];
            if (chainLength > 1) {
                traceBIdx = clusterChain[chainLength - 2];
                currentMin =
                    condencedDistanceMatrixCopy[condensedIndex(nDocs, traceAIdx, traceBIdx)];
            }
            else {
                currentMin = Number.MAX_SAFE_INTEGER;
            }
            for (let i = 0; i < nDocs; i++) {
                if (sizeOfClusters[i] == 0 || traceAIdx == i) {
                    continue;
                }
                distanceBetweenTraces =
                    condencedDistanceMatrixCopy[condensedIndex(nDocs, traceAIdx, i)];
                if (distanceBetweenTraces < currentMin) {
                    currentMin = distanceBetweenTraces;
                    traceBIdx = i;
                }
            }
            // make sure that traceA and traceB are closest to each other
            if (chainLength > 1 &&
                traceBIdx !== -1 &&
                traceBIdx === clusterChain[chainLength - 2]) {
                break;
            }
            clusterChain[chainLength] = traceBIdx;
            chainLength = chainLength + 1;
        }
        clusterChain = [];
        chainLength = 0;
        // Even the nearest pair is too far apart: retire both unmerged.
        if (currentMin > maxDistanceThreshold) {
            sizeOfClusters[traceAIdx] = 0;
            sizeOfClusters[traceBIdx] = 0;
            continue;
        }
        if (traceAIdx === -1 || traceBIdx === -1) {
            continue;
        }
        // Canonicalize: fold the smaller index into the larger one.
        if (traceAIdx > traceBIdx) {
            [traceAIdx, traceBIdx] = [traceBIdx, traceAIdx];
        }
        const nx = sizeOfClusters[traceAIdx];
        const ny = sizeOfClusters[traceBIdx];
        labels[traceAIdx] = traceBIdx;
        sizeOfClusters[traceAIdx] = 0;
        sizeOfClusters[traceBIdx] = nx + ny;
        // Average-linkage update: distance from each remaining cluster i to
        // the merged cluster (now stored at traceBIdx) is the size-weighted
        // mean of its distances to the two constituents.
        for (let i = 0; i < nDocs; i++) {
            const ni = sizeOfClusters[i];
            if (ni === 0 || i === traceBIdx) {
                continue;
            }
            const d_xi = condencedDistanceMatrixCopy[condensedIndex(nDocs, i, traceAIdx)];
            const d_yi = condencedDistanceMatrixCopy[condensedIndex(nDocs, i, traceBIdx)];
            const size_x = nx;
            const size_y = ny;
            // TODO make it generic to support other linkage methods like complete, weighted etc...
            const updatedDist = (size_x * d_xi + size_y * d_yi) / (size_x + size_y);
            condencedDistanceMatrixCopy[condensedIndex(nDocs, i, traceBIdx)] =
                updatedDist;
        }
    }
    // Resolve every doc to its cluster root.
    return labels.map((_, idx) => getRootLabel(labels, idx));
};
122
+ exports.cluster = cluster;
@@ -0,0 +1,11 @@
1
+ /**
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ *
7
+ * @format
8
+ * @oncall ws_labs
9
+ */
10
+ export declare function nGram(n: number, terms: string[]): string[];
11
+ //# sourceMappingURL=Ngram.d.ts.map
@@ -0,0 +1,22 @@
1
+ "use strict";
2
+ /**
3
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
4
+ *
5
+ * This source code is licensed under the MIT license found in the
6
+ * LICENSE file in the root directory of this source tree.
7
+ *
8
+ * @format
9
+ * @oncall ws_labs
10
+ */
11
+ Object.defineProperty(exports, "__esModule", { value: true });
12
+ exports.nGram = void 0;
13
/**
 * Produces every contiguous n-gram of `terms`, with each gram's terms joined
 * by a single space. Returns an empty list when `terms` has fewer than `n`
 * entries.
 */
function nGram(n, terms) {
    const grams = [];
    for (let start = 0; start + n <= terms.length; ++start) {
        grams.push(terms.slice(start, start + n).join(' '));
    }
    return grams;
}
22
+ exports.nGram = nGram;
@@ -0,0 +1,38 @@
1
+ /**
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ *
7
+ * @format
8
+ * @oncall ws_labs
9
+ */
10
+ interface TfidfVectorizerProps {
11
+ rawDocuments: string[];
12
+ maxDF?: number;
13
+ }
14
+ export declare class TfidfVectorizer {
15
+ rawDocuments: string[];
16
+ vocabulary: Record<string, string>;
17
+ documentFrequency: Record<string, number>;
18
+ maxDF: number;
19
+ documents: Record<string, number>[];
20
+ tfidfs: Record<string, number>[];
21
+ constructor({ rawDocuments, maxDF }: TfidfVectorizerProps);
22
+ computeTfidfs(): Record<string, number>[];
23
+ tokenize(text: string): string[];
24
+ buildVocabulary(tokenizedDocuments: string[][]): Record<string, string>;
25
+ processDocuments(tokenizedDocuments: string[][]): void;
26
+ limit(): void;
27
+ /**
28
+ * Smooth idf weights by adding 1 to document frequencies (DF), as if an extra
29
+ * document was seen containing every term in the collection exactly once.
30
+ * This prevents zero divisions.
31
+ * */
32
+ smooth(): void;
33
+ buildTfidfs(): Record<string, number>[];
34
+ tf(vocabIdx: string, document: Record<string, number>): number;
35
+ idf(vocabIdx: string): number;
36
+ }
37
+ export {};
38
+ //# sourceMappingURL=TfidfVectorizer.d.ts.map
@@ -0,0 +1,144 @@
1
+ "use strict";
2
+ /**
3
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
4
+ *
5
+ * This source code is licensed under the MIT license found in the
6
+ * LICENSE file in the root directory of this source tree.
7
+ *
8
+ * @format
9
+ * @oncall ws_labs
10
+ */
11
+ var __importDefault = (this && this.__importDefault) || function (mod) {
12
+ return (mod && mod.__esModule) ? mod : { "default": mod };
13
+ };
14
+ Object.defineProperty(exports, "__esModule", { value: true });
15
+ exports.TfidfVectorizer = void 0;
16
+ const Config_1 = __importDefault(require("../../../lib/Config"));
17
+ const Ngram_1 = require("./Ngram");
18
+ const SMOOTHING_KEY = '__smoothObjectKey';
19
+ const VOCAB_IDX_FOR_DOC_WITH_HIGH_DF = '-1';
20
/**
 * Computes L2-normalized tf-idf vectors (over unigram, bigram and trigram
 * terms) for a set of raw text documents. Terms whose document frequency
 * exceeds the `maxDF` fraction of documents are pruned as near-stopwords.
 */
class TfidfVectorizer {
    constructor({ rawDocuments, maxDF }) {
        this.rawDocuments = [];
        // term -> vocab index (stringified integer)
        this.vocabulary = Object.create(null);
        // vocab index -> number of documents containing the term
        this.documentFrequency = Object.create(null);
        // sparse per-document term counts: vocab index -> count
        this.documents = [];
        // BUGFIX: was left undefined until computeTfidfs() ran.
        this.tfidfs = [];
        this.rawDocuments = rawDocuments;
        this.maxDF = maxDF ?? Config_1.default.mlMaxDF;
    }
    /** Full pipeline: tokenize -> vocabulary -> counts -> prune -> smooth -> tf-idf. */
    computeTfidfs() {
        // `tokenize` reads no instance state, so passing it unbound is safe.
        const tokenizedDocuments = this.rawDocuments.map(this.tokenize);
        this.vocabulary = this.buildVocabulary(tokenizedDocuments);
        this.processDocuments(tokenizedDocuments);
        this.limit();
        this.smooth();
        this.tfidfs = this.buildTfidfs();
        return this.tfidfs;
    }
    /** Splits on single spaces and appends 2-gram and 3-gram terms. */
    tokenize(text) {
        const terms = text.split(' ');
        return [...terms, ...(0, Ngram_1.nGram)(2, terms), ...(0, Ngram_1.nGram)(3, terms)];
    }
    /** Assigns a distinct stringified index to every unique term seen. */
    buildVocabulary(tokenizedDocuments) {
        let vocabIdx = 0;
        const vocabulary = Object.create(null);
        tokenizedDocuments.forEach(doc => {
            doc.forEach(term => {
                if (!vocabulary[String(term)]) {
                    vocabulary[String(term)] = String(vocabIdx);
                    vocabIdx++;
                }
            });
        });
        return vocabulary;
    }
    /** Builds per-document term counts and the global document frequencies. */
    processDocuments(tokenizedDocuments) {
        tokenizedDocuments.forEach(terms => {
            const document = {};
            terms.forEach(t => {
                const vocabIdx = this.vocabulary[t];
                if (document[vocabIdx]) {
                    document[vocabIdx] += 1;
                }
                else {
                    // First occurrence within this document: bump global DF too.
                    if (this.documentFrequency[vocabIdx]) {
                        this.documentFrequency[vocabIdx] += 1;
                    }
                    else {
                        this.documentFrequency[vocabIdx] = 1;
                    }
                    document[vocabIdx] = 1;
                }
            });
            this.documents.push(document);
        });
    }
    /** Drops terms whose document frequency exceeds the maxDF fraction. */
    limit() {
        const nMaxDF = Math.floor(this.documents.length * this.maxDF);
        // A Set: the same vocab index can otherwise be recorded once per document.
        const vocabIdxsToDelete = new Set();
        this.documents.forEach(doc => {
            Object.keys(doc).forEach(vocabIdx => {
                if (this.documentFrequency[vocabIdx] > nMaxDF) {
                    delete doc[vocabIdx];
                    vocabIdxsToDelete.add(vocabIdx);
                }
            });
        });
        vocabIdxsToDelete.forEach(vocabIdx => {
            delete this.documentFrequency[vocabIdx];
        });
        // BUGFIX: `vocabulary` is keyed by term (term -> vocabIdx), so the
        // previous `delete this.vocabulary[vocabIdx]` never matched a key.
        // The stale entries then made smooth() compute NaN document
        // frequencies (undefined + 1) for the pruned indices.
        Object.keys(this.vocabulary).forEach(term => {
            if (vocabIdxsToDelete.has(this.vocabulary[term])) {
                delete this.vocabulary[term];
            }
        });
    }
    /**
     * Smooth idf weights by adding 1 to document frequencies (DF), as if an extra
     * document was seen containing every term in the collection exactly once.
     * This prevents zero divisions.
     * */
    smooth() {
        // for each vocabulary
        Object.values(this.vocabulary).forEach(vocabIdx => (this.documentFrequency[vocabIdx] =
            this.documentFrequency[vocabIdx] + 1));
        this.documents.push({ [SMOOTHING_KEY]: 1 });
    }
    /** Builds the final L2-normalized tf-idf vector for every real document. */
    buildTfidfs() {
        const tfidfs = [];
        this.documents.forEach(document => {
            // this means all the terms in the document are the terms
            // that have high document frequency.
            // This will make all the docs with high DF to be clustered together.
            if (Object.keys(document).length === 0) {
                tfidfs.push({ [VOCAB_IDX_FOR_DOC_WITH_HIGH_DF]: 1 });
                return;
            }
            // Skip the synthetic smoothing document added by smooth().
            if (!document[SMOOTHING_KEY]) {
                const atfidf = Object.keys(document).map(vocabIdx => {
                    return [vocabIdx, this.tf(vocabIdx, document) * this.idf(vocabIdx)];
                });
                // normalizing the values
                const dotSum = atfidf
                    .map(([_, tfidfValue]) => tfidfValue * tfidfValue)
                    .reduce((sum, tfidfValueSquared) => sum + tfidfValueSquared, 0);
                const dotSumSqrRoot = Math.sqrt(dotSum);
                // Normalizing tfidfs
                const atfidfVocabIdxValueObject = atfidf
                    .map(([vocabIdx, tfidfValue]) => [
                    vocabIdx,
                    tfidfValue / dotSumSqrRoot,
                ])
                    .reduce((obj, [vocabIdx, value]) => {
                    obj[vocabIdx] = value;
                    return obj;
                }, {});
                tfidfs.push(atfidfVocabIdxValueObject);
            }
        });
        return tfidfs;
    }
    /** Sublinear term frequency: 1 + ln(count of the term in the document). */
    tf(vocabIdx, document) {
        return 1 + Math.log(document[vocabIdx]);
    }
    /** Smoothed inverse document frequency: 1 + ln(N / DF). */
    idf(vocabIdx) {
        return (1 + Math.log(this.documents.length / this.documentFrequency[vocabIdx]));
    }
}
144
+ exports.TfidfVectorizer = TfidfVectorizer;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@memlab/core",
3
- "version": "1.1.5",
3
+ "version": "1.1.9",
4
4
  "license": "MIT",
5
5
  "description": "memlab core libraries",
6
6
  "author": "Liang Gong <lgong@fb.com>",