@memlab/core 1.1.5 → 1.1.9
This diff shows the changes between publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- package/dist/__tests__/parser/HeapParser.test.d.ts +1 -1
- package/dist/__tests__/parser/HeapParser.test.js +3 -3
- package/dist/__tests__/parser/NodeHeap.test.d.ts +1 -1
- package/dist/__tests__/parser/NodeHeap.test.js +6 -6
- package/dist/__tests__/parser/StringNode.test.d.ts +1 -1
- package/dist/__tests__/parser/StringNode.test.js +2 -2
- package/dist/__tests__/parser/traverse/HeapNodeTraverse.test.d.ts +1 -1
- package/dist/__tests__/parser/traverse/HeapNodeTraverse.test.js +3 -3
- package/dist/__tests__/utils/utils.test.d.ts +1 -1
- package/dist/__tests__/utils/utils.test.js +1 -1
- package/dist/index.d.ts +4 -2
- package/dist/index.js +6 -3
- package/dist/lib/BaseOption.d.ts +1 -1
- package/dist/lib/BaseOption.js +1 -1
- package/dist/lib/BrowserInfo.d.ts +1 -1
- package/dist/lib/BrowserInfo.js +1 -1
- package/dist/lib/Config.d.ts +11 -2
- package/dist/lib/Config.js +25 -5
- package/dist/lib/Console.d.ts +1 -1
- package/dist/lib/Console.js +1 -1
- package/dist/lib/Constant.d.ts +1 -1
- package/dist/lib/Constant.js +1 -1
- package/dist/lib/FileManager.d.ts +1 -1
- package/dist/lib/FileManager.js +5 -3
- package/dist/lib/HeapAnalyzer.d.ts +1 -1
- package/dist/lib/HeapAnalyzer.js +26 -10
- package/dist/lib/HeapParser.d.ts +2 -2
- package/dist/lib/HeapParser.js +2 -2
- package/dist/lib/InternalValueSetter.d.ts +1 -1
- package/dist/lib/InternalValueSetter.js +1 -1
- package/dist/lib/NodeHeap.d.ts +53 -10
- package/dist/lib/NodeHeap.js +73 -22
- package/dist/lib/PackageInfoLoader.js +2 -2
- package/dist/lib/ProcessManager.d.ts +1 -1
- package/dist/lib/ProcessManager.js +1 -1
- package/dist/lib/Serializer.d.ts +1 -1
- package/dist/lib/Serializer.js +49 -26
- package/dist/lib/StringLoader.d.ts +2 -2
- package/dist/lib/StringLoader.js +2 -2
- package/dist/lib/Types.d.ts +111 -36
- package/dist/lib/Types.js +1 -1
- package/dist/lib/Utils.d.ts +1 -1
- package/dist/lib/Utils.js +55 -31
- package/dist/lib/heap-data/HeapEdge.d.ts +2 -2
- package/dist/lib/heap-data/HeapEdge.js +2 -2
- package/dist/lib/heap-data/HeapLocation.d.ts +2 -2
- package/dist/lib/heap-data/HeapLocation.js +2 -2
- package/dist/lib/heap-data/HeapNode.d.ts +3 -2
- package/dist/lib/heap-data/HeapNode.js +6 -2
- package/dist/lib/heap-data/HeapSnapshot.d.ts +3 -2
- package/dist/lib/heap-data/HeapSnapshot.js +6 -33
- package/dist/lib/heap-data/HeapStringNode.d.ts +2 -2
- package/dist/lib/heap-data/HeapStringNode.js +4 -2
- package/dist/lib/heap-data/HeapUtils.d.ts +2 -2
- package/dist/lib/heap-data/HeapUtils.js +2 -2
- package/dist/lib/heap-data/MemLabTagStore.d.ts +23 -0
- package/dist/lib/heap-data/MemLabTagStore.js +110 -0
- package/dist/lib/leak-filters/BaseLeakFilter.rule.d.ts +1 -1
- package/dist/lib/leak-filters/BaseLeakFilter.rule.js +1 -1
- package/dist/lib/leak-filters/LeakFilterRuleList.d.ts +1 -1
- package/dist/lib/leak-filters/LeakFilterRuleList.js +1 -1
- package/dist/lib/leak-filters/LeakObjectFilter.d.ts +1 -1
- package/dist/lib/leak-filters/LeakObjectFilter.js +1 -1
- package/dist/lib/leak-filters/rules/FilterByExternalFilter.rule.d.ts +1 -1
- package/dist/lib/leak-filters/rules/FilterByExternalFilter.rule.js +1 -1
- package/dist/lib/leak-filters/rules/FilterDetachedDOMElement.rule.d.ts +1 -1
- package/dist/lib/leak-filters/rules/FilterDetachedDOMElement.rule.js +1 -1
- package/dist/lib/leak-filters/rules/FilterHermesNode.rule.d.ts +1 -1
- package/dist/lib/leak-filters/rules/FilterHermesNode.rule.js +1 -1
- package/dist/lib/leak-filters/rules/FilterOverSizedNodeAsLeak.rule.d.ts +1 -1
- package/dist/lib/leak-filters/rules/FilterOverSizedNodeAsLeak.rule.js +1 -1
- package/dist/lib/leak-filters/rules/FilterStackTraceFrame.rule.d.ts +1 -1
- package/dist/lib/leak-filters/rules/FilterStackTraceFrame.rule.js +1 -1
- package/dist/lib/leak-filters/rules/FilterTrivialNode.rule.d.ts +1 -1
- package/dist/lib/leak-filters/rules/FilterTrivialNode.rule.js +1 -1
- package/dist/lib/leak-filters/rules/FilterUnmountedFiberNode.rule.d.ts +1 -1
- package/dist/lib/leak-filters/rules/FilterUnmountedFiberNode.rule.js +1 -1
- package/dist/logger/LeakClusterLogger.d.ts +1 -1
- package/dist/logger/LeakClusterLogger.js +1 -1
- package/dist/logger/LeakTraceDetailsLogger.d.ts +1 -1
- package/dist/logger/LeakTraceDetailsLogger.js +1 -1
- package/dist/modes/BaseMode.d.ts +1 -1
- package/dist/modes/BaseMode.js +1 -1
- package/dist/modes/InteractionTestMode.d.ts +1 -1
- package/dist/modes/InteractionTestMode.js +1 -1
- package/dist/modes/MeasureMode.d.ts +1 -1
- package/dist/modes/MeasureMode.js +1 -1
- package/dist/modes/RunningModes.d.ts +1 -1
- package/dist/modes/RunningModes.js +1 -1
- package/dist/paths/TraceFinder.d.ts +1 -1
- package/dist/paths/TraceFinder.js +41 -42
- package/dist/trace-cluster/ClusterUtils.d.ts +1 -1
- package/dist/trace-cluster/ClusterUtils.js +1 -1
- package/dist/trace-cluster/ClusterUtilsHelper.d.ts +1 -1
- package/dist/trace-cluster/ClusterUtilsHelper.js +1 -1
- package/dist/trace-cluster/ClusteringHeuristics.d.ts +1 -1
- package/dist/trace-cluster/ClusteringHeuristics.js +1 -1
- package/dist/trace-cluster/EvalutationMetric.d.ts +1 -1
- package/dist/trace-cluster/EvalutationMetric.js +1 -1
- package/dist/trace-cluster/SequentialClustering.d.ts +17 -0
- package/dist/trace-cluster/SequentialClustering.js +47 -0
- package/dist/trace-cluster/TraceBucket.d.ts +2 -1
- package/dist/trace-cluster/TraceBucket.js +10 -2
- package/dist/trace-cluster/TraceElement.d.ts +3 -1
- package/dist/trace-cluster/TraceElement.js +7 -1
- package/dist/trace-cluster/strategies/MLTraceSimilarityStrategy.d.ts +15 -0
- package/dist/trace-cluster/strategies/MLTraceSimilarityStrategy.js +61 -0
- package/dist/trace-cluster/strategies/TraceAsClusterStrategy.d.ts +1 -1
- package/dist/trace-cluster/strategies/TraceAsClusterStrategy.js +1 -1
- package/dist/trace-cluster/strategies/TraceSimilarityStrategy.d.ts +1 -1
- package/dist/trace-cluster/strategies/TraceSimilarityStrategy.js +1 -1
- package/dist/trace-cluster/strategies/machine-learning/DistanceMatrix.d.ts +11 -0
- package/dist/trace-cluster/strategies/machine-learning/DistanceMatrix.js +54 -0
- package/dist/trace-cluster/strategies/machine-learning/HAC.d.ts +17 -0
- package/dist/trace-cluster/strategies/machine-learning/HAC.js +122 -0
- package/dist/trace-cluster/strategies/machine-learning/Ngram.d.ts +11 -0
- package/dist/trace-cluster/strategies/machine-learning/Ngram.js +22 -0
- package/dist/trace-cluster/strategies/machine-learning/TfidfVectorizer.d.ts +38 -0
- package/dist/trace-cluster/strategies/machine-learning/TfidfVectorizer.js +144 -0
- package/package.json +1 -1
The hunks below show the contents of the newly added files under `package/dist/trace-cluster/strategies/machine-learning/`:

package/dist/trace-cluster/strategies/machine-learning/DistanceMatrix.d.ts

```diff
@@ -0,0 +1,11 @@
+/**
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * @format
+ * @oncall ws_labs
+ */
+export declare const distance: (tfidfs: Record<string, number>[]) => Float32Array;
+//# sourceMappingURL=DistanceMatrix.d.ts.map
```
package/dist/trace-cluster/strategies/machine-learning/DistanceMatrix.js

```diff
@@ -0,0 +1,54 @@
+"use strict";
+/**
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * @format
+ * @oncall ws_labs
+ */
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.distance = void 0;
+const cache = new Map();
+const buildIntersection = (tfidfs, i, j) => {
+    const intersection = [];
+    if (!cache.has(i)) {
+        cache.set(i, Object.keys(tfidfs[i]));
+    }
+    if (!cache.has(j)) {
+        cache.set(j, Object.keys(tfidfs[j]));
+    }
+    const [keys, tfidf] = cache.get(i).length > cache.get(j).length
+        ? [cache.get(j), tfidfs[i]]
+        : [cache.get(i), tfidfs[j]];
+    for (const k of keys) {
+        if (tfidf[k]) {
+            intersection.push(k);
+        }
+    }
+    return intersection;
+};
+const distance = (tfidfs) => {
+    const n = tfidfs.length;
+    const distances = new Float32Array((n * (n - 1)) / 2);
+    let distIdx = 0;
+    const dotProducs = tfidfs.map(atfidf => Object.values(atfidf).reduce((sum, v) => sum + v * v, 0));
+    for (let i = 0; i < tfidfs.length; i++) {
+        const a = tfidfs[i];
+        for (let j = i + 1; j < tfidfs.length; j++) {
+            const b = tfidfs[j];
+            const intersection = buildIntersection(tfidfs, i, j);
+            const dotProdOfCommons = intersection.reduce((sum, vidx) => sum + a[vidx] * b[vidx], 0);
+            // TODO make it pluggable to use other distance measures like euclidean, manhattan
+            const cosineSimilarity = 1 -
+                dotProdOfCommons /
+                    (Math.sqrt(dotProducs[i]) / Math.sqrt(dotProducs[j]));
+            distances[distIdx] = cosineSimilarity;
+            distIdx++;
+        }
+    }
+    cache.clear();
+    return distances;
+};
+exports.distance = distance;
```
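For context, `distance` computes one minus the dot product over the pair's shared terms, scaled by the vectors' norms, and packs the results for every pair (i, j) with i < j into a condensed (flattened upper-triangular) `Float32Array` of length n(n-1)/2. A minimal sketch of a call, with hypothetical document values; it assumes the inputs are the L2-normalized maps produced by `TfidfVectorizer`, in which case each self dot product is 1 and the norm term drops out, leaving plain cosine distance:

```ts
import {distance} from './DistanceMatrix';

// Two unit-length TF-IDF maps keyed by vocabulary index (hypothetical values).
const docA: Record<string, number> = {'0': 1};
const docB: Record<string, number> = {'0': Math.SQRT1_2, '1': Math.SQRT1_2};

// The condensed output holds one entry per pair (i, j), i < j; here just (0, 1):
// 1 - (1 * Math.SQRT1_2) ≈ 0.2929. Identical docs score 0, disjoint docs score 1.
const d = distance([docA, docB]);
console.log(d); // Float32Array(1) [ ~0.2929 ]
```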
package/dist/trace-cluster/strategies/machine-learning/HAC.d.ts

```diff
@@ -0,0 +1,17 @@
+/**
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * @format
+ * @oncall ws_labs
+ */
+/**
+ *
+ * @param {*} nDocs number of docs
+ * @param {*} D condensed distance matrix
+ * @returns labels - list of doc ids as clusters
+ */
+export declare const cluster: (nDocs: number, condensedDistanceMatrix: Float32Array, maxDistanceThreshold: number) => number[] | Uint32Array;
+//# sourceMappingURL=HAC.d.ts.map
```
package/dist/trace-cluster/strategies/machine-learning/HAC.js

```diff
@@ -0,0 +1,122 @@
+"use strict";
+/**
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * @format
+ * @oncall ws_labs
+ */
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.cluster = void 0;
+const condensedIndex = (n, i, j) => {
+    if (i > j) {
+        return condensedIndex(n, j, i);
+    }
+    // to get distance between (i, j) think of this sequence.
+    // (n - 1) + (n - 2) + ... + (n - i) + (j - i) - 1
+    return n * i - (i * (i + 1)) / 2 + (j - i - 1);
+};
+function getRootLabel(array, idx) {
+    let rootIdx = idx;
+    while (array[rootIdx] !== rootIdx) {
+        rootIdx = array[rootIdx];
+    }
+    return rootIdx;
+}
+/**
+ *
+ * @param {*} nDocs number of docs
+ * @param {*} D condensed distance matrix
+ * @returns labels - list of doc ids as clusters
+ */
+const cluster = (nDocs, condensedDistanceMatrix, maxDistanceThreshold) => {
+    if (nDocs <= 1)
+        return [0];
+    const condencedDistanceMatrixCopy = new Float32Array(condensedDistanceMatrix);
+    const sizeOfClusters = new Uint32Array(nDocs).fill(1);
+    let chainLength = 0;
+    let clusterChain = [];
+    let traceAIdx = -1;
+    let traceBIdx = -1;
+    let currentMin = Number.MAX_SAFE_INTEGER;
+    let distanceBetweenTraces;
+    const labels = new Uint32Array(nDocs).map((_, idx) => idx);
+    for (let k = 0; k < nDocs - 1; k++) {
+        traceBIdx = -1;
+        if (chainLength === 0) {
+            for (let i = 0; i < nDocs; i++) {
+                if (sizeOfClusters[i] > 0) {
+                    clusterChain[0] = i;
+                    chainLength = 1;
+                    break;
+                }
+            }
+        }
+        while (chainLength > 0) {
+            traceAIdx = clusterChain[chainLength - 1];
+            if (chainLength > 1) {
+                traceBIdx = clusterChain[chainLength - 2];
+                currentMin =
+                    condencedDistanceMatrixCopy[condensedIndex(nDocs, traceAIdx, traceBIdx)];
+            }
+            else {
+                currentMin = Number.MAX_SAFE_INTEGER;
+            }
+            for (let i = 0; i < nDocs; i++) {
+                if (sizeOfClusters[i] == 0 || traceAIdx == i) {
+                    continue;
+                }
+                distanceBetweenTraces =
+                    condencedDistanceMatrixCopy[condensedIndex(nDocs, traceAIdx, i)];
+                if (distanceBetweenTraces < currentMin) {
+                    currentMin = distanceBetweenTraces;
+                    traceBIdx = i;
+                }
+            }
+            // make sure that traceA and traceB are closest to each other
+            if (chainLength > 1 &&
+                traceBIdx !== -1 &&
+                traceBIdx === clusterChain[chainLength - 2]) {
+                break;
+            }
+            clusterChain[chainLength] = traceBIdx;
+            chainLength = chainLength + 1;
+        }
+        clusterChain = [];
+        chainLength = 0;
+        if (currentMin > maxDistanceThreshold) {
+            sizeOfClusters[traceAIdx] = 0;
+            sizeOfClusters[traceBIdx] = 0;
+            continue;
+        }
+        if (traceAIdx === -1 || traceBIdx === -1) {
+            continue;
+        }
+        if (traceAIdx > traceBIdx) {
+            [traceAIdx, traceBIdx] = [traceBIdx, traceAIdx];
+        }
+        const nx = sizeOfClusters[traceAIdx];
+        const ny = sizeOfClusters[traceBIdx];
+        labels[traceAIdx] = traceBIdx;
+        sizeOfClusters[traceAIdx] = 0;
+        sizeOfClusters[traceBIdx] = nx + ny;
+        for (let i = 0; i < nDocs; i++) {
+            const ni = sizeOfClusters[i];
+            if (ni === 0 || i === traceBIdx) {
+                continue;
+            }
+            const d_xi = condencedDistanceMatrixCopy[condensedIndex(nDocs, i, traceAIdx)];
+            const d_yi = condencedDistanceMatrixCopy[condensedIndex(nDocs, i, traceBIdx)];
+            const size_x = nx;
+            const size_y = ny;
+            // TODO make it generic to support other linkage methods like complete, weighted etc...
+            const updatedDist = (size_x * d_xi + size_y * d_yi) / (size_x + size_y);
+            condencedDistanceMatrixCopy[condensedIndex(nDocs, i, traceBIdx)] =
+                updatedDist;
+        }
+    }
+    return labels.map((_, idx) => getRootLabel(labels, idx));
+};
+exports.cluster = cluster;
```
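`cluster` is a nearest-neighbor-chain implementation of hierarchical agglomerative clustering with average linkage, mutating a copy of the condensed matrix in place; `condensedIndex` maps a pair (i, j) to its flat offset in that array. Below is a worked check of the index math plus a hedged usage sketch (the document vectors and the 0.5 threshold are illustrative, not values taken from the package):

```ts
import {distance} from './DistanceMatrix';
import {cluster} from './HAC';

// condensedIndex(4, 1, 3): pairs are laid out (0,1)(0,2)(0,3)(1,2)(1,3)(2,3),
// so (1, 3) sits at offset 4*1 - (1*2)/2 + (3 - 1 - 1) = 4.

// Hypothetical: two identical traces, one similar trace, and one outlier.
const tfidfs: Record<string, number>[] = [
  {'0': 1},
  {'0': 1},
  {'0': 0.9, '1': Math.sqrt(1 - 0.81)},
  {'2': 1},
];

// Merge while the average-linkage distance stays at or below 0.5.
const labels = cluster(tfidfs.length, distance(tfidfs), 0.5);
// Each entry is the root doc id of its cluster: here docs 0-2 end up sharing
// one root, while doc 3 (no shared terms, distance 1) keeps its own label.
console.log(labels);
```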
package/dist/trace-cluster/strategies/machine-learning/Ngram.d.ts

```diff
@@ -0,0 +1,11 @@
+/**
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * @format
+ * @oncall ws_labs
+ */
+export declare function nGram(n: number, terms: string[]): string[];
+//# sourceMappingURL=Ngram.d.ts.map
```
package/dist/trace-cluster/strategies/machine-learning/Ngram.js

```diff
@@ -0,0 +1,22 @@
+"use strict";
+/**
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * @format
+ * @oncall ws_labs
+ */
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.nGram = void 0;
+function nGram(n, terms) {
+    const nGrams = [];
+    let index = 0;
+    while (index <= terms.length - n) {
+        nGrams[index] = terms.slice(index, index + n).join(' ');
+        ++index;
+    }
+    return nGrams;
+}
+exports.nGram = nGram;
```
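`nGram` joins each contiguous window of n terms with a space, and returns an empty array when n exceeds the number of terms. Expected behavior, using illustrative trace tokens:

```ts
import {nGram} from './Ngram';

nGram(2, ['Window', 'Timeout', 'LeakedObject']);
// => ['Window Timeout', 'Timeout LeakedObject']

nGram(3, ['Window', 'Timeout', 'LeakedObject']);
// => ['Window Timeout LeakedObject']

nGram(4, ['Window', 'Timeout']);
// => [] (window longer than the term list)
```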
package/dist/trace-cluster/strategies/machine-learning/TfidfVectorizer.d.ts

```diff
@@ -0,0 +1,38 @@
+/**
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * @format
+ * @oncall ws_labs
+ */
+interface TfidfVectorizerProps {
+    rawDocuments: string[];
+    maxDF?: number;
+}
+export declare class TfidfVectorizer {
+    rawDocuments: string[];
+    vocabulary: Record<string, string>;
+    documentFrequency: Record<string, number>;
+    maxDF: number;
+    documents: Record<string, number>[];
+    tfidfs: Record<string, number>[];
+    constructor({ rawDocuments, maxDF }: TfidfVectorizerProps);
+    computeTfidfs(): Record<string, number>[];
+    tokenize(text: string): string[];
+    buildVocabulary(tokenizedDocuments: string[][]): Record<string, string>;
+    processDocuments(tokenizedDocuments: string[][]): void;
+    limit(): void;
+    /**
+     * Smooth idf weights by adding 1 to document frequencies (DF), as if an extra
+     * document was seen containing every term in the collection exactly once.
+     * This prevents zero divisions.
+     * */
+    smooth(): void;
+    buildTfidfs(): Record<string, number>[];
+    tf(vocabIdx: string, document: Record<string, number>): number;
+    idf(vocabIdx: string): number;
+}
+export {};
+//# sourceMappingURL=TfidfVectorizer.d.ts.map
```
package/dist/trace-cluster/strategies/machine-learning/TfidfVectorizer.js

```diff
@@ -0,0 +1,144 @@
+"use strict";
+/**
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * @format
+ * @oncall ws_labs
+ */
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.TfidfVectorizer = void 0;
+const Config_1 = __importDefault(require("../../../lib/Config"));
+const Ngram_1 = require("./Ngram");
+const SMOOTHING_KEY = '__smoothObjectKey';
+const VOCAB_IDX_FOR_DOC_WITH_HIGH_DF = '-1';
+class TfidfVectorizer {
+    constructor({ rawDocuments, maxDF }) {
+        this.rawDocuments = [];
+        this.vocabulary = Object.create(null);
+        this.documentFrequency = Object.create(null);
+        this.documents = [];
+        this.rawDocuments = rawDocuments;
+        this.maxDF = maxDF !== null && maxDF !== void 0 ? maxDF : Config_1.default.mlMaxDF;
+    }
+    computeTfidfs() {
+        const tokenizedDocuments = this.rawDocuments.map(this.tokenize);
+        this.vocabulary = this.buildVocabulary(tokenizedDocuments);
+        this.processDocuments(tokenizedDocuments);
+        this.limit();
+        this.smooth();
+        this.tfidfs = this.buildTfidfs();
+        return this.tfidfs;
+    }
+    tokenize(text) {
+        const terms = text.split(' ');
+        return [...terms, ...(0, Ngram_1.nGram)(2, terms), ...(0, Ngram_1.nGram)(3, terms)];
+    }
+    buildVocabulary(tokenizedDocuments) {
+        let vocabIdx = 0;
+        const vocabulary = Object.create(null);
+        tokenizedDocuments.forEach(doc => {
+            doc.forEach(term => {
+                if (!vocabulary[String(term)]) {
+                    vocabulary[String(term)] = String(vocabIdx);
+                    vocabIdx++;
+                }
+            });
+        });
+        return vocabulary;
+    }
+    processDocuments(tokenizedDocuments) {
+        tokenizedDocuments.forEach(terms => {
+            const document = {};
+            terms.forEach(t => {
+                const vocabIdx = this.vocabulary[t];
+                if (document[vocabIdx]) {
+                    document[vocabIdx] += 1;
+                }
+                else {
+                    if (this.documentFrequency[vocabIdx]) {
+                        this.documentFrequency[vocabIdx] += 1;
+                    }
+                    else {
+                        this.documentFrequency[vocabIdx] = 1;
+                    }
+                    document[vocabIdx] = 1;
+                }
+            });
+            this.documents.push(document);
+        });
+    }
+    limit() {
+        const nMaxDF = Math.floor(this.documents.length * this.maxDF);
+        const vocabIdxsToDelete = [];
+        this.documents.forEach(doc => {
+            Object.keys(doc).forEach(vocabIdx => {
+                if (this.documentFrequency[vocabIdx] > nMaxDF) {
+                    delete doc[vocabIdx];
+                    vocabIdxsToDelete.push(vocabIdx);
+                }
+            });
+        });
+        vocabIdxsToDelete.forEach(vocabIdx => {
+            delete this.documentFrequency[vocabIdx];
+            delete this.vocabulary[vocabIdx];
+        });
+    }
+    /**
+     * Smooth idf weights by adding 1 to document frequencies (DF), as if an extra
+     * document was seen containing every term in the collection exactly once.
+     * This prevents zero divisions.
+     * */
+    smooth() {
+        // for each vocabulary
+        Object.values(this.vocabulary).forEach(vocabIdx => (this.documentFrequency[vocabIdx] =
+            this.documentFrequency[vocabIdx] + 1));
+        this.documents.push({ [SMOOTHING_KEY]: 1 });
+    }
+    buildTfidfs() {
+        const tfidfs = [];
+        this.documents.forEach(document => {
+            // this means all the terms in the document are the terms
+            // that have high document frequency.
+            // This will make all the docs with high DF to be clustered together.
+            if (Object.keys(document).length === 0) {
+                tfidfs.push({ [VOCAB_IDX_FOR_DOC_WITH_HIGH_DF]: 1 });
+                return;
+            }
+            if (!document[SMOOTHING_KEY]) {
+                const atfidf = Object.keys(document).map(vocabIdx => {
+                    return [vocabIdx, this.tf(vocabIdx, document) * this.idf(vocabIdx)];
+                });
+                // normalizing the values
+                const dotSum = atfidf
+                    .map(([_, tfidfValue]) => tfidfValue * tfidfValue)
+                    .reduce((sum, tfidfValueSquered) => sum + tfidfValueSquered, 0);
+                const dotSumSqrRoot = Math.sqrt(dotSum);
+                // Normalizing tfidfs
+                const atfidfVocabIdxValueObject = atfidf
+                    .map(([vocabIdx, tfidfValue]) => [
+                        vocabIdx,
+                        tfidfValue / dotSumSqrRoot,
+                    ])
+                    .reduce((obj, [vocabIdx, value]) => {
+                        obj[vocabIdx] = value;
+                        return obj;
+                    }, {});
+                tfidfs.push(atfidfVocabIdxValueObject);
+            }
+        });
+        return tfidfs;
+    }
+    tf(vocabIdx, document) {
+        return 1 + Math.log(document[vocabIdx]);
+    }
+    idf(vocabIdx) {
+        return (1 + Math.log(this.documents.length / this.documentFrequency[vocabIdx]));
+    }
+}
+exports.TfidfVectorizer = TfidfVectorizer;
```
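Taken together, these modules form a vectorize-then-cluster pipeline; the new MLTraceSimilarityStrategy (whose body is not shown in this diff) presumably wires them up in roughly this way. A hedged end-to-end sketch with hypothetical documents and thresholds, not the strategy's actual code:

```ts
import {TfidfVectorizer} from './TfidfVectorizer';
import {distance} from './DistanceMatrix';
import {cluster} from './HAC';

// Hypothetical inputs: one stringified retainer trace per document.
const rawDocuments = [
  'Window Timeout LeakedObject',
  'Window Timeout LeakedObject',
  'Window Map DetachedHTMLDivElement',
];

// Passing maxDF explicitly avoids relying on the config default.
const vectorizer = new TfidfVectorizer({rawDocuments, maxDF: 0.95});
const tfidfs = vectorizer.computeTfidfs();

// computeTfidfs appends a synthetic smoothing document internally, but
// buildTfidfs filters it back out, so tfidfs.length === rawDocuments.length.
const labels = cluster(rawDocuments.length, distance(tfidfs), 0.6);
// Documents sharing a root label belong to the same trace cluster.
console.log(labels);
```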