@memlab/core 1.1.5 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/Config.d.ts +4 -1
- package/dist/lib/Config.js +7 -1
- package/dist/lib/HeapAnalyzer.js +6 -1
- package/dist/lib/NodeHeap.d.ts +20 -0
- package/dist/lib/NodeHeap.js +20 -0
- package/dist/lib/PackageInfoLoader.js +1 -1
- package/dist/lib/Serializer.js +48 -25
- package/dist/lib/Types.d.ts +61 -4
- package/dist/trace-cluster/TraceBucket.js +6 -1
- package/dist/trace-cluster/strategies/MLTraceSimilarityStrategy.d.ts +15 -0
- package/dist/trace-cluster/strategies/MLTraceSimilarityStrategy.js +61 -0
- package/dist/trace-cluster/strategies/machine-learning/DistanceMatrix.d.ts +11 -0
- package/dist/trace-cluster/strategies/machine-learning/DistanceMatrix.js +54 -0
- package/dist/trace-cluster/strategies/machine-learning/HAC.d.ts +17 -0
- package/dist/trace-cluster/strategies/machine-learning/HAC.js +124 -0
- package/dist/trace-cluster/strategies/machine-learning/Ngram.d.ts +11 -0
- package/dist/trace-cluster/strategies/machine-learning/Ngram.js +22 -0
- package/dist/trace-cluster/strategies/machine-learning/TfidfVectorizer.d.ts +38 -0
- package/dist/trace-cluster/strategies/machine-learning/TfidfVectorizer.js +140 -0
- package/package.json +1 -1
package/dist/lib/Config.d.ts
CHANGED
|
@@ -52,7 +52,6 @@ export declare class MemLabConfig {
|
|
|
52
52
|
_scenario: Optional<IScenario>;
|
|
53
53
|
_isHeadfulBrowser: boolean;
|
|
54
54
|
_browser: string;
|
|
55
|
-
_packageInfo: IPackageInfo[];
|
|
56
55
|
snapshotHasDetachedness: boolean;
|
|
57
56
|
specifiedEngine: boolean;
|
|
58
57
|
verbose: boolean;
|
|
@@ -176,9 +175,13 @@ export declare class MemLabConfig {
|
|
|
176
175
|
externalLeakFilter?: Optional<ILeakFilter>;
|
|
177
176
|
monoRepoDir: string;
|
|
178
177
|
muteConsole: boolean;
|
|
178
|
+
includeObjectInfoInTraceReturnChain: boolean;
|
|
179
179
|
logUnclassifiedClusters: boolean;
|
|
180
180
|
errorHandling: ErrorHandling;
|
|
181
181
|
clusterStrategy: Optional<IClusterStrategy>;
|
|
182
|
+
packageInfo: IPackageInfo[];
|
|
183
|
+
isMLClustering: boolean;
|
|
184
|
+
mlClusteringLinkageMaxDistance: number;
|
|
182
185
|
constructor(options?: ConfigOption);
|
|
183
186
|
private initInternalConfigs;
|
|
184
187
|
private init;
|
package/dist/lib/Config.js
CHANGED
|
@@ -98,7 +98,7 @@ class MemLabConfig {
|
|
|
98
98
|
// the default browser (Chromium)
|
|
99
99
|
this._browser = 'chrome';
|
|
100
100
|
// a list of package information
|
|
101
|
-
this.
|
|
101
|
+
this.packageInfo = [];
|
|
102
102
|
// a set of additional GKs to be enabled
|
|
103
103
|
this.addEnableGK = new Set();
|
|
104
104
|
// a set of additional GKs to be disabled
|
|
@@ -119,8 +119,14 @@ class MemLabConfig {
|
|
|
119
119
|
this.muteConsole = false;
|
|
120
120
|
// log all leak traces, each as an unclassified cluster
|
|
121
121
|
this.logUnclassifiedClusters = false;
|
|
122
|
+
// If true, the detailed JSON file of each representative
|
|
123
|
+
// trace (for visualization) will include detailed object
|
|
124
|
+
// info for each Fiber node on the return chain.
|
|
125
|
+
// This may bloat the trace size from 100KB to 50MB.
|
|
126
|
+
this.includeObjectInfoInTraceReturnChain = false;
|
|
122
127
|
// by default halt the program when utils.haltOrThrow is calleds
|
|
123
128
|
this.errorHandling = ErrorHandling.Halt;
|
|
129
|
+
this.mlClusteringLinkageMaxDistance = 0.7;
|
|
124
130
|
}
|
|
125
131
|
// initialize configurable parameters
|
|
126
132
|
init(options = {}) {
|
package/dist/lib/HeapAnalyzer.js
CHANGED
|
@@ -33,6 +33,7 @@ const Console_1 = __importDefault(require("./Console"));
|
|
|
33
33
|
const Serializer_1 = __importDefault(require("./Serializer"));
|
|
34
34
|
const Utils_1 = __importDefault(require("./Utils"));
|
|
35
35
|
const LeakObjectFilter_1 = require("./leak-filters/LeakObjectFilter");
|
|
36
|
+
const MLTraceSimilarityStrategy_1 = __importDefault(require("../trace-cluster/strategies/MLTraceSimilarityStrategy"));
|
|
36
37
|
class MemoryAnalyst {
|
|
37
38
|
checkLeak() {
|
|
38
39
|
return __awaiter(this, void 0, void 0, function* () {
|
|
@@ -630,7 +631,11 @@ class MemoryAnalyst {
|
|
|
630
631
|
Console_1.default.midLevel(`${numOfLeakedObjects} leaked objects`);
|
|
631
632
|
}
|
|
632
633
|
// cluster traces from the current run
|
|
633
|
-
const clusters = TraceBucket_1.default.clusterPaths(paths, snapshot, this.aggregateDominatorMetrics
|
|
634
|
+
const clusters = TraceBucket_1.default.clusterPaths(paths, snapshot, this.aggregateDominatorMetrics, {
|
|
635
|
+
strategy: Config_1.default.isMLClustering
|
|
636
|
+
? new MLTraceSimilarityStrategy_1.default()
|
|
637
|
+
: undefined,
|
|
638
|
+
});
|
|
634
639
|
yield this.serializeClusterUpdate(clusters);
|
|
635
640
|
if (Config_1.default.logUnclassifiedClusters) {
|
|
636
641
|
// cluster traces from the current run
|
package/dist/lib/NodeHeap.d.ts
CHANGED
|
@@ -48,6 +48,26 @@ import type { IHeapSnapshot } from './Types';
|
|
|
48
48
|
* ```
|
|
49
49
|
*/
|
|
50
50
|
export declare function tagObject<T extends object>(o: T, tag: string): T;
|
|
51
|
+
/**
|
|
52
|
+
* Take a heap snapshot of the current program state and save it as a
|
|
53
|
+
* `.heapsnapshot` file under a randomly generated folder inside the system's
|
|
54
|
+
* temp folder.
|
|
55
|
+
*
|
|
56
|
+
* **Note**: All `.heapsnapshot` files could also be loaded by Chrome DevTools.
|
|
57
|
+
* @returns the absolute file path to the saved `.heapsnapshot` file.
|
|
58
|
+
*
|
|
59
|
+
* * **Examples**:
|
|
60
|
+
* ```typescript
|
|
61
|
+
* import type {IHeapSnapshot} from '@memlab/core';
|
|
62
|
+
* import {dumpNodeHeapSnapshot} from '@memlab/core';
|
|
63
|
+
* import {getHeapFromFile} from '@memlab/heap-analysis';
|
|
64
|
+
*
|
|
65
|
+
* (async function () {
|
|
66
|
+
* const heapFile = dumpNodeHeapSnapshot();
|
|
67
|
+
* const heap: IHeapSnapshot = await getHeapFromFile(heapFile);
|
|
68
|
+
* })();
|
|
69
|
+
* ```
|
|
70
|
+
*/
|
|
51
71
|
export declare function dumpNodeHeapSnapshot(): string;
|
|
52
72
|
/**
|
|
53
73
|
* Take a heap snapshot of the current program state
|
package/dist/lib/NodeHeap.js
CHANGED
|
@@ -80,6 +80,26 @@ function tagObject(o, tag) {
|
|
|
80
80
|
return o;
|
|
81
81
|
}
|
|
82
82
|
exports.tagObject = tagObject;
|
|
83
|
+
/**
|
|
84
|
+
* Take a heap snapshot of the current program state and save it as a
|
|
85
|
+
* `.heapsnapshot` file under a randomly generated folder inside the system's
|
|
86
|
+
* temp folder.
|
|
87
|
+
*
|
|
88
|
+
* **Note**: All `.heapsnapshot` files could also be loaded by Chrome DevTools.
|
|
89
|
+
* @returns the absolute file path to the saved `.heapsnapshot` file.
|
|
90
|
+
*
|
|
91
|
+
* * **Examples**:
|
|
92
|
+
* ```typescript
|
|
93
|
+
* import type {IHeapSnapshot} from '@memlab/core';
|
|
94
|
+
* import {dumpNodeHeapSnapshot} from '@memlab/core';
|
|
95
|
+
* import {getHeapFromFile} from '@memlab/heap-analysis';
|
|
96
|
+
*
|
|
97
|
+
* (async function () {
|
|
98
|
+
* const heapFile = dumpNodeHeapSnapshot();
|
|
99
|
+
* const heap: IHeapSnapshot = await getHeapFromFile(heapFile);
|
|
100
|
+
* })();
|
|
101
|
+
* ```
|
|
102
|
+
*/
|
|
83
103
|
function dumpNodeHeapSnapshot() {
|
|
84
104
|
const file = path_1.default.join(FileManager_1.default.generateTmpHeapDir(), `nodejs.heapsnapshot`);
|
|
85
105
|
v8_1.default.writeHeapSnapshot(file);
|
|
@@ -57,7 +57,7 @@ class PackageInfoLoader {
|
|
|
57
57
|
if (!PackageInfoLoader.registeredPackages.has(packageDirectory)) {
|
|
58
58
|
PackageInfoLoader.registeredPackages.add(packageDirectory);
|
|
59
59
|
const packageInfo = yield PackageInfoLoader.loadFrom(packageDirectory);
|
|
60
|
-
Config_1.default.
|
|
60
|
+
Config_1.default.packageInfo.push(packageInfo);
|
|
61
61
|
}
|
|
62
62
|
});
|
|
63
63
|
}
|
package/dist/lib/Serializer.js
CHANGED
|
@@ -99,7 +99,7 @@ function JSONifyDetachedHTMLElement(node, args, options) {
|
|
|
99
99
|
// options for elem.__reactProps$xxx
|
|
100
100
|
const propsOptions = Object.assign({}, options);
|
|
101
101
|
propsOptions.forceJSONifyDepth = 1;
|
|
102
|
-
|
|
102
|
+
iterateSelectedEdges(node, (edge) => {
|
|
103
103
|
const key = JSONifyEdgeNameAndType(edge);
|
|
104
104
|
if (Utils_1.default.isReactFiberEdge(edge)) {
|
|
105
105
|
info[key] = JSONifyNode(edge.toNode, args, fiberOptions);
|
|
@@ -110,7 +110,8 @@ function JSONifyDetachedHTMLElement(node, args, options) {
|
|
|
110
110
|
else {
|
|
111
111
|
info[key] = JSONifyNodeInShort(edge.toNode);
|
|
112
112
|
}
|
|
113
|
-
|
|
113
|
+
return null;
|
|
114
|
+
});
|
|
114
115
|
return info;
|
|
115
116
|
}
|
|
116
117
|
function calculateReturnTrace(node, cache) {
|
|
@@ -128,15 +129,16 @@ function calculateReturnTrace(node, cache) {
|
|
|
128
129
|
const objectNodeUsefulProps = new Set(['_context']);
|
|
129
130
|
function JSONifyNodeOneLevel(node) {
|
|
130
131
|
const info = Object.create(null);
|
|
131
|
-
|
|
132
|
+
iterateSelectedEdges(node, (edge) => {
|
|
132
133
|
const key = JSONifyEdgeNameAndType(edge);
|
|
133
134
|
info[key] = JSONifyNodeShallow(edge.toNode);
|
|
134
|
-
|
|
135
|
+
return null;
|
|
136
|
+
});
|
|
135
137
|
return info;
|
|
136
138
|
}
|
|
137
139
|
function JSONifyNodeShallow(node) {
|
|
138
140
|
const info = Object.create(null);
|
|
139
|
-
|
|
141
|
+
iterateSelectedEdges(node, (edge) => {
|
|
140
142
|
const key = JSONifyEdgeNameAndType(edge);
|
|
141
143
|
if (objectNodeUsefulProps.has(edge.name_or_index)) {
|
|
142
144
|
info[key] = JSONifyNodeShallow(edge.toNode);
|
|
@@ -144,7 +146,8 @@ function JSONifyNodeShallow(node) {
|
|
|
144
146
|
else {
|
|
145
147
|
info[key] = JSONifyNodeInShort(edge.toNode);
|
|
146
148
|
}
|
|
147
|
-
|
|
149
|
+
return null;
|
|
150
|
+
});
|
|
148
151
|
return info;
|
|
149
152
|
}
|
|
150
153
|
const fiberNodeUsefulProps = new Set([
|
|
@@ -154,15 +157,17 @@ const fiberNodeUsefulProps = new Set([
|
|
|
154
157
|
]);
|
|
155
158
|
function JSONifyFiberNodeShallow(node) {
|
|
156
159
|
const info = Object.create(null);
|
|
157
|
-
|
|
160
|
+
iterateSelectedEdges(node, (edge) => {
|
|
158
161
|
const key = JSONifyEdgeNameAndType(edge);
|
|
159
162
|
if (fiberNodeUsefulProps.has(edge.name_or_index) &&
|
|
160
163
|
Utils_1.default.isObjectNode(edge.toNode)) {
|
|
161
164
|
info[key] = JSONifyNodeShallow(edge.toNode);
|
|
162
|
-
continue;
|
|
163
165
|
}
|
|
164
|
-
|
|
165
|
-
|
|
166
|
+
else {
|
|
167
|
+
info[key] = JSONifyNodeInShort(edge.toNode);
|
|
168
|
+
}
|
|
169
|
+
return null;
|
|
170
|
+
});
|
|
166
171
|
return info;
|
|
167
172
|
}
|
|
168
173
|
// calculate the summary of return chain of the FiberNode
|
|
@@ -187,7 +192,9 @@ function JSONifyFiberNodeReturnTrace(node, args, options) {
|
|
|
187
192
|
}
|
|
188
193
|
const parentInfo = getNodeNameInJSON(parent, args);
|
|
189
194
|
key = `${key}: --return (property)---> ${parentInfo}`;
|
|
190
|
-
const info =
|
|
195
|
+
const info = Config_1.default.includeObjectInfoInTraceReturnChain
|
|
196
|
+
? JSONifyFiberNodeShallow(parent)
|
|
197
|
+
: Object.create(null);
|
|
191
198
|
trace[key] = info;
|
|
192
199
|
}
|
|
193
200
|
return trace;
|
|
@@ -206,25 +213,27 @@ function JSONifyFiberNode(node, args, options) {
|
|
|
206
213
|
propsOptions.forceJSONifyDepth = 1;
|
|
207
214
|
}
|
|
208
215
|
propsOptions.forceJSONifyDepth--;
|
|
209
|
-
|
|
216
|
+
iterateSelectedEdges(node, (edge) => {
|
|
210
217
|
const key = JSONifyEdgeNameAndType(edge);
|
|
211
218
|
info[key] =
|
|
212
|
-
propsOptions.forceJSONifyDepth >= 1
|
|
219
|
+
propsOptions.forceJSONifyDepth && propsOptions.forceJSONifyDepth >= 1
|
|
213
220
|
? JSONifyNode(edge.toNode, args, propsOptions)
|
|
214
221
|
: JSONifyNodeInShort(edge.toNode);
|
|
215
|
-
|
|
222
|
+
return null;
|
|
223
|
+
});
|
|
216
224
|
return info;
|
|
217
225
|
}
|
|
218
226
|
function JSONifyClosure(node, args, options) {
|
|
219
227
|
const info = Object.create(null);
|
|
220
|
-
|
|
228
|
+
iterateSelectedEdges(node, (edge) => {
|
|
221
229
|
if (edge.name_or_index === 'shared' ||
|
|
222
230
|
edge.name_or_index === 'context' ||
|
|
223
231
|
edge.name_or_index === 'displayName') {
|
|
224
232
|
const key = filterJSONPropName(edge.name_or_index);
|
|
225
233
|
info[key] = JSONifyNode(edge.toNode, args, options);
|
|
226
234
|
}
|
|
227
|
-
|
|
235
|
+
return null;
|
|
236
|
+
});
|
|
228
237
|
return info;
|
|
229
238
|
}
|
|
230
239
|
function JSONifyNumberNode(node,
|
|
@@ -238,7 +247,7 @@ _options) {
|
|
|
238
247
|
}
|
|
239
248
|
function JSONifyCode(node, args, options) {
|
|
240
249
|
const info = Object.create(null);
|
|
241
|
-
|
|
250
|
+
iterateSelectedEdges(node, (edge) => {
|
|
242
251
|
if (edge.name_or_index === 'name_or_scope_info' &&
|
|
243
252
|
edge.toNode.name === '(function scope info)') {
|
|
244
253
|
const key = 'variables with non-number values in closure scope chain';
|
|
@@ -251,14 +260,15 @@ function JSONifyCode(node, args, options) {
|
|
|
251
260
|
const key = filterJSONPropName(edge.name_or_index);
|
|
252
261
|
info[key] = JSONifyNode(edge.toNode, args, options);
|
|
253
262
|
}
|
|
254
|
-
|
|
263
|
+
return null;
|
|
264
|
+
});
|
|
255
265
|
return info;
|
|
256
266
|
}
|
|
257
267
|
function JSONifyContext(node, args, options) {
|
|
258
268
|
const info = Object.create(null);
|
|
259
269
|
const key = 'variables in scope (used by nested closures)';
|
|
260
270
|
const closure_vars = (info[key] = Object.create(null));
|
|
261
|
-
|
|
271
|
+
iterateSelectedEdges(node, (edge) => {
|
|
262
272
|
const key = filterJSONPropName(edge.name_or_index);
|
|
263
273
|
if (edge.type === 'context') {
|
|
264
274
|
closure_vars[key] = JSONifyNodeInShort(edge.toNode);
|
|
@@ -266,15 +276,27 @@ function JSONifyContext(node, args, options) {
|
|
|
266
276
|
else if (edge.type === '') {
|
|
267
277
|
info[key] = JSONifyNode(edge.toNode, args, options);
|
|
268
278
|
}
|
|
269
|
-
|
|
279
|
+
return null;
|
|
280
|
+
});
|
|
270
281
|
return info;
|
|
271
282
|
}
|
|
283
|
+
function iterateSelectedEdges(node, callback) {
|
|
284
|
+
let edgesProcessed = 0;
|
|
285
|
+
node.forEachReference((edge) => {
|
|
286
|
+
if (edge.type === 'internal') {
|
|
287
|
+
if (edge.name_or_index === 'map' || edge.is_index) {
|
|
288
|
+
return;
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
if (edgesProcessed++ > 100) {
|
|
292
|
+
return { stop: true };
|
|
293
|
+
}
|
|
294
|
+
return callback(edge);
|
|
295
|
+
});
|
|
296
|
+
}
|
|
272
297
|
function JSONifyOrdinaryValue(node, args, options) {
|
|
273
298
|
const info = Object.create(null);
|
|
274
|
-
|
|
275
|
-
if (edge.name_or_index === 'map' && edge.type === 'internal') {
|
|
276
|
-
continue;
|
|
277
|
-
}
|
|
299
|
+
iterateSelectedEdges(node, (edge) => {
|
|
278
300
|
const key = JSONifyEdgeNameAndType(edge);
|
|
279
301
|
const toNode = edge.toNode;
|
|
280
302
|
const toNodeName = toNode.name;
|
|
@@ -293,7 +315,8 @@ function JSONifyOrdinaryValue(node, args, options) {
|
|
|
293
315
|
else {
|
|
294
316
|
info[key] = JSONifyNodeInShort(toNode);
|
|
295
317
|
}
|
|
296
|
-
|
|
318
|
+
return null;
|
|
319
|
+
});
|
|
297
320
|
return info;
|
|
298
321
|
}
|
|
299
322
|
function JSONifyNode(node, args, options) {
|
package/dist/lib/Types.d.ts
CHANGED
|
@@ -32,7 +32,6 @@ export declare type AnyOptions = Record<string, unknown>;
|
|
|
32
32
|
export declare type UnusedOptions = Record<string, never>;
|
|
33
33
|
/** @internal */
|
|
34
34
|
export declare type Command = [string, string[], AnyOptions];
|
|
35
|
-
export declare type Predicator<T> = (node: T) => boolean;
|
|
36
35
|
/** @internal */
|
|
37
36
|
export declare type HeapNodeIdSet = Set<number>;
|
|
38
37
|
/** @internal */
|
|
@@ -86,6 +85,22 @@ export declare type CLIArgs = {
|
|
|
86
85
|
'local-puppeteer': boolean;
|
|
87
86
|
'snapshot-dir': string;
|
|
88
87
|
};
|
|
88
|
+
/**
|
|
89
|
+
* the predicate callback is used to decide if a
|
|
90
|
+
* entity of type `T`.
|
|
91
|
+
* For more concrete examples on where it is used,
|
|
92
|
+
* check out {@link findAnyReference}, {@link findAnyReferrer},
|
|
93
|
+
* and {@link findReferrers}.
|
|
94
|
+
*
|
|
95
|
+
* @typeParam T - the type of the entity to be checked
|
|
96
|
+
* @param entity - the entity to be checked
|
|
97
|
+
* @returns whether the entity passes the predicate check
|
|
98
|
+
*/
|
|
99
|
+
export declare type Predicator<T> = (entity: T) => boolean;
|
|
100
|
+
/**
|
|
101
|
+
* Data structure for holding cookies.
|
|
102
|
+
* For concrete example, check out {@link cookies}.
|
|
103
|
+
*/
|
|
89
104
|
export declare type Cookies = Array<{
|
|
90
105
|
name: string;
|
|
91
106
|
value: string;
|
|
@@ -288,9 +303,10 @@ export interface ILeakFilter {
|
|
|
288
303
|
/**
|
|
289
304
|
* Lifecycle function callback that is invoked initially once before calling any
|
|
290
305
|
* leak filter function.
|
|
306
|
+
* For concrete example, check out {@link beforeLeakFilter}.
|
|
291
307
|
*
|
|
292
|
-
* @param
|
|
293
|
-
* @param leakedNodeIds
|
|
308
|
+
* @param snapshot heap snapshot see {@link IHeapSnapshot}
|
|
309
|
+
* @param leakedNodeIds the set of leaked object (node) ids.
|
|
294
310
|
*/
|
|
295
311
|
export declare type InitLeakFilterCallback = (snapshot: IHeapSnapshot, leakedNodeIds: HeapNodeIdSet) => void;
|
|
296
312
|
/**
|
|
@@ -299,6 +315,8 @@ export declare type InitLeakFilterCallback = (snapshot: IHeapSnapshot, leakedNod
|
|
|
299
315
|
* allocated but not released from the target interaction
|
|
300
316
|
* in the heap snapshot.
|
|
301
317
|
*
|
|
318
|
+
* For concrete examples, check out {@link leakFilter}.
|
|
319
|
+
*
|
|
302
320
|
* @param node - the node that is kept alive in the memory in the heap snapshot
|
|
303
321
|
* @param snapshot - the snapshot of target interaction
|
|
304
322
|
* @param leakedNodeIds - the set of leaked node ids
|
|
@@ -317,6 +335,11 @@ export declare type LeakFilterCallback = (node: IHeapNode, snapshot: IHeapSnapsh
|
|
|
317
335
|
/**
|
|
318
336
|
* The callback defines browser interactions which are
|
|
319
337
|
* used by memlab to interact with the web app under test.
|
|
338
|
+
* For concrete examples, check out {@link action} or {@link back}.
|
|
339
|
+
*
|
|
340
|
+
* @param page the puppeteer [`Page`](https://pptr.dev/api/puppeteer.page)
|
|
341
|
+
* object, which provides APIs to interact with the web browser
|
|
342
|
+
* @returns no return value
|
|
320
343
|
*/
|
|
321
344
|
export declare type InteractionsCallback = (page: Page, args?: OperationArgs) => Promise<void>;
|
|
322
345
|
/**
|
|
@@ -684,6 +707,7 @@ export interface IDataBuilder {
|
|
|
684
707
|
}
|
|
685
708
|
/**
|
|
686
709
|
* Callback function to provide if the page is loaded.
|
|
710
|
+
* For concrete example, check out {@link isPageLoaded}.
|
|
687
711
|
* @param page - puppeteer's [Page](https://pptr.dev/api/puppeteer.page/) object.
|
|
688
712
|
* @returns a boolean value, if it returns `true`, memlab will consider
|
|
689
713
|
* the navigation completes, if it returns `false`, memlab will keep calling
|
|
@@ -738,16 +762,42 @@ export declare type E2EStepInfo = IE2EStepBasic & {
|
|
|
738
762
|
delay?: number;
|
|
739
763
|
metrics: Record<string, number>;
|
|
740
764
|
};
|
|
741
|
-
/**
|
|
765
|
+
/**
|
|
766
|
+
* This data structure contains the input configuration for the browser and
|
|
767
|
+
* output data from the browser. You can retrieve the instance of this type
|
|
768
|
+
* through {@link RunMetaInfo}.
|
|
769
|
+
*/
|
|
742
770
|
export interface IBrowserInfo {
|
|
771
|
+
/**
|
|
772
|
+
* browser version
|
|
773
|
+
*/
|
|
743
774
|
_browserVersion: string;
|
|
775
|
+
/**
|
|
776
|
+
* configuration for puppeteer
|
|
777
|
+
*/
|
|
744
778
|
_puppeteerConfig: LaunchOptions;
|
|
779
|
+
/**
|
|
780
|
+
* all web console output
|
|
781
|
+
*/
|
|
745
782
|
_consoleMessages: string[];
|
|
746
783
|
}
|
|
784
|
+
/**
|
|
785
|
+
* This data structure holds the information about memlab run.
|
|
786
|
+
* You can retrieve the instance of this type through {@link getRunMetaInfo}.
|
|
787
|
+
*/
|
|
747
788
|
export declare type RunMetaInfo = {
|
|
789
|
+
/** @internal */
|
|
748
790
|
app: string;
|
|
791
|
+
/** @internal */
|
|
749
792
|
interaction: string;
|
|
793
|
+
/**
|
|
794
|
+
* type of the memlab run
|
|
795
|
+
*/
|
|
750
796
|
type: string;
|
|
797
|
+
/**
|
|
798
|
+
* input configuration for the browser and
|
|
799
|
+
* output data from the browser
|
|
800
|
+
*/
|
|
751
801
|
browserInfo: IBrowserInfo;
|
|
752
802
|
};
|
|
753
803
|
/**
|
|
@@ -1162,6 +1212,13 @@ export interface IHeapNodeBasic {
|
|
|
1162
1212
|
*/
|
|
1163
1213
|
id: number;
|
|
1164
1214
|
}
|
|
1215
|
+
/**
|
|
1216
|
+
* Executes a provided callback once for JavaScript references.
|
|
1217
|
+
* For concrete examples, check out {@link forEachReference}
|
|
1218
|
+
* or {@link forEachReferrer}.
|
|
1219
|
+
* @param callback the callback for each JavaScript reference from a collection
|
|
1220
|
+
* @returns this API returns void
|
|
1221
|
+
*/
|
|
1165
1222
|
export declare type EdgeIterationCallback = (edge: IHeapEdge) => Optional<{
|
|
1166
1223
|
stop: boolean;
|
|
1167
1224
|
}>;
|
|
@@ -20,6 +20,7 @@ const Utils_1 = __importDefault(require("../lib/Utils"));
|
|
|
20
20
|
const TraceElement_1 = require("./TraceElement");
|
|
21
21
|
const TraceSimilarityStrategy_1 = __importDefault(require("./strategies/TraceSimilarityStrategy"));
|
|
22
22
|
const TraceAsClusterStrategy_1 = __importDefault(require("./strategies/TraceAsClusterStrategy"));
|
|
23
|
+
const MLTraceSimilarityStrategy_1 = __importDefault(require("./strategies/MLTraceSimilarityStrategy"));
|
|
23
24
|
// sync up with html/intern/js/webspeed/memlab/lib/LeakCluster.js
|
|
24
25
|
class NormalizedTrace {
|
|
25
26
|
constructor(p = null, snapshot = null) {
|
|
@@ -157,7 +158,11 @@ class NormalizedTrace {
|
|
|
157
158
|
};
|
|
158
159
|
}
|
|
159
160
|
static clusterLeakTraces(leakTraces) {
|
|
160
|
-
const { allClusters } = NormalizedTrace.diffTraces(leakTraces, []
|
|
161
|
+
const { allClusters } = NormalizedTrace.diffTraces(leakTraces, [], {
|
|
162
|
+
strategy: Config_1.default.isMLClustering
|
|
163
|
+
? new MLTraceSimilarityStrategy_1.default()
|
|
164
|
+
: undefined,
|
|
165
|
+
});
|
|
161
166
|
const lastNodeFromTrace = (trace) => trace[trace.length - 1];
|
|
162
167
|
const labaledLeakTraces = allClusters.reduce((acc, bucket) => {
|
|
163
168
|
const lastNodeFromFirstTrace = lastNodeFromTrace(bucket[0]);
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*
|
|
7
|
+
* @emails oncall+ws_labs
|
|
8
|
+
* @format
|
|
9
|
+
*/
|
|
10
|
+
import type { IClusterStrategy, LeakTrace, TraceDiff } from '../../lib/Types';
|
|
11
|
+
export default class MLTraceSimilarityStrategy implements IClusterStrategy {
|
|
12
|
+
diffTraces(newLeakTraces: LeakTrace[]): TraceDiff;
|
|
13
|
+
traceToDoc(trace: LeakTrace): string;
|
|
14
|
+
}
|
|
15
|
+
//# sourceMappingURL=MLTraceSimilarityStrategy.d.ts.map
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
4
|
+
*
|
|
5
|
+
* This source code is licensed under the MIT license found in the
|
|
6
|
+
* LICENSE file in the root directory of this source tree.
|
|
7
|
+
*
|
|
8
|
+
* @emails oncall+ws_labs
|
|
9
|
+
* @format
|
|
10
|
+
*/
|
|
11
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
12
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
13
|
+
};
|
|
14
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
15
|
+
const Config_1 = __importDefault(require("../../lib/Config"));
|
|
16
|
+
const DistanceMatrix_1 = require("./machine-learning/DistanceMatrix");
|
|
17
|
+
const HAC_1 = require("./machine-learning/HAC");
|
|
18
|
+
const TfidfVectorizer_1 = require("./machine-learning/TfidfVectorizer");
|
|
19
|
+
class MLTraceSimilarityStrategy {
|
|
20
|
+
diffTraces(newLeakTraces) {
|
|
21
|
+
var _a;
|
|
22
|
+
const rawDocuments = newLeakTraces.map(this.traceToDoc);
|
|
23
|
+
const vectorizer = new TfidfVectorizer_1.TfidfVectorizer({ rawDocuments });
|
|
24
|
+
const tfidfs = vectorizer.computeTfidfs();
|
|
25
|
+
const dmatrix = (0, DistanceMatrix_1.distance)(tfidfs);
|
|
26
|
+
const result = (0, HAC_1.cluster)(rawDocuments.length, dmatrix, Config_1.default.mlClusteringLinkageMaxDistance);
|
|
27
|
+
const map = new Map();
|
|
28
|
+
for (let i = 0; i < result.length; i++) {
|
|
29
|
+
const traceIdx = result[i];
|
|
30
|
+
const repTrace = newLeakTraces[traceIdx];
|
|
31
|
+
const trace = newLeakTraces[i];
|
|
32
|
+
if (!map.has(repTrace)) {
|
|
33
|
+
map.set(repTrace, [repTrace]);
|
|
34
|
+
}
|
|
35
|
+
// to please lint
|
|
36
|
+
(_a = map.get(repTrace)) === null || _a === void 0 ? void 0 : _a.push(trace);
|
|
37
|
+
}
|
|
38
|
+
return {
|
|
39
|
+
allClusters: Array.from(map.values()),
|
|
40
|
+
staleClusters: [],
|
|
41
|
+
clustersToAdd: [],
|
|
42
|
+
};
|
|
43
|
+
}
|
|
44
|
+
traceToDoc(trace) {
|
|
45
|
+
const res = [];
|
|
46
|
+
for (const t of trace) {
|
|
47
|
+
let name = t.kind === 'node' ? String(t.name) : String(t.name_or_index);
|
|
48
|
+
if (name === '') {
|
|
49
|
+
name = '_null_';
|
|
50
|
+
}
|
|
51
|
+
name = name.replace(/ /g, '_');
|
|
52
|
+
name = name.replace(/\d/g, '');
|
|
53
|
+
if (name === '') {
|
|
54
|
+
name = '_number_';
|
|
55
|
+
}
|
|
56
|
+
res.push(name);
|
|
57
|
+
}
|
|
58
|
+
return res.join(' ');
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
exports.default = MLTraceSimilarityStrategy;
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*
|
|
7
|
+
* @emails oncall+ws_labs
|
|
8
|
+
* @format
|
|
9
|
+
*/
|
|
10
|
+
export declare const distance: (tfidfs: Record<string, number>[]) => Float32Array;
|
|
11
|
+
//# sourceMappingURL=DistanceMatrix.d.ts.map
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
4
|
+
*
|
|
5
|
+
* This source code is licensed under the MIT license found in the
|
|
6
|
+
* LICENSE file in the root directory of this source tree.
|
|
7
|
+
*
|
|
8
|
+
* @emails oncall+ws_labs
|
|
9
|
+
* @format
|
|
10
|
+
*/
|
|
11
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
+
exports.distance = void 0;
|
|
13
|
+
const cache = new Map();
|
|
14
|
+
const buildIntersection = (tfidfs, i, j) => {
|
|
15
|
+
const intersection = [];
|
|
16
|
+
if (!cache.has(i)) {
|
|
17
|
+
cache.set(i, Object.keys(tfidfs[i]));
|
|
18
|
+
}
|
|
19
|
+
if (!cache.has(j)) {
|
|
20
|
+
cache.set(j, Object.keys(tfidfs[j]));
|
|
21
|
+
}
|
|
22
|
+
const [keys, tfidf] = cache.get(i).length > cache.get(j).length
|
|
23
|
+
? [cache.get(j), tfidfs[i]]
|
|
24
|
+
: [cache.get(i), tfidfs[j]];
|
|
25
|
+
for (const k of keys) {
|
|
26
|
+
if (tfidf[k]) {
|
|
27
|
+
intersection.push(k);
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
return intersection;
|
|
31
|
+
};
|
|
32
|
+
const distance = (tfidfs) => {
|
|
33
|
+
const n = tfidfs.length;
|
|
34
|
+
const distances = new Float32Array((n * (n - 1)) / 2);
|
|
35
|
+
let distIdx = 0;
|
|
36
|
+
const dotProducs = tfidfs.map(atfidf => Object.values(atfidf).reduce((sum, v) => sum + v * v, 0));
|
|
37
|
+
for (let i = 0; i < tfidfs.length; i++) {
|
|
38
|
+
const a = tfidfs[i];
|
|
39
|
+
for (let j = i + 1; j < tfidfs.length; j++) {
|
|
40
|
+
const b = tfidfs[j];
|
|
41
|
+
const intersection = buildIntersection(tfidfs, i, j);
|
|
42
|
+
const dotProdOfCommons = intersection.reduce((sum, vidx) => sum + a[vidx] * b[vidx], 0);
|
|
43
|
+
// TODO make it pluggable to use other distance measures like euclidean, manhattan
|
|
44
|
+
const cosineSimilarity = 1 -
|
|
45
|
+
dotProdOfCommons /
|
|
46
|
+
(Math.sqrt(dotProducs[i]) / Math.sqrt(dotProducs[j]));
|
|
47
|
+
distances[distIdx] = cosineSimilarity;
|
|
48
|
+
distIdx++;
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
cache.clear();
|
|
52
|
+
return distances;
|
|
53
|
+
};
|
|
54
|
+
exports.distance = distance;
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*
|
|
7
|
+
* @emails oncall+ws_labs
|
|
8
|
+
* @format
|
|
9
|
+
*/
|
|
10
|
+
/**
|
|
11
|
+
*
|
|
12
|
+
* @param {*} nDocs number of docs
|
|
13
|
+
* @param {*} D condenced distance matrix
|
|
14
|
+
* @returns labels - list of doc ids as clusters
|
|
15
|
+
*/
|
|
16
|
+
export declare const cluster: (nDocs: number, condensedDistanceMatrix: Float32Array, maxDistanceThreshold: number) => number[];
|
|
17
|
+
//# sourceMappingURL=HAC.d.ts.map
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
4
|
+
*
|
|
5
|
+
* This source code is licensed under the MIT license found in the
|
|
6
|
+
* LICENSE file in the root directory of this source tree.
|
|
7
|
+
*
|
|
8
|
+
* @emails oncall+ws_labs
|
|
9
|
+
* @format
|
|
10
|
+
*/
|
|
11
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
+
exports.cluster = void 0;
|
|
13
|
+
const condensedIndex = (n, i, j) => {
|
|
14
|
+
if (i > j) {
|
|
15
|
+
return condensedIndex(n, j, i);
|
|
16
|
+
}
|
|
17
|
+
// to get distance between (i, j) think of this sequence.
|
|
18
|
+
// (n - 1) + (n - 2) + ... + (n - i) + (j - i) - 1
|
|
19
|
+
return n * i - (i * (i + 1)) / 2 + (j - i - 1);
|
|
20
|
+
};
|
|
21
|
+
const getRootLabel = (array, idx) => {
|
|
22
|
+
let rootIdx = idx;
|
|
23
|
+
while (array[rootIdx] !== rootIdx) {
|
|
24
|
+
rootIdx = array[rootIdx];
|
|
25
|
+
}
|
|
26
|
+
return rootIdx;
|
|
27
|
+
};
|
|
28
|
+
/**
 * Hierarchical agglomerative clustering (average linkage, per the size-weighted
 * distance update below) over a condensed pairwise distance matrix, driven by a
 * nearest-neighbor chain: follow nearest neighbors until two clusters are
 * mutually closest, merge them, and repeat.
 * @param {number} nDocs number of docs
 * @param {Float32Array|number[]} condensedDistanceMatrix condensed distance
 *     matrix (pairwise distances addressed via condensedIndex)
 * @param {number} maxDistanceThreshold merges costlier than this are rejected
 *     and both endpoints are retired from further clustering
 * @returns labels - list of doc ids as clusters (each doc maps to the root doc
 *     id of the cluster it ended up in)
 */
const cluster = (nDocs, condensedDistanceMatrix, maxDistanceThreshold) => {
    // Zero or one doc trivially forms a single cluster.
    if (nDocs <= 1)
        return [0];
    // The linkage update rewrites distances in place, so work on a copy.
    const condencedDistanceMatrixCopy = new Float32Array(condensedDistanceMatrix);
    // sizeOfClusters[i] === 0 marks doc i as merged away or dropped; every doc
    // starts out as its own singleton cluster.
    const sizeOfClusters = new Uint32Array(nDocs).fill(1);
    let chainLength = 0;
    let clusterChain = [];
    let traceAIdx = -1;
    let traceBIdx = -1;
    let currentMin = Number.MAX_SAFE_INTEGER;
    let distanceBetweenTraces;
    // labels[i] initially points at itself; each merge redirects one entry to
    // its surviving partner, forming a forest that getRootLabel resolves.
    const labels = Array(nDocs)
        .fill(0)
        .map((_, idx) => idx);
    // At most nDocs - 1 merges are possible.
    for (let k = 0; k < nDocs - 1; k++) {
        traceBIdx = -1;
        // Seed the chain with the first still-active cluster.
        if (chainLength === 0) {
            for (let i = 0; i < nDocs; i++) {
                if (sizeOfClusters[i] > 0) {
                    clusterChain[0] = i;
                    chainLength = 1;
                    break;
                }
            }
        }
        // Grow the chain by stepping to the nearest neighbor of its tail until
        // two clusters are mutually nearest.
        while (chainLength > 0) {
            traceAIdx = clusterChain[chainLength - 1];
            if (chainLength > 1) {
                // Start the search from the previous chain element so that
                // ties terminate the chain at a mutual nearest neighbor.
                traceBIdx = clusterChain[chainLength - 2];
                currentMin =
                    condencedDistanceMatrixCopy[condensedIndex(nDocs, traceAIdx, traceBIdx)];
            }
            else {
                currentMin = Number.MAX_SAFE_INTEGER;
            }
            // Scan all active clusters for the nearest neighbor of traceAIdx.
            for (let i = 0; i < nDocs; i++) {
                if (sizeOfClusters[i] == 0 || traceAIdx == i) {
                    continue;
                }
                distanceBetweenTraces =
                    condencedDistanceMatrixCopy[condensedIndex(nDocs, traceAIdx, i)];
                if (distanceBetweenTraces < currentMin) {
                    currentMin = distanceBetweenTraces;
                    traceBIdx = i;
                }
            }
            // to make sure we found a two mutual traces whose distance is smallest.
            if (chainLength > 1 &&
                traceBIdx !== -1 &&
                traceBIdx === clusterChain[chainLength - 2]) {
                break;
            }
            clusterChain[chainLength] = traceBIdx;
            chainLength = chainLength + 1;
        }
        // NOTE(review): the chain is discarded after every merge; the classic
        // nearest-neighbor-chain algorithm keeps the remaining prefix instead.
        clusterChain = [];
        chainLength = 0;
        // Reject merges above the threshold and retire both endpoints.
        // NOTE(review): if traceBIdx is still -1 here, the Uint32Array write to
        // index -1 is a silent no-op — confirm that is intended.
        if (currentMin > maxDistanceThreshold) {
            sizeOfClusters[traceAIdx] = 0;
            sizeOfClusters[traceBIdx] = 0;
            continue;
        }
        if (traceAIdx === -1 || traceBIdx === -1) {
            continue;
        }
        // Normalize the pair so the larger index (traceBIdx) survives.
        if (traceAIdx > traceBIdx) {
            [traceAIdx, traceBIdx] = [traceBIdx, traceAIdx];
        }
        const nx = sizeOfClusters[traceAIdx];
        const ny = sizeOfClusters[traceBIdx];
        labels[traceAIdx] = traceBIdx;
        sizeOfClusters[traceAIdx] = 0;
        sizeOfClusters[traceBIdx] = nx + ny;
        // Recompute the distance from every remaining cluster to the merged
        // cluster using the size-weighted (average-linkage) formula.
        for (let i = 0; i < nDocs; i++) {
            const ni = sizeOfClusters[i];
            if (ni === 0 || i === traceBIdx) {
                continue;
            }
            const d_xi = condencedDistanceMatrixCopy[condensedIndex(nDocs, i, traceAIdx)];
            const d_yi = condencedDistanceMatrixCopy[condensedIndex(nDocs, i, traceBIdx)];
            const size_x = nx;
            const size_y = ny;
            // TODO make it generic to support other linkage methods like complete, weighted etc...
            const updatedDist = (size_x * d_xi + size_y * d_yi) / (size_x + size_y);
            condencedDistanceMatrixCopy[condensedIndex(nDocs, i, traceBIdx)] =
                updatedDist;
        }
    }
    // Resolve every doc to the root of its label forest.
    return labels.map((_, idx) => getRootLabel(labels, idx));
};
exports.cluster = cluster;
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*
|
|
7
|
+
* @emails oncall+ws_labs
|
|
8
|
+
* @format
|
|
9
|
+
*/
|
|
10
|
+
/**
 * Builds the list of contiguous n-grams over `terms`: each output entry is
 * `n` consecutive terms joined by a single space. Returns an empty array when
 * `terms` has fewer than `n` elements.
 * @param n size of each n-gram window
 * @param terms ordered tokens to slide the window over
 * @returns the n-grams, in input order
 */
export declare function nGram(n: number, terms: string[]): string[];
|
|
11
|
+
//# sourceMappingURL=Ngram.d.ts.map
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
4
|
+
*
|
|
5
|
+
* This source code is licensed under the MIT license found in the
|
|
6
|
+
* LICENSE file in the root directory of this source tree.
|
|
7
|
+
*
|
|
8
|
+
* @emails oncall+ws_labs
|
|
9
|
+
* @format
|
|
10
|
+
*/
|
|
11
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
+
exports.nGram = void 0;
|
|
13
|
+
/**
 * Builds the contiguous n-grams of `terms`: every run of `n` consecutive
 * terms, joined by a single space. Yields an empty array when `terms` has
 * fewer than `n` elements.
 * @param n size of each n-gram window
 * @param terms ordered tokens to slide the window over
 * @returns the n-grams, in input order
 */
function nGram(n, terms) {
    const windowCount = terms.length - n + 1;
    const grams = [];
    for (let start = 0; start < windowCount; start++) {
        grams.push(terms.slice(start, start + n).join(' '));
    }
    return grams;
}
|
|
22
|
+
exports.nGram = nGram;
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*
|
|
7
|
+
* @emails oncall+ws_labs
|
|
8
|
+
* @format
|
|
9
|
+
*/
|
|
10
|
+
/** Construction options for {@link TfidfVectorizer}. */
interface TfidfVectorizerProps {
    /** Raw input documents; each is a string of space-separated terms. */
    rawDocuments: string[];
    /**
     * Maximum document-frequency ratio (0..1) a term may have before it is
     * dropped by limit(). Defaults to 0.8.
     */
    maxDF?: number;
}
/**
 * Turns raw text documents into L2-normalized tf-idf vectors. Terms are the
 * space-separated tokens of each document plus their 2-grams and 3-grams.
 */
export declare class TfidfVectorizer {
    rawDocuments: string[];
    /** term -> vocabulary index (the index is stored as a string). */
    vocabulary: Record<string, string>;
    /** vocabulary index -> number of documents containing that term. */
    documentFrequency: Record<string, number>;
    maxDF: number;
    /** Per-document term counts, keyed by vocabulary index. */
    documents: Record<string, number>[];
    /** Per-document normalized tf-idf weights, keyed by vocabulary index. */
    tfidfs: Record<string, number>[];
    constructor({ rawDocuments, maxDF }: TfidfVectorizerProps);
    /**
     * Runs the full pipeline (tokenize, vocabulary, term counts, high-DF
     * pruning, idf smoothing) and returns the cached tf-idf vectors.
     */
    computeTfidfs(): Record<string, number>[];
    /** Splits on single spaces and appends all 2-grams and 3-grams. */
    tokenize(text: string): string[];
    /** Assigns each distinct term a sequential index, stored as a string. */
    buildVocabulary(tokenizedDocuments: string[][]): Record<string, string>;
    /** Builds per-document term counts and tallies document frequency. */
    processDocuments(tokenizedDocuments: string[][]): void;
    /** Drops terms whose document frequency exceeds maxDF * corpus size. */
    limit(): void;
    /**
     * Smooth idf weights by adding 1 to document frequencies (DF), as if an extra
     * document was seen containing every term in the collection exactly once.
     * This prevents zero divisions.
     * */
    smooth(): void;
    /** Computes the L2-normalized tf-idf vector for every real document. */
    buildTfidfs(): Record<string, number>[];
    /** Sublinear term frequency: 1 + ln(count in `document`). */
    tf(vocabIdx: string, document: Record<string, number>): number;
    /** Smoothed inverse document frequency: 1 + ln(nDocs / DF). */
    idf(vocabIdx: string): number;
}
export {};
|
|
38
|
+
//# sourceMappingURL=TfidfVectorizer.d.ts.map
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
4
|
+
*
|
|
5
|
+
* This source code is licensed under the MIT license found in the
|
|
6
|
+
* LICENSE file in the root directory of this source tree.
|
|
7
|
+
*
|
|
8
|
+
* @emails oncall+ws_labs
|
|
9
|
+
* @format
|
|
10
|
+
*/
|
|
11
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
+
exports.TfidfVectorizer = void 0;
|
|
13
|
+
const Ngram_1 = require("./Ngram");
|
|
14
|
+
// Marker key for the artificial document appended by smooth(); buildTfidfs()
// recognizes it and emits no tf-idf vector for that document.
const SMOOTHING_KEY = '__smoothObjectKey';
// Vocabulary index given to documents left empty after limit() (all of their
// terms exceeded maxDF); sharing one index makes such docs cluster together.
const VOCAB_IDX_FOR_DOC_WITH_HIGH_DF = '-1';
/**
 * Turns raw text documents into L2-normalized tf-idf vectors. Terms are the
 * space-separated tokens of each document plus their 2-grams and 3-grams;
 * terms whose document frequency exceeds `maxDF` of the corpus are discarded.
 */
class TfidfVectorizer {
    /**
     * @param rawDocuments documents to vectorize; tokens separated by spaces
     * @param maxDF maximum document-frequency ratio (0..1) a term may have
     *     before limit() drops it; defaults to 0.8
     */
    constructor({ rawDocuments, maxDF = 0.8 }) {
        this.rawDocuments = [];
        this.vocabulary = Object.create(null);
        this.documentFrequency = Object.create(null);
        this.documents = [];
        this.rawDocuments = rawDocuments;
        this.maxDF = maxDF;
    }
    /**
     * Runs the full pipeline: tokenize -> vocabulary -> term counts ->
     * high-DF pruning -> idf smoothing -> tf-idf, caching the result.
     * @returns one map of vocab index -> normalized tf-idf weight per document
     */
    computeTfidfs() {
        const tokenizedDocuments = this.rawDocuments.map(this.tokenize);
        this.vocabulary = this.buildVocabulary(tokenizedDocuments);
        this.processDocuments(tokenizedDocuments);
        this.limit();
        this.smooth();
        this.tfidfs = this.buildTfidfs();
        return this.tfidfs;
    }
    /** Splits on single spaces and appends all 2-grams and 3-grams. */
    tokenize(text) {
        const terms = text.split(' ');
        return [...terms, ...(0, Ngram_1.nGram)(2, terms), ...(0, Ngram_1.nGram)(3, terms)];
    }
    /** Assigns each distinct term a sequential index, stored as a string. */
    buildVocabulary(tokenizedDocuments) {
        let vocabIdx = 0;
        const vocabulary = Object.create(null);
        tokenizedDocuments.forEach(doc => {
            doc.forEach(term => {
                if (!vocabulary[String(term)]) {
                    vocabulary[String(term)] = String(vocabIdx);
                    vocabIdx++;
                }
            });
        });
        return vocabulary;
    }
    /**
     * Builds per-document term-count maps (keyed by vocab index) and tallies
     * document frequency: each term counts once per document it appears in.
     */
    processDocuments(tokenizedDocuments) {
        tokenizedDocuments.forEach(terms => {
            const document = {};
            terms.forEach(t => {
                const vocabIdx = this.vocabulary[t];
                if (document[vocabIdx]) {
                    document[vocabIdx] += 1;
                }
                else {
                    // First occurrence in this document: bump DF as well.
                    if (this.documentFrequency[vocabIdx]) {
                        this.documentFrequency[vocabIdx] += 1;
                    }
                    else {
                        this.documentFrequency[vocabIdx] = 1;
                    }
                    document[vocabIdx] = 1;
                }
            });
            this.documents.push(document);
        });
    }
    /**
     * Removes terms whose document frequency exceeds maxDF * corpus size from
     * every document, from documentFrequency, and from the vocabulary.
     */
    limit() {
        const nMaxDF = Math.floor(this.documents.length * this.maxDF);
        const vocabIdxsToDelete = [];
        this.documents.forEach(doc => {
            Object.keys(doc).forEach(vocabIdx => {
                if (this.documentFrequency[vocabIdx] > nMaxDF) {
                    delete doc[vocabIdx];
                    vocabIdxsToDelete.push(vocabIdx);
                }
            });
        });
        const deletedIdxSet = new Set(vocabIdxsToDelete);
        deletedIdxSet.forEach(vocabIdx => {
            delete this.documentFrequency[vocabIdx];
        });
        // Bug fix: vocabulary maps term -> vocab index, so pruned entries must
        // be located by value. The previous `delete this.vocabulary[vocabIdx]`
        // used the index as the key, deleted nothing, and left smooth() to set
        // documentFrequency for pruned indexes to NaN (undefined + 1).
        Object.keys(this.vocabulary).forEach(term => {
            if (deletedIdxSet.has(this.vocabulary[term])) {
                delete this.vocabulary[term];
            }
        });
    }
    /**
     * Smooth idf weights by adding 1 to document frequencies (DF), as if an extra
     * document was seen containing every term in the collection exactly once.
     * This prevents zero divisions.
     * */
    smooth() {
        // for each vocabulary
        Object.values(this.vocabulary).forEach(vocabIdx => (this.documentFrequency[vocabIdx] =
            this.documentFrequency[vocabIdx] + 1));
        this.documents.push({ [SMOOTHING_KEY]: 1 });
    }
    /**
     * Computes the L2-normalized tf-idf vector for every real document; the
     * smoothing document added by smooth() produces no output entry.
     */
    buildTfidfs() {
        const tfidfs = [];
        this.documents.forEach(document => {
            // this means all the terms in the document are the terms
            // that have high document frequency.
            // This will make all the docs with high DF to be clustered together.
            if (Object.keys(document).length === 0) {
                tfidfs.push({ [VOCAB_IDX_FOR_DOC_WITH_HIGH_DF]: 1 });
                return;
            }
            if (!document[SMOOTHING_KEY]) {
                const atfidf = Object.keys(document).map(vocabIdx => {
                    return [vocabIdx, this.tf(vocabIdx, document) * this.idf(vocabIdx)];
                });
                // normalizing the values
                const dotSum = atfidf
                    .map(([_, tfidfValue]) => tfidfValue * tfidfValue)
                    .reduce((sum, tfidfValueSquared) => sum + tfidfValueSquared, 0);
                const dotSumSqrRoot = Math.sqrt(dotSum);
                // Normalizing tfidfs
                const atfidfVocabIdxValueObject = atfidf
                    .map(([vocabIdx, tfidfValue]) => [
                    vocabIdx,
                    tfidfValue / dotSumSqrRoot,
                ])
                    .reduce((obj, [vocabIdx, value]) => {
                    obj[vocabIdx] = value;
                    return obj;
                }, {});
                tfidfs.push(atfidfVocabIdxValueObject);
            }
        });
        return tfidfs;
    }
    /** Sublinear term frequency: 1 + ln(count of the term in `document`). */
    tf(vocabIdx, document) {
        return 1 + Math.log(document[vocabIdx]);
    }
    /** Smoothed inverse document frequency: 1 + ln(nDocs / DF). */
    idf(vocabIdx) {
        return (1 + Math.log(this.documents.length / this.documentFrequency[vocabIdx]));
    }
}
|
|
140
|
+
exports.TfidfVectorizer = TfidfVectorizer;
|