@memlab/core 1.1.3 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -7,6 +7,8 @@
7
7
  * @emails oncall+ws_labs
8
8
  * @format
9
9
  */
10
+ /** @internal */
11
+ export declare function registerPackage(): Promise<void>;
10
12
  export * from './lib/Types';
11
13
  /** @internal */
12
14
  export { default as config } from './lib/Config';
@@ -42,5 +44,7 @@ export { default as leakClusterLogger } from './logger/LeakClusterLogger';
42
44
  export { default as NormalizedTrace } from './trace-cluster/TraceBucket';
43
45
  /** @internal */
44
46
  export { default as EvaluationMetric } from './trace-cluster/EvalutationMetric';
47
+ /** @internal */
48
+ export * from './lib/PackageInfoLoader';
45
49
  export * from './lib/NodeHeap';
46
50
  //# sourceMappingURL=index.d.ts.map
package/dist/index.js CHANGED
@@ -22,11 +22,29 @@ var __createBinding = (this && this.__createBinding) || (Object.create ? (functi
22
22
  var __exportStar = (this && this.__exportStar) || function(m, exports) {
23
23
  for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
24
24
  };
25
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
26
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
27
+ return new (P || (P = Promise))(function (resolve, reject) {
28
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
29
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
30
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
31
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
32
+ });
33
+ };
25
34
  var __importDefault = (this && this.__importDefault) || function (mod) {
26
35
  return (mod && mod.__esModule) ? mod : { "default": mod };
27
36
  };
28
37
  Object.defineProperty(exports, "__esModule", { value: true });
29
- exports.EvaluationMetric = exports.NormalizedTrace = exports.leakClusterLogger = exports.ProcessManager = exports.modes = exports.constant = exports.analysis = exports.browserInfo = exports.serializer = exports.fileManager = exports.utils = exports.BaseOption = exports.info = exports.config = void 0;
38
+ exports.EvaluationMetric = exports.NormalizedTrace = exports.leakClusterLogger = exports.ProcessManager = exports.modes = exports.constant = exports.analysis = exports.browserInfo = exports.serializer = exports.fileManager = exports.utils = exports.BaseOption = exports.info = exports.config = exports.registerPackage = void 0;
39
+ const path_1 = __importDefault(require("path"));
40
+ const PackageInfoLoader_1 = require("./lib/PackageInfoLoader");
41
+ /** @internal */
42
+ function registerPackage() {
43
+ return __awaiter(this, void 0, void 0, function* () {
44
+ return PackageInfoLoader_1.PackageInfoLoader.registerPackage(path_1.default.join(__dirname, '..'));
45
+ });
46
+ }
47
+ exports.registerPackage = registerPackage;
30
48
  __exportStar(require("./lib/Types"), exports);
31
49
  /** @internal */
32
50
  var Config_1 = require("./lib/Config");
@@ -76,4 +94,6 @@ Object.defineProperty(exports, "NormalizedTrace", { enumerable: true, get: funct
76
94
  /** @internal */
77
95
  var EvalutationMetric_1 = require("./trace-cluster/EvalutationMetric");
78
96
  Object.defineProperty(exports, "EvaluationMetric", { enumerable: true, get: function () { return __importDefault(EvalutationMetric_1).default; } });
97
+ /** @internal */
98
+ __exportStar(require("./lib/PackageInfoLoader"), exports);
79
99
  __exportStar(require("./lib/NodeHeap"), exports);
@@ -8,7 +8,7 @@
8
8
  * @format
9
9
  */
10
10
  import type { LaunchOptions, Permission } from 'puppeteer';
11
- import type { AnyFunction, AnyValue, IClusterStrategy, IRunningMode, IScenario, Nullable, Optional, QuickExperiment, ILeakFilter } from './Types';
11
+ import type { AnyFunction, AnyValue, IClusterStrategy, IRunningMode, IScenario, Nullable, Optional, QuickExperiment, ILeakFilter, IPackageInfo } from './Types';
12
12
  interface BrowserLaunchArgumentOptions {
13
13
  headless?: boolean;
14
14
  userDataDir?: string;
@@ -44,14 +44,18 @@ export declare enum ErrorHandling {
44
44
  }
45
45
  /** @internal */
46
46
  export declare class MemLabConfig {
47
- snapshotHasDetachedness: boolean;
48
- specifiedEngine: boolean;
49
- verbose: boolean;
50
- jsEngine: string;
51
47
  _reportLeaksInTimers: boolean;
52
48
  _deviceManualOverridden: boolean;
53
49
  _timerNodes: string[];
54
50
  _timerEdges: string[];
51
+ _isFullRun: boolean;
52
+ _scenario: Optional<IScenario>;
53
+ _isHeadfulBrowser: boolean;
54
+ _browser: string;
55
+ snapshotHasDetachedness: boolean;
56
+ specifiedEngine: boolean;
57
+ verbose: boolean;
58
+ jsEngine: string;
55
59
  targetApp: string;
56
60
  targetTab: string;
57
61
  analysisMode: string;
@@ -94,7 +98,6 @@ export declare class MemLabConfig {
94
98
  puppeteerConfig: LaunchOptions & BrowserLaunchArgumentOptions & BrowserConnectOptions;
95
99
  openDevtoolsConsole: boolean;
96
100
  emulateDevice: Nullable<Device>;
97
- _browser: string;
98
101
  addEnableGK: Set<string>;
99
102
  addDisableGK: Set<string>;
100
103
  qes: QuickExperiment[];
@@ -169,15 +172,16 @@ export declare class MemLabConfig {
169
172
  oversizeObjectAsLeak: boolean;
170
173
  oversizeThreshold: number;
171
174
  clusterRetainedSizeThreshold: number;
172
- _isFullRun: boolean;
173
- _scenario: Optional<IScenario>;
174
- _isHeadfulBrowser: boolean;
175
175
  externalLeakFilter?: Optional<ILeakFilter>;
176
176
  monoRepoDir: string;
177
177
  muteConsole: boolean;
178
+ includeObjectInfoInTraceReturnChain: boolean;
178
179
  logUnclassifiedClusters: boolean;
179
180
  errorHandling: ErrorHandling;
180
181
  clusterStrategy: Optional<IClusterStrategy>;
182
+ packageInfo: IPackageInfo[];
183
+ isMLClustering: boolean;
184
+ mlClusteringLinkageMaxDistance: number;
181
185
  constructor(options?: ConfigOption);
182
186
  private initInternalConfigs;
183
187
  private init;
@@ -97,6 +97,8 @@ class MemLabConfig {
97
97
  this.jsEngine = Constant_1.default.defaultEngine;
98
98
  // the default browser (Chromium)
99
99
  this._browser = 'chrome';
100
+ // a list of package information
101
+ this.packageInfo = [];
100
102
  // a set of additional GKs to be enabled
101
103
  this.addEnableGK = new Set();
102
104
  // a set of additional GKs to be disabled
@@ -117,8 +119,14 @@ class MemLabConfig {
117
119
  this.muteConsole = false;
118
120
  // log all leak traces, each as an unclassified cluster
119
121
  this.logUnclassifiedClusters = false;
122
+ // If true, the detailed JSON file of each representative
123
+ // trace (for visualization) will include detailed object
124
+ // info for each Fiber node on the return chain.
125
+ // This may bloat the trace size from 100KB to 50MB.
126
+ this.includeObjectInfoInTraceReturnChain = false;
120
127
  // by default halt the program when utils.haltOrThrow is called
121
128
  this.errorHandling = ErrorHandling.Halt;
129
+ this.mlClusteringLinkageMaxDistance = 0.7;
122
130
  }
123
131
  // initialize configurable parameters
124
132
  init(options = {}) {
@@ -33,6 +33,7 @@ const Console_1 = __importDefault(require("./Console"));
33
33
  const Serializer_1 = __importDefault(require("./Serializer"));
34
34
  const Utils_1 = __importDefault(require("./Utils"));
35
35
  const LeakObjectFilter_1 = require("./leak-filters/LeakObjectFilter");
36
+ const MLTraceSimilarityStrategy_1 = __importDefault(require("../trace-cluster/strategies/MLTraceSimilarityStrategy"));
36
37
  class MemoryAnalyst {
37
38
  checkLeak() {
38
39
  return __awaiter(this, void 0, void 0, function* () {
@@ -630,7 +631,11 @@ class MemoryAnalyst {
630
631
  Console_1.default.midLevel(`${numOfLeakedObjects} leaked objects`);
631
632
  }
632
633
  // cluster traces from the current run
633
- const clusters = TraceBucket_1.default.clusterPaths(paths, snapshot, this.aggregateDominatorMetrics);
634
+ const clusters = TraceBucket_1.default.clusterPaths(paths, snapshot, this.aggregateDominatorMetrics, {
635
+ strategy: Config_1.default.isMLClustering
636
+ ? new MLTraceSimilarityStrategy_1.default()
637
+ : undefined,
638
+ });
634
639
  yield this.serializeClusterUpdate(clusters);
635
640
  if (Config_1.default.logUnclassifiedClusters) {
636
641
  // cluster traces from the current run
@@ -48,6 +48,26 @@ import type { IHeapSnapshot } from './Types';
48
48
  * ```
49
49
  */
50
50
  export declare function tagObject<T extends object>(o: T, tag: string): T;
51
+ /**
52
+ * Take a heap snapshot of the current program state and save it as a
53
+ * `.heapsnapshot` file under a randomly generated folder inside the system's
54
+ * temp folder.
55
+ *
56
+ * **Note**: All `.heapsnapshot` files could also be loaded by Chrome DevTools.
57
+ * @returns the absolute file path to the saved `.heapsnapshot` file.
58
+ *
59
+ * * **Examples**:
60
+ * ```typescript
61
+ * import type {IHeapSnapshot} from '@memlab/core';
62
+ * import {dumpNodeHeapSnapshot} from '@memlab/core';
63
+ * import {getHeapFromFile} from '@memlab/heap-analysis';
64
+ *
65
+ * (async function () {
66
+ * const heapFile = dumpNodeHeapSnapshot();
67
+ * const heap: IHeapSnapshot = await getHeapFromFile(heapFile);
68
+ * })();
69
+ * ```
70
+ */
51
71
  export declare function dumpNodeHeapSnapshot(): string;
52
72
  /**
53
73
  * Take a heap snapshot of the current program state
@@ -80,6 +80,26 @@ function tagObject(o, tag) {
80
80
  return o;
81
81
  }
82
82
  exports.tagObject = tagObject;
83
+ /**
84
+ * Take a heap snapshot of the current program state and save it as a
85
+ * `.heapsnapshot` file under a randomly generated folder inside the system's
86
+ * temp folder.
87
+ *
88
+ * **Note**: All `.heapsnapshot` files could also be loaded by Chrome DevTools.
89
+ * @returns the absolute file path to the saved `.heapsnapshot` file.
90
+ *
91
+ * * **Examples**:
92
+ * ```typescript
93
+ * import type {IHeapSnapshot} from '@memlab/core';
94
+ * import {dumpNodeHeapSnapshot} from '@memlab/core';
95
+ * import {getHeapFromFile} from '@memlab/heap-analysis';
96
+ *
97
+ * (async function () {
98
+ * const heapFile = dumpNodeHeapSnapshot();
99
+ * const heap: IHeapSnapshot = await getHeapFromFile(heapFile);
100
+ * })();
101
+ * ```
102
+ */
83
103
  function dumpNodeHeapSnapshot() {
84
104
  const file = path_1.default.join(FileManager_1.default.generateTmpHeapDir(), `nodejs.heapsnapshot`);
85
105
  v8_1.default.writeHeapSnapshot(file);
@@ -0,0 +1,7 @@
1
+ /** @internal */
2
+ export declare class PackageInfoLoader {
3
+ private static registeredPackages;
4
+ private static loadFrom;
5
+ static registerPackage(packageDirectory: string): Promise<void>;
6
+ }
7
+ //# sourceMappingURL=PackageInfoLoader.d.ts.map
@@ -0,0 +1,66 @@
1
+ "use strict";
2
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
+ return new (P || (P = Promise))(function (resolve, reject) {
5
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
9
+ });
10
+ };
11
+ var __importDefault = (this && this.__importDefault) || function (mod) {
12
+ return (mod && mod.__esModule) ? mod : { "default": mod };
13
+ };
14
+ Object.defineProperty(exports, "__esModule", { value: true });
15
+ exports.PackageInfoLoader = void 0;
16
+ /**
17
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
18
+ *
19
+ * This source code is licensed under the MIT license found in the
20
+ * LICENSE file in the root directory of this source tree.
21
+ *
22
+ * @emails oncall+ws_labs
23
+ * @format
24
+ */
25
+ const fs_extra_1 = __importDefault(require("fs-extra"));
26
+ const path_1 = __importDefault(require("path"));
27
+ const Config_1 = __importDefault(require("./Config"));
28
+ const Utils_1 = __importDefault(require("./Utils"));
29
+ /** @internal */
30
+ class PackageInfoLoader {
31
+ static loadFrom(packageDirectory) {
32
+ return __awaiter(this, void 0, void 0, function* () {
33
+ let exists = yield fs_extra_1.default.pathExists(packageDirectory);
34
+ if (!exists) {
35
+ throw Utils_1.default.haltOrThrow(`package directory doesn't exist: ${packageDirectory}`);
36
+ }
37
+ let packageJSONFile = path_1.default.join(packageDirectory, 'package-oss.json');
38
+ exists = yield fs_extra_1.default.pathExists(packageJSONFile);
39
+ if (!exists) {
40
+ packageJSONFile = path_1.default.join(packageDirectory, 'package.json');
41
+ }
42
+ exists = yield fs_extra_1.default.pathExists(packageJSONFile);
43
+ if (!exists) {
44
+ throw Utils_1.default.haltOrThrow(`package.json doesn't exist: ${packageJSONFile}`);
45
+ }
46
+ try {
47
+ const metaData = yield fs_extra_1.default.readJSON(packageJSONFile, 'UTF-8');
48
+ return Object.assign(Object.assign({}, metaData), { packageLocation: packageDirectory });
49
+ }
50
+ catch (ex) {
51
+ throw Utils_1.default.haltOrThrow(Utils_1.default.getError(ex));
52
+ }
53
+ });
54
+ }
55
+ static registerPackage(packageDirectory) {
56
+ return __awaiter(this, void 0, void 0, function* () {
57
+ if (!PackageInfoLoader.registeredPackages.has(packageDirectory)) {
58
+ PackageInfoLoader.registeredPackages.add(packageDirectory);
59
+ const packageInfo = yield PackageInfoLoader.loadFrom(packageDirectory);
60
+ Config_1.default.packageInfo.push(packageInfo);
61
+ }
62
+ });
63
+ }
64
+ }
65
+ exports.PackageInfoLoader = PackageInfoLoader;
66
+ PackageInfoLoader.registeredPackages = new Set();
@@ -99,7 +99,7 @@ function JSONifyDetachedHTMLElement(node, args, options) {
99
99
  // options for elem.__reactProps$xxx
100
100
  const propsOptions = Object.assign({}, options);
101
101
  propsOptions.forceJSONifyDepth = 1;
102
- for (const edge of node.references) {
102
+ iterateSelectedEdges(node, (edge) => {
103
103
  const key = JSONifyEdgeNameAndType(edge);
104
104
  if (Utils_1.default.isReactFiberEdge(edge)) {
105
105
  info[key] = JSONifyNode(edge.toNode, args, fiberOptions);
@@ -110,7 +110,8 @@ function JSONifyDetachedHTMLElement(node, args, options) {
110
110
  else {
111
111
  info[key] = JSONifyNodeInShort(edge.toNode);
112
112
  }
113
- }
113
+ return null;
114
+ });
114
115
  return info;
115
116
  }
116
117
  function calculateReturnTrace(node, cache) {
@@ -128,15 +129,16 @@ function calculateReturnTrace(node, cache) {
128
129
  const objectNodeUsefulProps = new Set(['_context']);
129
130
  function JSONifyNodeOneLevel(node) {
130
131
  const info = Object.create(null);
131
- for (const edge of node.references) {
132
+ iterateSelectedEdges(node, (edge) => {
132
133
  const key = JSONifyEdgeNameAndType(edge);
133
134
  info[key] = JSONifyNodeShallow(edge.toNode);
134
- }
135
+ return null;
136
+ });
135
137
  return info;
136
138
  }
137
139
  function JSONifyNodeShallow(node) {
138
140
  const info = Object.create(null);
139
- for (const edge of node.references) {
141
+ iterateSelectedEdges(node, (edge) => {
140
142
  const key = JSONifyEdgeNameAndType(edge);
141
143
  if (objectNodeUsefulProps.has(edge.name_or_index)) {
142
144
  info[key] = JSONifyNodeShallow(edge.toNode);
@@ -144,7 +146,8 @@ function JSONifyNodeShallow(node) {
144
146
  else {
145
147
  info[key] = JSONifyNodeInShort(edge.toNode);
146
148
  }
147
- }
149
+ return null;
150
+ });
148
151
  return info;
149
152
  }
150
153
  const fiberNodeUsefulProps = new Set([
@@ -154,15 +157,17 @@ const fiberNodeUsefulProps = new Set([
154
157
  ]);
155
158
  function JSONifyFiberNodeShallow(node) {
156
159
  const info = Object.create(null);
157
- for (const edge of node.references) {
160
+ iterateSelectedEdges(node, (edge) => {
158
161
  const key = JSONifyEdgeNameAndType(edge);
159
162
  if (fiberNodeUsefulProps.has(edge.name_or_index) &&
160
163
  Utils_1.default.isObjectNode(edge.toNode)) {
161
164
  info[key] = JSONifyNodeShallow(edge.toNode);
162
- continue;
163
165
  }
164
- info[key] = JSONifyNodeInShort(edge.toNode);
165
- }
166
+ else {
167
+ info[key] = JSONifyNodeInShort(edge.toNode);
168
+ }
169
+ return null;
170
+ });
166
171
  return info;
167
172
  }
168
173
  // calculate the summary of return chain of the FiberNode
@@ -187,7 +192,9 @@ function JSONifyFiberNodeReturnTrace(node, args, options) {
187
192
  }
188
193
  const parentInfo = getNodeNameInJSON(parent, args);
189
194
  key = `${key}: --return (property)---> ${parentInfo}`;
190
- const info = JSONifyFiberNodeShallow(parent);
195
+ const info = Config_1.default.includeObjectInfoInTraceReturnChain
196
+ ? JSONifyFiberNodeShallow(parent)
197
+ : Object.create(null);
191
198
  trace[key] = info;
192
199
  }
193
200
  return trace;
@@ -206,25 +213,27 @@ function JSONifyFiberNode(node, args, options) {
206
213
  propsOptions.forceJSONifyDepth = 1;
207
214
  }
208
215
  propsOptions.forceJSONifyDepth--;
209
- for (const edge of node.references) {
216
+ iterateSelectedEdges(node, (edge) => {
210
217
  const key = JSONifyEdgeNameAndType(edge);
211
218
  info[key] =
212
- propsOptions.forceJSONifyDepth >= 1
219
+ propsOptions.forceJSONifyDepth && propsOptions.forceJSONifyDepth >= 1
213
220
  ? JSONifyNode(edge.toNode, args, propsOptions)
214
221
  : JSONifyNodeInShort(edge.toNode);
215
- }
222
+ return null;
223
+ });
216
224
  return info;
217
225
  }
218
226
  function JSONifyClosure(node, args, options) {
219
227
  const info = Object.create(null);
220
- for (const edge of node.references) {
228
+ iterateSelectedEdges(node, (edge) => {
221
229
  if (edge.name_or_index === 'shared' ||
222
230
  edge.name_or_index === 'context' ||
223
231
  edge.name_or_index === 'displayName') {
224
232
  const key = filterJSONPropName(edge.name_or_index);
225
233
  info[key] = JSONifyNode(edge.toNode, args, options);
226
234
  }
227
- }
235
+ return null;
236
+ });
228
237
  return info;
229
238
  }
230
239
  function JSONifyNumberNode(node,
@@ -238,7 +247,7 @@ _options) {
238
247
  }
239
248
  function JSONifyCode(node, args, options) {
240
249
  const info = Object.create(null);
241
- for (const edge of node.references) {
250
+ iterateSelectedEdges(node, (edge) => {
242
251
  if (edge.name_or_index === 'name_or_scope_info' &&
243
252
  edge.toNode.name === '(function scope info)') {
244
253
  const key = 'variables with non-number values in closure scope chain';
@@ -251,14 +260,15 @@ function JSONifyCode(node, args, options) {
251
260
  const key = filterJSONPropName(edge.name_or_index);
252
261
  info[key] = JSONifyNode(edge.toNode, args, options);
253
262
  }
254
- }
263
+ return null;
264
+ });
255
265
  return info;
256
266
  }
257
267
  function JSONifyContext(node, args, options) {
258
268
  const info = Object.create(null);
259
269
  const key = 'variables in scope (used by nested closures)';
260
270
  const closure_vars = (info[key] = Object.create(null));
261
- for (const edge of node.references) {
271
+ iterateSelectedEdges(node, (edge) => {
262
272
  const key = filterJSONPropName(edge.name_or_index);
263
273
  if (edge.type === 'context') {
264
274
  closure_vars[key] = JSONifyNodeInShort(edge.toNode);
@@ -266,15 +276,27 @@ function JSONifyContext(node, args, options) {
266
276
  else if (edge.type === '') {
267
277
  info[key] = JSONifyNode(edge.toNode, args, options);
268
278
  }
269
- }
279
+ return null;
280
+ });
270
281
  return info;
271
282
  }
283
+ function iterateSelectedEdges(node, callback) {
284
+ let edgesProcessed = 0;
285
+ node.forEachReference((edge) => {
286
+ if (edge.type === 'internal') {
287
+ if (edge.name_or_index === 'map' || edge.is_index) {
288
+ return;
289
+ }
290
+ }
291
+ if (edgesProcessed++ > 100) {
292
+ return { stop: true };
293
+ }
294
+ return callback(edge);
295
+ });
296
+ }
272
297
  function JSONifyOrdinaryValue(node, args, options) {
273
298
  const info = Object.create(null);
274
- for (const edge of node.references) {
275
- if (edge.name_or_index === 'map' && edge.type === 'internal') {
276
- continue;
277
- }
299
+ iterateSelectedEdges(node, (edge) => {
278
300
  const key = JSONifyEdgeNameAndType(edge);
279
301
  const toNode = edge.toNode;
280
302
  const toNodeName = toNode.name;
@@ -293,7 +315,8 @@ function JSONifyOrdinaryValue(node, args, options) {
293
315
  else {
294
316
  info[key] = JSONifyNodeInShort(toNode);
295
317
  }
296
- }
318
+ return null;
319
+ });
297
320
  return info;
298
321
  }
299
322
  function JSONifyNode(node, args, options) {
@@ -32,7 +32,6 @@ export declare type AnyOptions = Record<string, unknown>;
32
32
  export declare type UnusedOptions = Record<string, never>;
33
33
  /** @internal */
34
34
  export declare type Command = [string, string[], AnyOptions];
35
- export declare type Predicator<T> = (node: T) => boolean;
36
35
  /** @internal */
37
36
  export declare type HeapNodeIdSet = Set<number>;
38
37
  /** @internal */
@@ -86,9 +85,26 @@ export declare type CLIArgs = {
86
85
  'local-puppeteer': boolean;
87
86
  'snapshot-dir': string;
88
87
  };
88
+ /**
89
+ * the predicate callback is used to decide if an
90
+ * entity of type `T` passes the predicate check.
91
+ * For more concrete examples on where it is used,
92
+ * check out {@link findAnyReference}, {@link findAnyReferrer},
93
+ * and {@link findReferrers}.
94
+ *
95
+ * @typeParam T - the type of the entity to be checked
96
+ * @param entity - the entity to be checked
97
+ * @returns whether the entity passes the predicate check
98
+ */
99
+ export declare type Predicator<T> = (entity: T) => boolean;
100
+ /**
101
+ * Data structure for holding cookies.
102
+ * For concrete example, check out {@link cookies}.
103
+ */
89
104
  export declare type Cookies = Array<{
90
105
  name: string;
91
106
  value: string;
107
+ domain?: string;
92
108
  }>;
93
109
  /** @internal */
94
110
  export interface IE2EScenarioSynthesizer {
@@ -119,6 +135,12 @@ export interface E2EScenarioSynthesizerConstructor {
119
135
  new (config: Config): IE2EScenarioSynthesizer;
120
136
  }
121
137
  /** @internal */
138
+ export interface IPackageInfo {
139
+ name: string;
140
+ version: string;
141
+ packageLocation?: string;
142
+ }
143
+ /** @internal */
122
144
  export interface IRunningMode {
123
145
  setConfig(config: Config): void;
124
146
  beforeRunning(visitPlan: IE2EScenarioVisitPlan): void;
@@ -281,9 +303,10 @@ export interface ILeakFilter {
281
303
  /**
282
304
  * Lifecycle function callback that is invoked initially once before calling any
283
305
  * leak filter function.
306
+ * For concrete example, check out {@link beforeLeakFilter}.
284
307
  *
285
- * @param snaphost - heap snapshot see {@link IHeapSnapshot}
286
- * @param leakedNodeIds - the set of leaked object (node) ids.
308
+ * @param snapshot heap snapshot see {@link IHeapSnapshot}
309
+ * @param leakedNodeIds the set of leaked object (node) ids.
287
310
  */
288
311
  export declare type InitLeakFilterCallback = (snapshot: IHeapSnapshot, leakedNodeIds: HeapNodeIdSet) => void;
289
312
  /**
@@ -292,6 +315,8 @@ export declare type InitLeakFilterCallback = (snapshot: IHeapSnapshot, leakedNod
292
315
  * allocated but not released from the target interaction
293
316
  * in the heap snapshot.
294
317
  *
318
+ * For concrete examples, check out {@link leakFilter}.
319
+ *
295
320
  * @param node - the node that is kept alive in the memory in the heap snapshot
296
321
  * @param snapshot - the snapshot of target interaction
297
322
  * @param leakedNodeIds - the set of leaked node ids
@@ -310,6 +335,11 @@ export declare type LeakFilterCallback = (node: IHeapNode, snapshot: IHeapSnapsh
310
335
  /**
311
336
  * The callback defines browser interactions which are
312
337
  * used by memlab to interact with the web app under test.
338
+ * For concrete examples, check out {@link action} or {@link back}.
339
+ *
340
+ * @param page the puppeteer [`Page`](https://pptr.dev/api/puppeteer.page)
341
+ * object, which provides APIs to interact with the web browser
342
+ * @returns no return value
313
343
  */
314
344
  export declare type InteractionsCallback = (page: Page, args?: OperationArgs) => Promise<void>;
315
345
  /**
@@ -359,7 +389,11 @@ export interface IScenario {
359
389
  * a list of `<name, value, domain>` tuples.
360
390
  *
361
391
  * **Note**: please make sure that you provide the correct `domain` field for
362
- * the cookies tuples.
392
+ * the cookies tuples. If no `domain` field is specified, memlab will try
393
+ * to fill in a domain based on the `url` callback.
394
+ * For example, when the `domain` field is absent,
395
+ * memlab will auto fill in `.facebook.com` as domain based
396
+ * on the initial page load's url: `https://www.facebook.com/`.
363
397
  *
364
398
  * @returns cookie list
365
399
  * * **Examples**:
@@ -374,6 +408,8 @@ export interface IScenario {
374
408
  * // ...
375
409
  * ],
376
410
  * };
411
+ *
412
+ * module.exports = scenario;
377
413
  * ```
378
414
  */
379
415
  cookies?: () => Cookies;
@@ -386,6 +422,8 @@ export interface IScenario {
386
422
  * const scenario = {
387
423
  * url: () => 'https://www.npmjs.com/',
388
424
  * };
425
+ *
426
+ * module.exports = scenario;
389
427
  * ```
390
428
  * If a test scenario only specifies the `url` callback (without the `action`
391
429
  * callback), memlab will try to detect memory leaks from the initial page
@@ -414,6 +452,8 @@ export interface IScenario {
414
452
  * await page.click('a[href="/back"]');
415
453
  * },
416
454
  * }
455
+ *
456
+ * module.exports = scenario;
417
457
  * ```
418
458
  * Note: always clean up external puppeteer references to JS objects
419
459
  * in the browser context.
@@ -431,6 +471,8 @@ export interface IScenario {
431
471
  * },
432
472
  * back: async (page) => ... ,
433
473
  * }
474
+ *
475
+ * module.exports = scenario;
434
476
  ```
435
477
  */
436
478
  action?: InteractionsCallback;
@@ -438,6 +480,10 @@ export interface IScenario {
438
480
  * `back` is the callback function that specifies how memlab should
439
481
  * back/revert the `action` callback. Think of it as an undo action.
440
482
  *
483
+ * * **Parameters**:
484
+ * * page: `Page` | the puppeteer [`Page`](https://pptr.dev/api/puppeteer.page)
485
+ * object, which provides APIs to interact with the web browser
486
+ *
441
487
  * * **Examples**:
442
488
  * ```typescript
443
489
  * const scenario = {
@@ -661,7 +707,12 @@ export interface IDataBuilder {
661
707
  }
662
708
  /**
663
709
  * Callback function to provide if the page is loaded.
710
+ * For concrete example, check out {@link isPageLoaded}.
664
711
  * @param page - puppeteer's [Page](https://pptr.dev/api/puppeteer.page/) object.
712
+ * @returns a boolean value. If it returns `true`, memlab will consider
713
+ * the navigation complete; if it returns `false`, memlab will keep calling
714
+ * this callback until it returns `true`. This is an async callback, so you can
715
+ * also `await` and return `true` once some async logic is resolved.
665
716
  */
666
717
  export declare type CheckPageLoadCallback = (page: Page) => Promise<boolean>;
667
718
  /** @internal */
@@ -711,16 +762,42 @@ export declare type E2EStepInfo = IE2EStepBasic & {
711
762
  delay?: number;
712
763
  metrics: Record<string, number>;
713
764
  };
714
- /** @internal */
765
+ /**
766
+ * This data structure contains the input configuration for the browser and
767
+ * output data from the browser. You can retrieve the instance of this type
768
+ * through {@link RunMetaInfo}.
769
+ */
715
770
  export interface IBrowserInfo {
771
+ /**
772
+ * browser version
773
+ */
716
774
  _browserVersion: string;
775
+ /**
776
+ * configuration for puppeteer
777
+ */
717
778
  _puppeteerConfig: LaunchOptions;
779
+ /**
780
+ * all web console output
781
+ */
718
782
  _consoleMessages: string[];
719
783
  }
784
+ /**
785
+ * This data structure holds the information about memlab run.
786
+ * You can retrieve the instance of this type through {@link getRunMetaInfo}.
787
+ */
720
788
  export declare type RunMetaInfo = {
789
+ /** @internal */
721
790
  app: string;
791
+ /** @internal */
722
792
  interaction: string;
793
+ /**
794
+ * type of the memlab run
795
+ */
723
796
  type: string;
797
+ /**
798
+ * input configuration for the browser and
799
+ * output data from the browser
800
+ */
724
801
  browserInfo: IBrowserInfo;
725
802
  };
726
803
  /**
@@ -1135,6 +1212,13 @@ export interface IHeapNodeBasic {
1135
1212
  */
1136
1213
  id: number;
1137
1214
  }
1215
+ /**
1216
+ * Executes a provided callback once for each JavaScript reference.
1217
+ * For concrete examples, check out {@link forEachReference}
1218
+ * or {@link forEachReferrer}.
1219
+ * @param callback the callback for each JavaScript reference from a collection
1220
+ * @returns nothing, or optionally an object of the form `{stop: boolean}`
1221
+ */
1138
1222
  export declare type EdgeIterationCallback = (edge: IHeapEdge) => Optional<{
1139
1223
  stop: boolean;
1140
1224
  }>;
@@ -20,6 +20,7 @@ const Utils_1 = __importDefault(require("../lib/Utils"));
20
20
  const TraceElement_1 = require("./TraceElement");
21
21
  const TraceSimilarityStrategy_1 = __importDefault(require("./strategies/TraceSimilarityStrategy"));
22
22
  const TraceAsClusterStrategy_1 = __importDefault(require("./strategies/TraceAsClusterStrategy"));
23
+ const MLTraceSimilarityStrategy_1 = __importDefault(require("./strategies/MLTraceSimilarityStrategy"));
23
24
  // sync up with html/intern/js/webspeed/memlab/lib/LeakCluster.js
24
25
  class NormalizedTrace {
25
26
  constructor(p = null, snapshot = null) {
@@ -157,7 +158,11 @@ class NormalizedTrace {
157
158
  };
158
159
  }
159
160
  static clusterLeakTraces(leakTraces) {
160
- const { allClusters } = NormalizedTrace.diffTraces(leakTraces, []);
161
+ const { allClusters } = NormalizedTrace.diffTraces(leakTraces, [], {
162
+ strategy: Config_1.default.isMLClustering
163
+ ? new MLTraceSimilarityStrategy_1.default()
164
+ : undefined,
165
+ });
161
166
  const lastNodeFromTrace = (trace) => trace[trace.length - 1];
162
167
  const labaledLeakTraces = allClusters.reduce((acc, bucket) => {
163
168
  const lastNodeFromFirstTrace = lastNodeFromTrace(bucket[0]);
@@ -0,0 +1,15 @@
1
+ /**
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ *
7
+ * @emails oncall+ws_labs
8
+ * @format
9
+ */
10
+ import type { IClusterStrategy, LeakTrace, TraceDiff } from '../../lib/Types';
11
+ export default class MLTraceSimilarityStrategy implements IClusterStrategy {
12
+ diffTraces(newLeakTraces: LeakTrace[]): TraceDiff;
13
+ traceToDoc(trace: LeakTrace): string;
14
+ }
15
+ //# sourceMappingURL=MLTraceSimilarityStrategy.d.ts.map
@@ -0,0 +1,61 @@
1
+ "use strict";
2
+ /**
3
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
4
+ *
5
+ * This source code is licensed under the MIT license found in the
6
+ * LICENSE file in the root directory of this source tree.
7
+ *
8
+ * @emails oncall+ws_labs
9
+ * @format
10
+ */
11
+ var __importDefault = (this && this.__importDefault) || function (mod) {
12
+ return (mod && mod.__esModule) ? mod : { "default": mod };
13
+ };
14
+ Object.defineProperty(exports, "__esModule", { value: true });
15
+ const Config_1 = __importDefault(require("../../lib/Config"));
16
+ const DistanceMatrix_1 = require("./machine-learning/DistanceMatrix");
17
+ const HAC_1 = require("./machine-learning/HAC");
18
+ const TfidfVectorizer_1 = require("./machine-learning/TfidfVectorizer");
19
/**
 * Cluster strategy that groups leak traces by textual similarity: every trace
 * is serialized to a document, the documents are turned into TF-IDF vectors,
 * pairwise distances are computed, and the result is fed to hierarchical
 * clustering.
 */
class MLTraceSimilarityStrategy {
    /**
     * Groups the given leak traces into clusters.
     *
     * @param newLeakTraces leak traces to cluster
     * @returns a trace diff whose `allClusters` holds one array per cluster.
     * Each array starts with the cluster's representative trace followed by
     * every member of the cluster (so the representative is listed twice).
     * `staleClusters` and `clustersToAdd` are always empty.
     */
    diffTraces(newLeakTraces) {
        // Nothing to cluster: return an empty diff instead of running the
        // pipeline, which would otherwise emit a cluster of `undefined`s.
        if (newLeakTraces.length === 0) {
            return { allClusters: [], staleClusters: [], clustersToAdd: [] };
        }
        const rawDocuments = newLeakTraces.map(trace => this.traceToDoc(trace));
        const vectorizer = new TfidfVectorizer_1.TfidfVectorizer({ rawDocuments });
        const tfidfs = vectorizer.computeTfidfs();
        const dmatrix = (0, DistanceMatrix_1.distance)(tfidfs);
        // result[i] is the index of the trace representing trace i's cluster
        const result = (0, HAC_1.cluster)(rawDocuments.length, dmatrix, Config_1.default.mlClusteringLinkageMaxDistance);
        const clustersByRepresentative = new Map();
        for (let i = 0; i < result.length; i++) {
            const repTrace = newLeakTraces[result[i]];
            let members = clustersByRepresentative.get(repTrace);
            if (members === undefined) {
                // the representative always leads its cluster
                members = [repTrace];
                clustersByRepresentative.set(repTrace, members);
            }
            members.push(newLeakTraces[i]);
        }
        return {
            allClusters: Array.from(clustersByRepresentative.values()),
            staleClusters: [],
            clustersToAdd: [],
        };
    }
    /**
     * Serializes a trace into a single document string.
     *
     * Each element contributes one token: a node's `name` or an edge's
     * `name_or_index`. Spaces inside a token become underscores so the token
     * survives whitespace tokenization, and digits are removed. An empty name
     * becomes `_null_`; a name consisting only of digits becomes `_number_`.
     *
     * @param trace the leak trace to serialize
     * @returns the tokens joined by single spaces
     */
    traceToDoc(trace) {
        const tokens = [];
        for (const element of trace) {
            let name = element.kind === 'node'
                ? String(element.name)
                : String(element.name_or_index);
            if (name === '') {
                name = '_null_';
            }
            name = name.replace(/ /g, '_').replace(/\d/g, '');
            if (name === '') {
                name = '_number_';
            }
            tokens.push(name);
        }
        return tokens.join(' ');
    }
}
61
+ exports.default = MLTraceSimilarityStrategy;
@@ -0,0 +1,11 @@
1
+ /**
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ *
7
+ * @emails oncall+ws_labs
8
+ * @format
9
+ */
10
+ export declare const distance: (tfidfs: Record<string, number>[]) => Float32Array; // Returns one distance per unordered pair of input vectors (i < j), laid out row by row; the array has n * (n - 1) / 2 entries.
11
+ //# sourceMappingURL=DistanceMatrix.d.ts.map
@@ -0,0 +1,54 @@
1
+ "use strict";
2
+ /**
3
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
4
+ *
5
+ * This source code is licensed under the MIT license found in the
6
+ * LICENSE file in the root directory of this source tree.
7
+ *
8
+ * @emails oncall+ws_labs
9
+ * @format
10
+ */
11
+ Object.defineProperty(exports, "__esModule", { value: true });
12
+ exports.distance = void 0;
13
// Memo of each vector's term keys, keyed by the vector's position in the
// input array. Only valid for the duration of one `distance` call.
const cache = new Map();
/**
 * Lists the terms that carry a non-zero weight in both vector `i` and
 * vector `j`. Walks the shorter key list and probes the other vector.
 *
 * @param tfidfs all TF-IDF vectors
 * @param i index of the first vector
 * @param j index of the second vector
 * @returns the shared term keys
 */
const buildIntersection = (tfidfs, i, j) => {
    let keysOfI = cache.get(i);
    if (keysOfI === undefined) {
        keysOfI = Object.keys(tfidfs[i]);
        cache.set(i, keysOfI);
    }
    let keysOfJ = cache.get(j);
    if (keysOfJ === undefined) {
        keysOfJ = Object.keys(tfidfs[j]);
        cache.set(j, keysOfJ);
    }
    const [shorterKeys, otherVector] = keysOfI.length > keysOfJ.length
        ? [keysOfJ, tfidfs[i]]
        : [keysOfI, tfidfs[j]];
    return shorterKeys.filter(key => otherVector[key]);
};
/**
 * Computes the pairwise cosine distance (1 - cosine similarity) between
 * TF-IDF vectors.
 *
 * @param tfidfs sparse vectors, each mapping a term key to its weight
 * @returns a condensed distance matrix: one entry per pair (i, j) with
 * i < j, stored row by row, n * (n - 1) / 2 entries in total. A pair that
 * involves an all-zero vector gets distance 1.
 */
const distance = (tfidfs) => {
    const n = tfidfs.length;
    const distances = new Float32Array((n * (n - 1)) / 2);
    // Euclidean norm of every vector, computed once up front.
    const norms = tfidfs.map(vector => Math.sqrt(Object.values(vector).reduce((sum, v) => sum + v * v, 0)));
    // Keys are cached by position, so entries left over from another input
    // (e.g. after an exception) must never be reused.
    cache.clear();
    try {
        let distIdx = 0;
        for (let i = 0; i < n; i++) {
            const a = tfidfs[i];
            for (let j = i + 1; j < n; j++) {
                const b = tfidfs[j];
                const intersection = buildIntersection(tfidfs, i, j);
                const dotProdOfCommons = intersection.reduce((sum, key) => sum + a[key] * b[key], 0);
                // TODO make it pluggable to use other distance measures like euclidean, manhattan
                // cosine similarity = (a . b) / (|a| * |b|)
                const normProduct = norms[i] * norms[j];
                const cosineDistance = normProduct === 0 ? 1 : 1 - dotProdOfCommons / normProduct;
                distances[distIdx] = cosineDistance;
                distIdx++;
            }
        }
    }
    finally {
        cache.clear();
    }
    return distances;
};
54
+ exports.distance = distance;
@@ -0,0 +1,17 @@
1
+ /**
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ *
7
+ * @emails oncall+ws_labs
8
+ * @format
9
+ */
10
+ /**
11
+ * Groups documents into clusters based on their pairwise distances.
12
+ * @param nDocs number of docs
13
+ * @param condensedDistanceMatrix condensed distance matrix
+ * @param maxDistanceThreshold pairs farther apart than this are not merged
14
+ * @returns labels - for each doc, the id of the doc that roots its cluster
15
+ */
16
+ export declare const cluster: (nDocs: number, condensedDistanceMatrix: Float32Array, maxDistanceThreshold: number) => number[];
17
+ //# sourceMappingURL=HAC.d.ts.map
@@ -0,0 +1,124 @@
1
+ "use strict";
2
+ /**
3
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
4
+ *
5
+ * This source code is licensed under the MIT license found in the
6
+ * LICENSE file in the root directory of this source tree.
7
+ *
8
+ * @emails oncall+ws_labs
9
+ * @format
10
+ */
11
+ Object.defineProperty(exports, "__esModule", { value: true });
12
+ exports.cluster = void 0;
13
/**
 * Maps a pair of document indices to its slot in the condensed distance
 * matrix. The argument order does not matter.
 *
 * @param n number of documents
 * @param i first document index
 * @param j second document index
 * @returns offset of the (i, j) distance in the condensed array
 */
const condensedIndex = (n, i, j) => {
    // Only pairs with the smaller index first are stored.
    const [row, col] = i > j ? [j, i] : [i, j];
    // Rows 0..row-1 contribute (n - 1) + (n - 2) + ... + (n - row) entries;
    // within the row, column `col` sits at offset (col - row - 1).
    return n * row - (row * (row + 1)) / 2 + (col - row - 1);
};
21
/**
 * Follows parent links in `array` starting at `idx` until it reaches an
 * entry that points to itself, and returns that entry's index.
 *
 * @param array parent links, where array[k] is the parent of k
 * @param idx index to start from
 * @returns index of the self-referencing root
 */
const getRootLabel = (array, idx) => {
    for (let cursor = idx;; cursor = array[cursor]) {
        if (array[cursor] === cursor) {
            return cursor;
        }
    }
};
28
+ /**
29
+ *
30
+ * @param {*} nDocs number of docs
31
+ * @param {*} D condenced distance matrix
32
+ * @returns labels - list of doc ids as clusters
33
+ */
34
+ const cluster = (nDocs, condensedDistanceMatrix, maxDistanceThreshold) => {
35
+ if (nDocs <= 1)
36
+ return [0];
37
+ const condencedDistanceMatrixCopy = new Float32Array(condensedDistanceMatrix);
38
+ const sizeOfClusters = new Uint32Array(nDocs).fill(1);
39
+ let chainLength = 0;
40
+ let clusterChain = [];
41
+ let traceAIdx = -1;
42
+ let traceBIdx = -1;
43
+ let currentMin = Number.MAX_SAFE_INTEGER;
44
+ let distanceBetweenTraces;
45
+ const labels = Array(nDocs)
46
+ .fill(0)
47
+ .map((_, idx) => idx);
48
+ for (let k = 0; k < nDocs - 1; k++) {
49
+ traceBIdx = -1;
50
+ if (chainLength === 0) {
51
+ for (let i = 0; i < nDocs; i++) {
52
+ if (sizeOfClusters[i] > 0) {
53
+ clusterChain[0] = i;
54
+ chainLength = 1;
55
+ break;
56
+ }
57
+ }
58
+ }
59
+ while (chainLength > 0) {
60
+ traceAIdx = clusterChain[chainLength - 1];
61
+ if (chainLength > 1) {
62
+ traceBIdx = clusterChain[chainLength - 2];
63
+ currentMin =
64
+ condencedDistanceMatrixCopy[condensedIndex(nDocs, traceAIdx, traceBIdx)];
65
+ }
66
+ else {
67
+ currentMin = Number.MAX_SAFE_INTEGER;
68
+ }
69
+ for (let i = 0; i < nDocs; i++) {
70
+ if (sizeOfClusters[i] == 0 || traceAIdx == i) {
71
+ continue;
72
+ }
73
+ distanceBetweenTraces =
74
+ condencedDistanceMatrixCopy[condensedIndex(nDocs, traceAIdx, i)];
75
+ if (distanceBetweenTraces < currentMin) {
76
+ currentMin = distanceBetweenTraces;
77
+ traceBIdx = i;
78
+ }
79
+ }
80
+ // to make sure we found a two mutual traces whose distance is smallest.
81
+ if (chainLength > 1 &&
82
+ traceBIdx !== -1 &&
83
+ traceBIdx === clusterChain[chainLength - 2]) {
84
+ break;
85
+ }
86
+ clusterChain[chainLength] = traceBIdx;
87
+ chainLength = chainLength + 1;
88
+ }
89
+ clusterChain = [];
90
+ chainLength = 0;
91
+ if (currentMin > maxDistanceThreshold) {
92
+ sizeOfClusters[traceAIdx] = 0;
93
+ sizeOfClusters[traceBIdx] = 0;
94
+ continue;
95
+ }
96
+ if (traceAIdx === -1 || traceBIdx === -1) {
97
+ continue;
98
+ }
99
+ if (traceAIdx > traceBIdx) {
100
+ [traceAIdx, traceBIdx] = [traceBIdx, traceAIdx];
101
+ }
102
+ const nx = sizeOfClusters[traceAIdx];
103
+ const ny = sizeOfClusters[traceBIdx];
104
+ labels[traceAIdx] = traceBIdx;
105
+ sizeOfClusters[traceAIdx] = 0;
106
+ sizeOfClusters[traceBIdx] = nx + ny;
107
+ for (let i = 0; i < nDocs; i++) {
108
+ const ni = sizeOfClusters[i];
109
+ if (ni === 0 || i === traceBIdx) {
110
+ continue;
111
+ }
112
+ const d_xi = condencedDistanceMatrixCopy[condensedIndex(nDocs, i, traceAIdx)];
113
+ const d_yi = condencedDistanceMatrixCopy[condensedIndex(nDocs, i, traceBIdx)];
114
+ const size_x = nx;
115
+ const size_y = ny;
116
+ // TODO make it generic to support other linkage methods like complete, weighted etc...
117
+ const updatedDist = (size_x * d_xi + size_y * d_yi) / (size_x + size_y);
118
+ condencedDistanceMatrixCopy[condensedIndex(nDocs, i, traceBIdx)] =
119
+ updatedDist;
120
+ }
121
+ }
122
+ return labels.map((_, idx) => getRootLabel(labels, idx));
123
+ };
124
+ exports.cluster = cluster;
@@ -0,0 +1,11 @@
1
+ /**
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ *
7
+ * @emails oncall+ws_labs
8
+ * @format
9
+ */
10
+ export declare function nGram(n: number, terms: string[]): string[]; // Returns every run of n consecutive terms, each run joined with a single space; empty when terms has fewer than n entries.
11
+ //# sourceMappingURL=Ngram.d.ts.map
@@ -0,0 +1,22 @@
1
+ "use strict";
2
+ /**
3
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
4
+ *
5
+ * This source code is licensed under the MIT license found in the
6
+ * LICENSE file in the root directory of this source tree.
7
+ *
8
+ * @emails oncall+ws_labs
9
+ * @format
10
+ */
11
+ Object.defineProperty(exports, "__esModule", { value: true });
12
+ exports.nGram = void 0;
13
/**
 * Builds all n-grams of a term sequence.
 *
 * @param n number of consecutive terms per n-gram
 * @param terms the term sequence
 * @returns each window of `n` adjacent terms joined by a single space, in
 * order of appearance; empty when `terms` is shorter than `n`
 */
function nGram(n, terms) {
    const lastStart = terms.length - n;
    const windows = [];
    for (let start = 0; start <= lastStart; start++) {
        windows.push(terms.slice(start, start + n).join(' '));
    }
    return windows;
}
22
+ exports.nGram = nGram;
@@ -0,0 +1,38 @@
1
+ /**
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ *
4
+ * This source code is licensed under the MIT license found in the
5
+ * LICENSE file in the root directory of this source tree.
6
+ *
7
+ * @emails oncall+ws_labs
8
+ * @format
9
+ */
10
+ interface TfidfVectorizerProps { // Constructor options for TfidfVectorizer.
11
+ rawDocuments: string[]; // Documents to vectorize; they are tokenized by splitting on single spaces.
12
+ maxDF?: number; // Optional document-frequency ratio above which a term is dropped; the implementation defaults it to 0.8.
13
+ }
14
+ export declare class TfidfVectorizer { // Turns raw documents into L2-normalized TF-IDF vectors.
15
+ rawDocuments: string[]; // Input documents as passed to the constructor.
16
+ vocabulary: Record<string, string>; // Maps each term to its vocabulary index (stored as a string).
17
+ documentFrequency: Record<string, number>; // Maps a vocabulary index to the number of documents containing that term.
18
+ maxDF: number; // Ratio of the document count used as the document-frequency cut-off in limit().
19
+ documents: Record<string, number>[]; // Per-document term counts keyed by vocabulary index.
20
+ tfidfs: Record<string, number>[]; // Result of the most recent computeTfidfs() call.
21
+ constructor({ rawDocuments, maxDF }: TfidfVectorizerProps);
22
+ computeTfidfs(): Record<string, number>[]; // Runs tokenize, buildVocabulary, processDocuments, limit, smooth and buildTfidfs in that order.
23
+ tokenize(text: string): string[]; // Returns the space-separated terms plus their 2-grams and 3-grams.
24
+ buildVocabulary(tokenizedDocuments: string[][]): Record<string, string>; // Assigns consecutive indices to terms in order of first appearance.
25
+ processDocuments(tokenizedDocuments: string[][]): void; // Fills `documents` and `documentFrequency` from the tokenized input.
26
+ limit(): void; // Removes terms whose document frequency exceeds floor(documents.length * maxDF).
27
+ /**
28
+ * Smooth idf weights by adding 1 to document frequencies (DF), as if an extra
29
+ * document was seen containing every term in the collection exactly once.
30
+ * This prevents zero divisions.
31
+ * */
32
+ smooth(): void;
33
+ buildTfidfs(): Record<string, number>[]; // Builds one normalized vector per real document; the smoothing document is skipped.
34
+ tf(vocabIdx: string, document: Record<string, number>): number; // 1 + ln(count of the term in the document).
35
+ idf(vocabIdx: string): number; // 1 + ln(documents.length / documentFrequency[vocabIdx]).
36
+ }
37
+ export {};
38
+ //# sourceMappingURL=TfidfVectorizer.d.ts.map
@@ -0,0 +1,140 @@
1
+ "use strict";
2
+ /**
3
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
4
+ *
5
+ * This source code is licensed under the MIT license found in the
6
+ * LICENSE file in the root directory of this source tree.
7
+ *
8
+ * @emails oncall+ws_labs
9
+ * @format
10
+ */
11
+ Object.defineProperty(exports, "__esModule", { value: true });
12
+ exports.TfidfVectorizer = void 0;
13
+ const Ngram_1 = require("./Ngram");
14
+ const SMOOTHING_KEY = '__smoothObjectKey';
15
+ const VOCAB_IDX_FOR_DOC_WITH_HIGH_DF = '-1';
16
/**
 * Turns raw documents into L2-normalized TF-IDF vectors.
 *
 * Documents are split on single spaces and expanded with 2-grams and
 * 3-grams. Terms that occur in more than `maxDF` of the documents are
 * dropped, and document frequencies are smoothed with one virtual document.
 */
class TfidfVectorizer {
    /**
     * @param props.rawDocuments documents to vectorize
     * @param props.maxDF ratio of the document count above which a term is
     * considered too common and dropped (default 0.8)
     */
    constructor({ rawDocuments, maxDF = 0.8 }) {
        // term -> vocabulary index (as string)
        this.vocabulary = Object.create(null);
        // vocabulary index -> number of documents containing the term
        this.documentFrequency = Object.create(null);
        // per-document term counts keyed by vocabulary index
        this.documents = [];
        this.tfidfs = [];
        this.rawDocuments = rawDocuments;
        this.maxDF = maxDF;
    }
    /**
     * Runs the whole pipeline on `rawDocuments`.
     *
     * @returns one sparse vector (vocabulary index -> weight) per document
     */
    computeTfidfs() {
        // Start from a clean slate so a repeated call does not count every
        // document (and the smoothing document) a second time.
        this.documents = [];
        this.documentFrequency = Object.create(null);
        const tokenizedDocuments = this.rawDocuments.map(text => this.tokenize(text));
        this.vocabulary = this.buildVocabulary(tokenizedDocuments);
        this.processDocuments(tokenizedDocuments);
        this.limit();
        this.smooth();
        this.tfidfs = this.buildTfidfs();
        return this.tfidfs;
    }
    /**
     * @param text a space-separated document
     * @returns its single terms followed by its 2-grams and 3-grams
     */
    tokenize(text) {
        const terms = text.split(' ');
        return [...terms, ...(0, Ngram_1.nGram)(2, terms), ...(0, Ngram_1.nGram)(3, terms)];
    }
    /**
     * Assigns consecutive indices to terms in order of first appearance.
     *
     * @param tokenizedDocuments term lists, one per document
     * @returns prototype-less map from term to its index (as string)
     */
    buildVocabulary(tokenizedDocuments) {
        let vocabIdx = 0;
        const vocabulary = Object.create(null);
        for (const doc of tokenizedDocuments) {
            for (const term of doc) {
                const key = String(term);
                if (!vocabulary[key]) {
                    vocabulary[key] = String(vocabIdx);
                    vocabIdx++;
                }
            }
        }
        return vocabulary;
    }
    /**
     * Counts terms per document and, for each term, the number of documents
     * containing it. Appends to `documents` and updates `documentFrequency`.
     *
     * @param tokenizedDocuments term lists, one per document
     */
    processDocuments(tokenizedDocuments) {
        for (const terms of tokenizedDocuments) {
            const document = {};
            for (const term of terms) {
                const vocabIdx = this.vocabulary[term];
                if (document[vocabIdx]) {
                    document[vocabIdx] += 1;
                    continue;
                }
                // first occurrence in this document: bump document frequency
                this.documentFrequency[vocabIdx] =
                    (this.documentFrequency[vocabIdx] || 0) + 1;
                document[vocabIdx] = 1;
            }
            this.documents.push(document);
        }
    }
    /**
     * Drops every term whose document frequency exceeds
     * floor(documents.length * maxDF) from the documents, from
     * `documentFrequency` and from `vocabulary`.
     */
    limit() {
        const nMaxDF = Math.floor(this.documents.length * this.maxDF);
        const droppedIdxs = new Set();
        for (const doc of this.documents) {
            for (const vocabIdx of Object.keys(doc)) {
                if (this.documentFrequency[vocabIdx] > nMaxDF) {
                    delete doc[vocabIdx];
                    droppedIdxs.add(vocabIdx);
                }
            }
        }
        if (droppedIdxs.size === 0) {
            return;
        }
        droppedIdxs.forEach(vocabIdx => {
            delete this.documentFrequency[vocabIdx];
        });
        // `vocabulary` is keyed by term, not by index, so the entries to drop
        // have to be found through their index value.
        for (const term of Object.keys(this.vocabulary)) {
            if (droppedIdxs.has(this.vocabulary[term])) {
                delete this.vocabulary[term];
            }
        }
    }
    /**
     * Smooth idf weights by adding 1 to document frequencies (DF), as if an extra
     * document was seen containing every term in the collection exactly once.
     * This prevents zero divisions.
     * */
    smooth() {
        // for each vocabulary
        for (const vocabIdx of Object.values(this.vocabulary)) {
            this.documentFrequency[vocabIdx] = this.documentFrequency[vocabIdx] + 1;
        }
        // marker entry standing in for the virtual document
        this.documents.push({ [SMOOTHING_KEY]: 1 });
    }
    /**
     * Builds one L2-normalized TF-IDF vector per real document. The smoothing
     * document is skipped.
     *
     * @returns sparse vectors keyed by vocabulary index
     */
    buildTfidfs() {
        const tfidfs = [];
        for (const document of this.documents) {
            const vocabIdxs = Object.keys(document);
            // this means all the terms in the document are the terms
            // that have high document frequency.
            // This will make all the docs with high DF to be clustered together.
            if (vocabIdxs.length === 0) {
                tfidfs.push({ [VOCAB_IDX_FOR_DOC_WITH_HIGH_DF]: 1 });
                continue;
            }
            if (document[SMOOTHING_KEY]) {
                continue;
            }
            const weights = vocabIdxs.map(vocabIdx => [
                vocabIdx,
                this.tf(vocabIdx, document) * this.idf(vocabIdx),
            ]);
            // Euclidean length of the raw weight vector
            const norm = Math.sqrt(weights.reduce((sum, [, weight]) => sum + weight * weight, 0));
            const normalized = {};
            for (const [vocabIdx, weight] of weights) {
                normalized[vocabIdx] = weight / norm;
            }
            tfidfs.push(normalized);
        }
        return tfidfs;
    }
    /**
     * @param vocabIdx vocabulary index of the term
     * @param document term counts of one document
     * @returns sublinear term frequency: 1 + ln(count)
     */
    tf(vocabIdx, document) {
        return 1 + Math.log(document[vocabIdx]);
    }
    /**
     * @param vocabIdx vocabulary index of the term
     * @returns 1 + ln(documents.length / document frequency of the term)
     */
    idf(vocabIdx) {
        return (1 + Math.log(this.documents.length / this.documentFrequency[vocabIdx]));
    }
}
140
+ exports.TfidfVectorizer = TfidfVectorizer;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@memlab/core",
3
- "version": "1.1.3",
3
+ "version": "1.1.6",
4
4
  "license": "MIT",
5
5
  "description": "memlab core libraries",
6
6
  "author": "Liang Gong <lgong@fb.com>",