@memlab/core 1.1.4 → 1.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/__tests__/parser/HeapParser.test.js +2 -2
- package/dist/__tests__/parser/NodeHeap.test.js +5 -5
- package/dist/__tests__/parser/StringNode.test.js +1 -1
- package/dist/__tests__/parser/traverse/HeapNodeTraverse.test.js +2 -2
- package/dist/index.d.ts +5 -1
- package/dist/index.js +22 -2
- package/dist/lib/Config.d.ts +16 -9
- package/dist/lib/Config.js +15 -0
- package/dist/lib/FileManager.js +4 -2
- package/dist/lib/HeapAnalyzer.js +25 -9
- package/dist/lib/NodeHeap.d.ts +52 -9
- package/dist/lib/NodeHeap.js +72 -21
- package/dist/lib/PackageInfoLoader.d.ts +7 -0
- package/dist/lib/PackageInfoLoader.js +66 -0
- package/dist/lib/Serializer.js +48 -25
- package/dist/lib/Types.d.ts +119 -35
- package/dist/lib/Utils.js +24 -9
- package/dist/lib/heap-data/HeapSnapshot.d.ts +1 -0
- package/dist/lib/heap-data/HeapSnapshot.js +3 -30
- package/dist/lib/heap-data/HeapStringNode.js +2 -0
- package/dist/lib/heap-data/MemLabTagStore.d.ts +23 -0
- package/dist/lib/heap-data/MemLabTagStore.js +110 -0
- package/dist/trace-cluster/TraceBucket.js +6 -1
- package/dist/trace-cluster/strategies/MLTraceSimilarityStrategy.d.ts +15 -0
- package/dist/trace-cluster/strategies/MLTraceSimilarityStrategy.js +61 -0
- package/dist/trace-cluster/strategies/machine-learning/DistanceMatrix.d.ts +11 -0
- package/dist/trace-cluster/strategies/machine-learning/DistanceMatrix.js +54 -0
- package/dist/trace-cluster/strategies/machine-learning/HAC.d.ts +17 -0
- package/dist/trace-cluster/strategies/machine-learning/HAC.js +122 -0
- package/dist/trace-cluster/strategies/machine-learning/Ngram.d.ts +11 -0
- package/dist/trace-cluster/strategies/machine-learning/Ngram.js +22 -0
- package/dist/trace-cluster/strategies/machine-learning/TfidfVectorizer.d.ts +38 -0
- package/dist/trace-cluster/strategies/machine-learning/TfidfVectorizer.js +144 -0
- package/package.json +1 -1
|
@@ -18,11 +18,13 @@ const Console_1 = __importDefault(require("../Console"));
|
|
|
18
18
|
const HeapNode_1 = __importDefault(require("./HeapNode"));
|
|
19
19
|
const HeapEdge_1 = __importDefault(require("./HeapEdge"));
|
|
20
20
|
const HeapUtils_1 = require("./HeapUtils");
|
|
21
|
+
const MemLabTagStore_1 = __importDefault(require("./MemLabTagStore"));
|
|
21
22
|
const EMPTY_UINT8_ARRAY = new Uint8Array(0);
|
|
22
23
|
const EMPTY_UINT32_ARRAY = new Uint32Array(0);
|
|
23
24
|
class HeapSnapshot {
|
|
24
25
|
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
|
25
26
|
constructor(snapshot, _options = {}) {
|
|
27
|
+
this.isProcessed = false;
|
|
26
28
|
this._nodeCount = -1;
|
|
27
29
|
this._edgeCount = -1;
|
|
28
30
|
this._nodeId2NodeIdx = {};
|
|
@@ -158,36 +160,7 @@ class HeapSnapshot {
|
|
|
158
160
|
return detected;
|
|
159
161
|
}
|
|
160
162
|
hasObjectWithTag(tag) {
|
|
161
|
-
|
|
162
|
-
let tagStore = null;
|
|
163
|
-
this.nodes.forEach((node) => {
|
|
164
|
-
if (node.name === 'MemLabTaggedStore' && node.type === 'object') {
|
|
165
|
-
tagStore = node;
|
|
166
|
-
return false;
|
|
167
|
-
}
|
|
168
|
-
});
|
|
169
|
-
if (tagStore == null) {
|
|
170
|
-
return false;
|
|
171
|
-
}
|
|
172
|
-
const store = tagStore;
|
|
173
|
-
// get tagStore.taggedObjects
|
|
174
|
-
const taggedObjects = store.getReferenceNode('taggedObjects', 'property');
|
|
175
|
-
if (taggedObjects == null) {
|
|
176
|
-
return false;
|
|
177
|
-
}
|
|
178
|
-
// get taggedObjects[tag]
|
|
179
|
-
const weakSet = taggedObjects.getReferenceNode(tag, 'property');
|
|
180
|
-
if (weakSet == null) {
|
|
181
|
-
return false;
|
|
182
|
-
}
|
|
183
|
-
// get weakSet.table
|
|
184
|
-
const table = weakSet.getReferenceNode('table');
|
|
185
|
-
if (table == null) {
|
|
186
|
-
return false;
|
|
187
|
-
}
|
|
188
|
-
// check if the table has any weak reference to any object
|
|
189
|
-
const ref = table.findAnyReference((edge) => edge.type === 'weak' && edge.toNode.name !== 'system / Oddball');
|
|
190
|
-
return ref != null;
|
|
163
|
+
return MemLabTagStore_1.default.hasObjectWithTag(this, tag);
|
|
191
164
|
}
|
|
192
165
|
getNodeById(id) {
|
|
193
166
|
if (!(id in this._nodeId2NodeIdx)) {
|
|
@@ -35,6 +35,8 @@ class HeapStringNode extends HeapNode_1.default {
|
|
|
35
35
|
if (parentNode == null) {
|
|
36
36
|
throw (0, HeapUtils_1.throwError)(new Error('broken sliced string'));
|
|
37
37
|
}
|
|
38
|
+
// sliced string in heap snapshot doesn't include
|
|
39
|
+
// the start index and the end index, so this may be inaccurate
|
|
38
40
|
return parentNode.stringValue;
|
|
39
41
|
}
|
|
40
42
|
return this.name;
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*
|
|
7
|
+
* @emails oncall+ws_labs
|
|
8
|
+
* @format
|
|
9
|
+
*/
|
|
10
|
+
import type { AnyValue, IHeapSnapshot } from '../Types';
|
|
11
|
+
declare type AnyObject = Record<AnyValue, AnyValue>;
|
|
12
|
+
/** @internal */
|
|
13
|
+
export default class MemLabTaggedStore {
|
|
14
|
+
taggedObjects: Record<string, WeakSet<AnyObject>>;
|
|
15
|
+
private constructor();
|
|
16
|
+
private static instance;
|
|
17
|
+
readonly id: string;
|
|
18
|
+
static getInstance(): MemLabTaggedStore;
|
|
19
|
+
static tagObject<T>(o: T, tag: string): void;
|
|
20
|
+
static hasObjectWithTag(heap: IHeapSnapshot, tag: string): boolean;
|
|
21
|
+
}
|
|
22
|
+
export {};
|
|
23
|
+
//# sourceMappingURL=MemLabTagStore.d.ts.map
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
4
|
+
*
|
|
5
|
+
* This source code is licensed under the MIT license found in the
|
|
6
|
+
* LICENSE file in the root directory of this source tree.
|
|
7
|
+
*
|
|
8
|
+
* @emails oncall+ws_labs
|
|
9
|
+
* @format
|
|
10
|
+
*/
|
|
11
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
+
const __1 = require("../..");
|
|
13
|
+
// monotonically increasing suffix so two ids built within the same
// millisecond of the same process still differ
let uindex = 1;
/**
 * Builds an id string from the process id, the current timestamp,
 * a random number and a per-module counter.
 */
function getUniqueID() {
    const randId = `${Math.random()}`;
    return `${process.pid}-${Date.now()}-${randId}-${uindex++}`;
}
/**
 * Reads the string referenced by the `id` edge of a MemLabTaggedStore
 * heap node. Returns '' when the node has no such string edge.
 */
function readTagStoreID(storeNode) {
    let storeID = '';
    storeNode.forEachReference(edge => {
        if (edge.name_or_index === 'id' && edge.toNode.isString) {
            const stringNode = edge.toNode.toStringNode();
            storeID =
                stringNode == null || stringNode.stringValue == null
                    ? ''
                    : stringNode.stringValue;
            return { stop: true };
        }
    });
    return storeID;
}
/**
 * Locates the heap node of the MemLabTaggedStore to inspect.
 *
 * - no store object in the snapshot: returns null
 * - exactly one store object: returns it without comparing ids
 * - several store objects: returns the one whose `id` string equals
 *   `curContextTagStoreID` (the last one if several match) and throws
 *   when none matches
 */
function findTagStoreNode(heap, curContextTagStoreID) {
    // get all MemLabTaggedStore instances in the heap snapshot
    const stores = [];
    heap.nodes.forEach((node) => {
        if (node.name === 'MemLabTaggedStore' && node.type === 'object') {
            stores.push(node);
        }
    });
    if (stores.length === 0) {
        return null;
    }
    if (stores.length === 1) {
        return stores[0];
    }
    // in case multiple instances of MemLabTaggedStore exist in the heap
    // snapshot, pick the one matching the current execution context
    let tagStore = null;
    stores.forEach((node) => {
        if (readTagStoreID(node) === curContextTagStoreID) {
            tagStore = node;
        }
    });
    if (tagStore == null) {
        throw __1.utils.haltOrThrow('Multiple MemLabTagStore instances found in heap snapshot ' +
            'when checking object tags, please make sure only one memlab ' +
            'instance is running at a time and double check that memlab is ' +
            'not running in Jest concurrent mode.');
    }
    return tagStore;
}
/**
 * Lazily created singleton that groups tagged objects into one WeakSet per
 * tag, and that can later be looked up by name inside a heap snapshot.
 *
 * @internal
 */
class MemLabTaggedStore {
    constructor() {
        this.id = getUniqueID();
        // prototype-less dictionary: tag name -> WeakSet of tagged objects
        this.taggedObjects = Object.create(null);
    }
    /** Returns the shared instance, creating it on first use. */
    static getInstance() {
        if (!MemLabTaggedStore.instance) {
            MemLabTaggedStore.instance = new MemLabTaggedStore();
        }
        return MemLabTaggedStore.instance;
    }
    /**
     * Adds `o` to the WeakSet kept for `tag`, creating the set on first use.
     *
     * @param o object to tag
     * @param tag name of the tag
     */
    static tagObject(o, tag) {
        const store = MemLabTaggedStore.getInstance();
        if (!store.taggedObjects[tag]) {
            store.taggedObjects[tag] = new WeakSet();
        }
        store.taggedObjects[tag].add(o);
    }
    /**
     * Checks whether the heap snapshot contains a tag store whose WeakSet
     * for `tag` still weakly references some object.
     *
     * @param heap heap snapshot to inspect
     * @param tag name of the tag
     * @returns true when such a weak reference is found, false otherwise
     * @throws when several tag stores are in the snapshot and none of them
     *   carries the id of the store in this execution context
     */
    static hasObjectWithTag(heap, tag) {
        const curContextTagStoreID = MemLabTaggedStore.getInstance().id;
        const store = findTagStoreNode(heap, curContextTagStoreID);
        if (store == null) {
            return false;
        }
        // get tagStore.taggedObjects
        const taggedObjects = store.getReferenceNode('taggedObjects', 'property');
        if (taggedObjects == null) {
            return false;
        }
        // get taggedObjects[tag]
        const weakSet = taggedObjects.getReferenceNode(tag, 'property');
        if (weakSet == null) {
            return false;
        }
        // get weakSet.table
        const table = weakSet.getReferenceNode('table');
        if (table == null) {
            return false;
        }
        // check if the table has any weak reference to any object;
        // weak edges pointing at 'system / Oddball' nodes do not count
        const ref = table.findAnyReference((edge) => edge.type === 'weak' && edge.toNode.name !== 'system / Oddball');
        return ref != null;
    }
}
|
|
110
|
+
exports.default = MemLabTaggedStore;
|
|
@@ -20,6 +20,7 @@ const Utils_1 = __importDefault(require("../lib/Utils"));
|
|
|
20
20
|
const TraceElement_1 = require("./TraceElement");
|
|
21
21
|
const TraceSimilarityStrategy_1 = __importDefault(require("./strategies/TraceSimilarityStrategy"));
|
|
22
22
|
const TraceAsClusterStrategy_1 = __importDefault(require("./strategies/TraceAsClusterStrategy"));
|
|
23
|
+
const MLTraceSimilarityStrategy_1 = __importDefault(require("./strategies/MLTraceSimilarityStrategy"));
|
|
23
24
|
// sync up with html/intern/js/webspeed/memlab/lib/LeakCluster.js
|
|
24
25
|
class NormalizedTrace {
|
|
25
26
|
constructor(p = null, snapshot = null) {
|
|
@@ -157,7 +158,11 @@ class NormalizedTrace {
|
|
|
157
158
|
};
|
|
158
159
|
}
|
|
159
160
|
static clusterLeakTraces(leakTraces) {
|
|
160
|
-
const { allClusters } = NormalizedTrace.diffTraces(leakTraces, []
|
|
161
|
+
const { allClusters } = NormalizedTrace.diffTraces(leakTraces, [], {
|
|
162
|
+
strategy: Config_1.default.isMLClustering
|
|
163
|
+
? new MLTraceSimilarityStrategy_1.default()
|
|
164
|
+
: undefined,
|
|
165
|
+
});
|
|
161
166
|
const lastNodeFromTrace = (trace) => trace[trace.length - 1];
|
|
162
167
|
const labaledLeakTraces = allClusters.reduce((acc, bucket) => {
|
|
163
168
|
const lastNodeFromFirstTrace = lastNodeFromTrace(bucket[0]);
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*
|
|
7
|
+
* @emails oncall+ws_labs
|
|
8
|
+
* @format
|
|
9
|
+
*/
|
|
10
|
+
import type { IClusterStrategy, LeakTrace, TraceDiff } from '../../lib/Types';
|
|
11
|
+
export default class MLTraceSimilarityStrategy implements IClusterStrategy {
|
|
12
|
+
diffTraces(newLeakTraces: LeakTrace[]): TraceDiff;
|
|
13
|
+
traceToDoc(trace: LeakTrace): string;
|
|
14
|
+
}
|
|
15
|
+
//# sourceMappingURL=MLTraceSimilarityStrategy.d.ts.map
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
4
|
+
*
|
|
5
|
+
* This source code is licensed under the MIT license found in the
|
|
6
|
+
* LICENSE file in the root directory of this source tree.
|
|
7
|
+
*
|
|
8
|
+
* @emails oncall+ws_labs
|
|
9
|
+
* @format
|
|
10
|
+
*/
|
|
11
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
12
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
13
|
+
};
|
|
14
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
15
|
+
const Config_1 = __importDefault(require("../../lib/Config"));
|
|
16
|
+
const DistanceMatrix_1 = require("./machine-learning/DistanceMatrix");
|
|
17
|
+
const HAC_1 = require("./machine-learning/HAC");
|
|
18
|
+
const TfidfVectorizer_1 = require("./machine-learning/TfidfVectorizer");
|
|
19
|
+
/**
 * Clustering strategy that turns every leak trace into a text document,
 * vectorizes the documents with TF-IDF, computes pairwise distances and
 * groups the traces with the `cluster` routine from ./machine-learning/HAC.
 */
class MLTraceSimilarityStrategy {
    /**
     * Groups the given leak traces into clusters.
     *
     * @param newLeakTraces traces to cluster
     * @returns `allClusters` holds one array per cluster whose first element
     *   is the representative trace; every input trace appears exactly once.
     *   `staleClusters` and `clustersToAdd` are always empty.
     */
    diffTraces(newLeakTraces) {
        // nothing to cluster: avoid feeding an empty corpus to the pipeline,
        // which would otherwise yield a bogus cluster of undefined traces
        if (newLeakTraces.length === 0) {
            return { allClusters: [], staleClusters: [], clustersToAdd: [] };
        }
        const rawDocuments = newLeakTraces.map(trace => this.traceToDoc(trace));
        const vectorizer = new TfidfVectorizer_1.TfidfVectorizer({ rawDocuments });
        const tfidfs = vectorizer.computeTfidfs();
        const dmatrix = (0, DistanceMatrix_1.distance)(tfidfs);
        // result[i] is the index of the representative trace of trace i
        const result = (0, HAC_1.cluster)(rawDocuments.length, dmatrix, Config_1.default.mlClusteringLinkageMaxDistance);
        const clusters = new Map();
        for (let i = 0; i < result.length; i++) {
            const repIdx = result[i];
            const repTrace = newLeakTraces[repIdx];
            let members = clusters.get(repTrace);
            if (members === undefined) {
                // the representative always leads its own cluster
                members = [repTrace];
                clusters.set(repTrace, members);
            }
            // the representative was added when the cluster was created;
            // pushing it again would duplicate it
            if (repIdx !== i) {
                members.push(newLeakTraces[i]);
            }
        }
        return {
            allClusters: Array.from(clusters.values()),
            staleClusters: [],
            clustersToAdd: [],
        };
    }
    /**
     * Serializes a trace into a space separated document with one token per
     * trace element: the node name or the edge name/index, with spaces
     * replaced by '_' and all digits removed. An empty name becomes '_null_'
     * and a name made only of digits becomes '_number_'.
     *
     * @param trace leak trace to serialize
     * @returns the document string
     */
    traceToDoc(trace) {
        const res = [];
        for (const t of trace) {
            let name = t.kind === 'node' ? String(t.name) : String(t.name_or_index);
            if (name === '') {
                name = '_null_';
            }
            name = name.replace(/ /g, '_');
            name = name.replace(/\d/g, '');
            if (name === '') {
                name = '_number_';
            }
            res.push(name);
        }
        return res.join(' ');
    }
}
|
|
61
|
+
exports.default = MLTraceSimilarityStrategy;
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*
|
|
7
|
+
* @emails oncall+ws_labs
|
|
8
|
+
* @format
|
|
9
|
+
*/
|
|
10
|
+
export declare const distance: (tfidfs: Record<string, number>[]) => Float32Array;
|
|
11
|
+
//# sourceMappingURL=DistanceMatrix.d.ts.map
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
4
|
+
*
|
|
5
|
+
* This source code is licensed under the MIT license found in the
|
|
6
|
+
* LICENSE file in the root directory of this source tree.
|
|
7
|
+
*
|
|
8
|
+
* @emails oncall+ws_labs
|
|
9
|
+
* @format
|
|
10
|
+
*/
|
|
11
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
+
exports.distance = void 0;
|
|
13
|
+
/**
 * Sums a[k] * b[k] over the terms present in both sparse vectors.
 * Walks the shorter key list and probes the other vector.
 */
const dotProductOfCommonTerms = (a, keysOfA, b, keysOfB) => {
    const [keys, own, other] = keysOfA.length > keysOfB.length ? [keysOfB, b, a] : [keysOfA, a, b];
    let sum = 0;
    for (const k of keys) {
        if (Object.prototype.hasOwnProperty.call(other, k)) {
            sum += own[k] * other[k];
        }
    }
    return sum;
};
/**
 * Computes the pairwise cosine distance (1 - cosine similarity) between
 * sparse TF-IDF vectors.
 *
 * @param tfidfs one term -> weight record per document
 * @returns condensed distance matrix of length n * (n - 1) / 2, holding the
 *   pairs (0,1), (0,2), ..., (0,n-1), (1,2), ... in that order. A pair
 *   involving an all-zero vector gets distance 1.
 */
const distance = (tfidfs) => {
    const n = tfidfs.length;
    const distances = new Float32Array((n * (n - 1)) / 2);
    // per-call lookups; nothing is shared between invocations
    const keysPerDoc = tfidfs.map(tfidf => Object.keys(tfidf));
    const norms = tfidfs.map(tfidf => Math.sqrt(Object.values(tfidf).reduce((sum, v) => sum + v * v, 0)));
    let distIdx = 0;
    for (let i = 0; i < n; i++) {
        for (let j = i + 1; j < n; j++) {
            const dotProdOfCommons = dotProductOfCommonTerms(tfidfs[i], keysPerDoc[i], tfidfs[j], keysPerDoc[j]);
            // TODO make it pluggable to use other distance measures like euclidean, manhattan
            // cosine similarity = (a . b) / (|a| * |b|)
            const denominator = norms[i] * norms[j];
            // a zero vector has no direction: treat it as maximally distant
            // instead of storing NaN
            distances[distIdx] =
                denominator === 0 ? 1 : 1 - dotProdOfCommons / denominator;
            distIdx++;
        }
    }
    return distances;
};
|
|
54
|
+
exports.distance = distance;
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*
|
|
7
|
+
* @emails oncall+ws_labs
|
|
8
|
+
* @format
|
|
9
|
+
*/
|
|
10
|
+
/**
|
|
11
|
+
*
|
|
12
|
+
* @param {*} nDocs number of docs
|
|
13
|
+
* @param {*} D condensed distance matrix
|
|
14
|
+
* @returns labels - list of doc ids as clusters
|
|
15
|
+
*/
|
|
16
|
+
export declare const cluster: (nDocs: number, condensedDistanceMatrix: Float32Array, maxDistanceThreshold: number) => number[] | Uint32Array;
|
|
17
|
+
//# sourceMappingURL=HAC.d.ts.map
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
4
|
+
*
|
|
5
|
+
* This source code is licensed under the MIT license found in the
|
|
6
|
+
* LICENSE file in the root directory of this source tree.
|
|
7
|
+
*
|
|
8
|
+
* @emails oncall+ws_labs
|
|
9
|
+
* @format
|
|
10
|
+
*/
|
|
11
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
+
exports.cluster = void 0;
|
|
13
|
+
/**
 * Maps an unordered pair of distinct document indices to its slot in a
 * condensed (upper triangle, row by row) distance matrix of `n` documents.
 */
const condensedIndex = (n, i, j) => {
    if (i > j) {
        return condensedIndex(n, j, i);
    }
    // slots used by rows 0..i-1: (n - 1) + (n - 2) + ... + (n - i),
    // plus the offset inside row i: (j - i) - 1
    return n * i - (i * (i + 1)) / 2 + (j - i - 1);
};
/**
 * Follows the parent links in `array` from `idx` until reaching an entry
 * that points to itself, and returns that entry's index.
 */
function getRootLabel(array, idx) {
    let rootIdx = idx;
    while (array[rootIdx] !== rootIdx) {
        rootIdx = array[rootIdx];
    }
    return rootIdx;
}
/**
 * Agglomerative clustering over a condensed distance matrix.
 *
 * @param {*} nDocs number of docs
 * @param {*} condensedDistanceMatrix pairwise distances, in the layout
 *   addressed by `condensedIndex`; the input is copied, not modified
 * @param {*} maxDistanceThreshold two clusters are only merged when their
 *   distance does not exceed this value
 * @returns labels - for every doc, the index of the doc that represents its
 *   cluster; empty when there are no docs
 */
const cluster = (nDocs, condensedDistanceMatrix, maxDistanceThreshold) => {
    // no documents means no labels (previously this returned [0])
    if (nDocs <= 0)
        return [];
    if (nDocs <= 1)
        return [0];
    const condensedDistanceMatrixCopy = new Float32Array(condensedDistanceMatrix);
    // size of the cluster rooted at each index; 0 marks an index that was
    // merged into another cluster or retired
    const sizeOfClusters = new Uint32Array(nDocs).fill(1);
    let chainLength = 0;
    let clusterChain = [];
    let traceAIdx = -1;
    let traceBIdx = -1;
    let currentMin = Number.MAX_SAFE_INTEGER;
    // labels[i] is the parent of i; roots point to themselves
    const labels = new Uint32Array(nDocs).map((_, idx) => idx);
    for (let k = 0; k < nDocs - 1; k++) {
        traceBIdx = -1;
        // start a new chain from the first cluster that is still active
        if (chainLength === 0) {
            for (let i = 0; i < nDocs; i++) {
                if (sizeOfClusters[i] > 0) {
                    clusterChain[0] = i;
                    chainLength = 1;
                    break;
                }
            }
        }
        // extend the chain with the nearest active neighbour of its tip
        // until the tip and its predecessor are each other's nearest
        while (chainLength > 0) {
            traceAIdx = clusterChain[chainLength - 1];
            if (chainLength > 1) {
                traceBIdx = clusterChain[chainLength - 2];
                currentMin =
                    condensedDistanceMatrixCopy[condensedIndex(nDocs, traceAIdx, traceBIdx)];
            }
            else {
                currentMin = Number.MAX_SAFE_INTEGER;
            }
            for (let i = 0; i < nDocs; i++) {
                if (sizeOfClusters[i] == 0 || traceAIdx == i) {
                    continue;
                }
                const distanceBetweenTraces = condensedDistanceMatrixCopy[condensedIndex(nDocs, traceAIdx, i)];
                if (distanceBetweenTraces < currentMin) {
                    currentMin = distanceBetweenTraces;
                    traceBIdx = i;
                }
            }
            // make sure that traceA and traceB are closest to each other
            if (chainLength > 1 &&
                traceBIdx !== -1 &&
                traceBIdx === clusterChain[chainLength - 2]) {
                break;
            }
            clusterChain[chainLength] = traceBIdx;
            chainLength = chainLength + 1;
        }
        clusterChain = [];
        chainLength = 0;
        // the closest pair is too far apart: retire both without merging
        if (currentMin > maxDistanceThreshold) {
            sizeOfClusters[traceAIdx] = 0;
            sizeOfClusters[traceBIdx] = 0;
            continue;
        }
        if (traceAIdx === -1 || traceBIdx === -1) {
            continue;
        }
        // merge the lower index into the higher one
        if (traceAIdx > traceBIdx) {
            [traceAIdx, traceBIdx] = [traceBIdx, traceAIdx];
        }
        const nx = sizeOfClusters[traceAIdx];
        const ny = sizeOfClusters[traceBIdx];
        labels[traceAIdx] = traceBIdx;
        sizeOfClusters[traceAIdx] = 0;
        sizeOfClusters[traceBIdx] = nx + ny;
        // refresh the distance from every other active cluster to the
        // merged cluster, stored in traceB's slots
        for (let i = 0; i < nDocs; i++) {
            const ni = sizeOfClusters[i];
            if (ni === 0 || i === traceBIdx) {
                continue;
            }
            const d_xi = condensedDistanceMatrixCopy[condensedIndex(nDocs, i, traceAIdx)];
            const d_yi = condensedDistanceMatrixCopy[condensedIndex(nDocs, i, traceBIdx)];
            const size_x = nx;
            const size_y = ny;
            // TODO make it generic to support other linkage methods like complete, weighted etc...
            // size-weighted average of the distances to the two merged clusters
            const updatedDist = (size_x * d_xi + size_y * d_yi) / (size_x + size_y);
            condensedDistanceMatrixCopy[condensedIndex(nDocs, i, traceBIdx)] =
                updatedDist;
        }
    }
    return labels.map((_, idx) => getRootLabel(labels, idx));
};
|
|
122
|
+
exports.cluster = cluster;
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*
|
|
7
|
+
* @emails oncall+ws_labs
|
|
8
|
+
* @format
|
|
9
|
+
*/
|
|
10
|
+
export declare function nGram(n: number, terms: string[]): string[];
|
|
11
|
+
//# sourceMappingURL=Ngram.d.ts.map
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
4
|
+
*
|
|
5
|
+
* This source code is licensed under the MIT license found in the
|
|
6
|
+
* LICENSE file in the root directory of this source tree.
|
|
7
|
+
*
|
|
8
|
+
* @emails oncall+ws_labs
|
|
9
|
+
* @format
|
|
10
|
+
*/
|
|
11
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
+
exports.nGram = void 0;
|
|
13
|
+
/**
 * Builds all contiguous n-grams of `terms`, each joined with a single space.
 *
 * @param n number of terms per gram
 * @param terms ordered list of terms
 * @returns the n-grams in order of appearance; empty when `n` is not at
 *   least 1 or when `terms` has fewer than `n` entries
 */
function nGram(n, terms) {
    const nGrams = [];
    // n <= 0 (or NaN) would otherwise produce a list of empty strings
    if (!(n >= 1)) {
        return nGrams;
    }
    for (let index = 0; index <= terms.length - n; ++index) {
        nGrams.push(terms.slice(index, index + n).join(' '));
    }
    return nGrams;
}
|
|
22
|
+
exports.nGram = nGram;
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
3
|
+
*
|
|
4
|
+
* This source code is licensed under the MIT license found in the
|
|
5
|
+
* LICENSE file in the root directory of this source tree.
|
|
6
|
+
*
|
|
7
|
+
* @emails oncall+ws_labs
|
|
8
|
+
* @format
|
|
9
|
+
*/
|
|
10
|
+
interface TfidfVectorizerProps {
|
|
11
|
+
rawDocuments: string[];
|
|
12
|
+
maxDF?: number;
|
|
13
|
+
}
|
|
14
|
+
export declare class TfidfVectorizer {
|
|
15
|
+
rawDocuments: string[];
|
|
16
|
+
vocabulary: Record<string, string>;
|
|
17
|
+
documentFrequency: Record<string, number>;
|
|
18
|
+
maxDF: number;
|
|
19
|
+
documents: Record<string, number>[];
|
|
20
|
+
tfidfs: Record<string, number>[];
|
|
21
|
+
constructor({ rawDocuments, maxDF }: TfidfVectorizerProps);
|
|
22
|
+
computeTfidfs(): Record<string, number>[];
|
|
23
|
+
tokenize(text: string): string[];
|
|
24
|
+
buildVocabulary(tokenizedDocuments: string[][]): Record<string, string>;
|
|
25
|
+
processDocuments(tokenizedDocuments: string[][]): void;
|
|
26
|
+
limit(): void;
|
|
27
|
+
/**
|
|
28
|
+
* Smooth idf weights by adding 1 to document frequencies (DF), as if an extra
|
|
29
|
+
* document was seen containing every term in the collection exactly once.
|
|
30
|
+
* This prevents zero divisions.
|
|
31
|
+
* */
|
|
32
|
+
smooth(): void;
|
|
33
|
+
buildTfidfs(): Record<string, number>[];
|
|
34
|
+
tf(vocabIdx: string, document: Record<string, number>): number;
|
|
35
|
+
idf(vocabIdx: string): number;
|
|
36
|
+
}
|
|
37
|
+
export {};
|
|
38
|
+
//# sourceMappingURL=TfidfVectorizer.d.ts.map
|