@memlab/core 1.1.4 → 1.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/__tests__/parser/HeapParser.test.js +2 -2
- package/dist/__tests__/parser/NodeHeap.test.js +5 -5
- package/dist/__tests__/parser/StringNode.test.js +1 -1
- package/dist/__tests__/parser/traverse/HeapNodeTraverse.test.js +2 -2
- package/dist/index.d.ts +5 -1
- package/dist/index.js +22 -2
- package/dist/lib/Config.d.ts +16 -9
- package/dist/lib/Config.js +15 -0
- package/dist/lib/FileManager.js +4 -2
- package/dist/lib/HeapAnalyzer.js +25 -9
- package/dist/lib/NodeHeap.d.ts +52 -9
- package/dist/lib/NodeHeap.js +72 -21
- package/dist/lib/PackageInfoLoader.d.ts +7 -0
- package/dist/lib/PackageInfoLoader.js +66 -0
- package/dist/lib/Serializer.js +48 -25
- package/dist/lib/Types.d.ts +119 -35
- package/dist/lib/Utils.js +24 -9
- package/dist/lib/heap-data/HeapSnapshot.d.ts +1 -0
- package/dist/lib/heap-data/HeapSnapshot.js +3 -30
- package/dist/lib/heap-data/HeapStringNode.js +2 -0
- package/dist/lib/heap-data/MemLabTagStore.d.ts +23 -0
- package/dist/lib/heap-data/MemLabTagStore.js +110 -0
- package/dist/trace-cluster/TraceBucket.js +6 -1
- package/dist/trace-cluster/strategies/MLTraceSimilarityStrategy.d.ts +15 -0
- package/dist/trace-cluster/strategies/MLTraceSimilarityStrategy.js +61 -0
- package/dist/trace-cluster/strategies/machine-learning/DistanceMatrix.d.ts +11 -0
- package/dist/trace-cluster/strategies/machine-learning/DistanceMatrix.js +54 -0
- package/dist/trace-cluster/strategies/machine-learning/HAC.d.ts +17 -0
- package/dist/trace-cluster/strategies/machine-learning/HAC.js +122 -0
- package/dist/trace-cluster/strategies/machine-learning/Ngram.d.ts +11 -0
- package/dist/trace-cluster/strategies/machine-learning/Ngram.js +22 -0
- package/dist/trace-cluster/strategies/machine-learning/TfidfVectorizer.d.ts +38 -0
- package/dist/trace-cluster/strategies/machine-learning/TfidfVectorizer.js +144 -0
- package/package.json +1 -1
package/dist/trace-cluster/strategies/machine-learning/TfidfVectorizer.js
@@ -0,0 +1,144 @@
+"use strict";
+/**
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ *
+ * @emails oncall+ws_labs
+ * @format
+ */
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.TfidfVectorizer = void 0;
+const Config_1 = __importDefault(require("../../../lib/Config"));
+const Ngram_1 = require("./Ngram");
+const SMOOTHING_KEY = '__smoothObjectKey';
+const VOCAB_IDX_FOR_DOC_WITH_HIGH_DF = '-1';
+class TfidfVectorizer {
+    constructor({ rawDocuments, maxDF }) {
+        this.rawDocuments = [];
+        this.vocabulary = Object.create(null);
+        this.documentFrequency = Object.create(null);
+        this.documents = [];
+        this.rawDocuments = rawDocuments;
+        this.maxDF = maxDF !== null && maxDF !== void 0 ? maxDF : Config_1.default.mlMaxDF;
+    }
+    computeTfidfs() {
+        const tokenizedDocuments = this.rawDocuments.map(this.tokenize);
+        this.vocabulary = this.buildVocabulary(tokenizedDocuments);
+        this.processDocuments(tokenizedDocuments);
+        this.limit();
+        this.smooth();
+        this.tfidfs = this.buildTfidfs();
+        return this.tfidfs;
+    }
+    tokenize(text) {
+        const terms = text.split(' ');
+        return [...terms, ...(0, Ngram_1.nGram)(2, terms), ...(0, Ngram_1.nGram)(3, terms)];
+    }
+    buildVocabulary(tokenizedDocuments) {
+        let vocabIdx = 0;
+        const vocabulary = Object.create(null);
+        tokenizedDocuments.forEach(doc => {
+            doc.forEach(term => {
+                if (!vocabulary[String(term)]) {
+                    vocabulary[String(term)] = String(vocabIdx);
+                    vocabIdx++;
+                }
+            });
+        });
+        return vocabulary;
+    }
+    processDocuments(tokenizedDocuments) {
+        tokenizedDocuments.forEach(terms => {
+            const document = {};
+            terms.forEach(t => {
+                const vocabIdx = this.vocabulary[t];
+                if (document[vocabIdx]) {
+                    document[vocabIdx] += 1;
+                }
+                else {
+                    if (this.documentFrequency[vocabIdx]) {
+                        this.documentFrequency[vocabIdx] += 1;
+                    }
+                    else {
+                        this.documentFrequency[vocabIdx] = 1;
+                    }
+                    document[vocabIdx] = 1;
+                }
+            });
+            this.documents.push(document);
+        });
+    }
+    limit() {
+        const nMaxDF = Math.floor(this.documents.length * this.maxDF);
+        const vocabIdxsToDelete = [];
+        this.documents.forEach(doc => {
+            Object.keys(doc).forEach(vocabIdx => {
+                if (this.documentFrequency[vocabIdx] > nMaxDF) {
+                    delete doc[vocabIdx];
+                    vocabIdxsToDelete.push(vocabIdx);
+                }
+            });
+        });
+        vocabIdxsToDelete.forEach(vocabIdx => {
+            delete this.documentFrequency[vocabIdx];
+            delete this.vocabulary[vocabIdx];
+        });
+    }
+    /**
+     * Smooth idf weights by adding 1 to document frequencies (DF), as if an extra
+     * document was seen containing every term in the collection exactly once.
+     * This prevents zero divisions.
+     * */
+    smooth() {
+        // for each vocabulary
+        Object.values(this.vocabulary).forEach(vocabIdx => (this.documentFrequency[vocabIdx] =
+            this.documentFrequency[vocabIdx] + 1));
+        this.documents.push({ [SMOOTHING_KEY]: 1 });
+    }
+    buildTfidfs() {
+        const tfidfs = [];
+        this.documents.forEach(document => {
+            // this means all the terms in the document are the terms
+            // that have high document frequency.
+            // This will make all the docs with high DF to be clustered together.
+            if (Object.keys(document).length === 0) {
+                tfidfs.push({ [VOCAB_IDX_FOR_DOC_WITH_HIGH_DF]: 1 });
+                return;
+            }
+            if (!document[SMOOTHING_KEY]) {
+                const atfidf = Object.keys(document).map(vocabIdx => {
+                    return [vocabIdx, this.tf(vocabIdx, document) * this.idf(vocabIdx)];
+                });
+                // normalizing the values
+                const dotSum = atfidf
+                    .map(([_, tfidfValue]) => tfidfValue * tfidfValue)
+                    .reduce((sum, tfidfValueSquered) => sum + tfidfValueSquered, 0);
+                const dotSumSqrRoot = Math.sqrt(dotSum);
+                // Normalizing tfidfs
+                const atfidfVocabIdxValueObject = atfidf
+                    .map(([vocabIdx, tfidfValue]) => [
+                        vocabIdx,
+                        tfidfValue / dotSumSqrRoot,
+                    ])
+                    .reduce((obj, [vocabIdx, value]) => {
+                        obj[vocabIdx] = value;
+                        return obj;
+                    }, {});
+                tfidfs.push(atfidfVocabIdxValueObject);
+            }
+        });
+        return tfidfs;
+    }
+    tf(vocabIdx, document) {
+        return 1 + Math.log(document[vocabIdx]);
+    }
+    idf(vocabIdx) {
+        return (1 + Math.log(this.documents.length / this.documentFrequency[vocabIdx]));
+    }
+}
+exports.TfidfVectorizer = TfidfVectorizer;
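
For reference, the sketch below shows one way the vectorizer added in this file could be exercised. The `{ rawDocuments, maxDF }` constructor options and the `computeTfidfs()` entry point come from the diff above; the deep `require` path into `dist`, the sample documents, and the `maxDF` value are illustrative assumptions, not part of the package's documented public API.

// Minimal usage sketch; the import path and inputs are assumptions for illustration.
const {
  TfidfVectorizer,
} = require('@memlab/core/dist/trace-cluster/strategies/machine-learning/TfidfVectorizer');

// Each "document" is a space-separated token string; the tokenizer also adds
// 2-grams and 3-grams of the tokens before computing term frequencies.
const rawDocuments = [
  'Window foo bar leakedObject',
  'Window foo baz leakedObject',
  'Window qux detachedNode',
];

// maxDF = 1 keeps every term here; terms whose document frequency exceeds
// floor(numDocuments * maxDF) would otherwise be dropped by limit().
const vectorizer = new TfidfVectorizer({rawDocuments, maxDF: 1});

// Returns one sparse vector per input document, keyed by vocabulary index,
// with tf-idf values L2-normalized within each document.
const tfidfs = vectorizer.computeTfidfs();
console.log(tfidfs);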