@memlab/core 1.1.4 → 1.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. package/dist/__tests__/parser/HeapParser.test.js +2 -2
  2. package/dist/__tests__/parser/NodeHeap.test.js +5 -5
  3. package/dist/__tests__/parser/StringNode.test.js +1 -1
  4. package/dist/__tests__/parser/traverse/HeapNodeTraverse.test.js +2 -2
  5. package/dist/index.d.ts +5 -1
  6. package/dist/index.js +22 -2
  7. package/dist/lib/Config.d.ts +16 -9
  8. package/dist/lib/Config.js +15 -0
  9. package/dist/lib/FileManager.js +4 -2
  10. package/dist/lib/HeapAnalyzer.js +25 -9
  11. package/dist/lib/NodeHeap.d.ts +52 -9
  12. package/dist/lib/NodeHeap.js +72 -21
  13. package/dist/lib/PackageInfoLoader.d.ts +7 -0
  14. package/dist/lib/PackageInfoLoader.js +66 -0
  15. package/dist/lib/Serializer.js +48 -25
  16. package/dist/lib/Types.d.ts +119 -35
  17. package/dist/lib/Utils.js +24 -9
  18. package/dist/lib/heap-data/HeapSnapshot.d.ts +1 -0
  19. package/dist/lib/heap-data/HeapSnapshot.js +3 -30
  20. package/dist/lib/heap-data/HeapStringNode.js +2 -0
  21. package/dist/lib/heap-data/MemLabTagStore.d.ts +23 -0
  22. package/dist/lib/heap-data/MemLabTagStore.js +110 -0
  23. package/dist/trace-cluster/TraceBucket.js +6 -1
  24. package/dist/trace-cluster/strategies/MLTraceSimilarityStrategy.d.ts +15 -0
  25. package/dist/trace-cluster/strategies/MLTraceSimilarityStrategy.js +61 -0
  26. package/dist/trace-cluster/strategies/machine-learning/DistanceMatrix.d.ts +11 -0
  27. package/dist/trace-cluster/strategies/machine-learning/DistanceMatrix.js +54 -0
  28. package/dist/trace-cluster/strategies/machine-learning/HAC.d.ts +17 -0
  29. package/dist/trace-cluster/strategies/machine-learning/HAC.js +122 -0
  30. package/dist/trace-cluster/strategies/machine-learning/Ngram.d.ts +11 -0
  31. package/dist/trace-cluster/strategies/machine-learning/Ngram.js +22 -0
  32. package/dist/trace-cluster/strategies/machine-learning/TfidfVectorizer.d.ts +38 -0
  33. package/dist/trace-cluster/strategies/machine-learning/TfidfVectorizer.js +144 -0
  34. package/package.json +1 -1
@@ -0,0 +1,144 @@
1
"use strict";
/**
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 *
 * @emails oncall+ws_labs
 * @format
 */
// CommonJS interop helper emitted by the TypeScript compiler: wraps a
// non-ES-module value so its export can be accessed via `.default`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.TfidfVectorizer = void 0;
const Config_1 = __importDefault(require("../../../lib/Config"));
const Ngram_1 = require("./Ngram");
// Key of the synthetic document appended by smooth(); buildTfidfs() skips it.
const SMOOTHING_KEY = '__smoothObjectKey';
// Sentinel vocabulary index assigned to documents emptied by max-DF pruning,
// so that all such documents end up identical (and cluster together).
const VOCAB_IDX_FOR_DOC_WITH_HIGH_DF = '-1';
20
/**
 * Computes L2-normalized TF-IDF vectors (sublinear TF, smoothed IDF) for a
 * collection of documents given as space-separated term strings. Each
 * document's token set is augmented with word 2-grams and 3-grams. Terms
 * whose document frequency exceeds `maxDF * nDocs` are pruned; documents
 * emptied by that pruning are mapped to a shared sentinel dimension.
 */
class TfidfVectorizer {
    /**
     * @param rawDocuments - one space-separated term string per document
     * @param maxDF - optional fraction; terms occurring in more than
     *   floor(nDocs * maxDF) documents are dropped. Defaults to
     *   config.mlMaxDF when null/undefined.
     */
    constructor({ rawDocuments, maxDF }) {
        // term -> vocabulary index (index stored as a string)
        this.vocabulary = Object.create(null);
        // vocabulary index -> number of documents containing that term
        this.documentFrequency = Object.create(null);
        // per-document term counts, keyed by vocabulary index
        this.documents = [];
        this.rawDocuments = rawDocuments;
        // `!== null && !== void 0` mirrors the nullish-coalescing default
        this.maxDF = maxDF !== null && maxDF !== void 0 ? maxDF : Config_1.default.mlMaxDF;
    }
    /**
     * Runs the full pipeline: tokenize -> vocabulary -> counts/DF ->
     * max-DF pruning -> IDF smoothing -> normalized tfidf vectors.
     * @returns one {vocabIdx: weight} map per input document
     */
    computeTfidfs() {
        // tokenize reads no instance state, so the unbound reference is safe
        const tokenizedDocuments = this.rawDocuments.map(this.tokenize);
        this.vocabulary = this.buildVocabulary(tokenizedDocuments);
        this.processDocuments(tokenizedDocuments);
        this.limit();
        this.smooth();
        this.tfidfs = this.buildTfidfs();
        return this.tfidfs;
    }
    /** Splits on single spaces and appends word 2-grams and 3-grams. */
    tokenize(text) {
        const terms = text.split(' ');
        return [...terms, ...(0, Ngram_1.nGram)(2, terms), ...(0, Ngram_1.nGram)(3, terms)];
    }
    /** Assigns each distinct term a sequential string-valued index. */
    buildVocabulary(tokenizedDocuments) {
        let vocabIdx = 0;
        const vocabulary = Object.create(null);
        tokenizedDocuments.forEach(doc => {
            doc.forEach(term => {
                if (!vocabulary[String(term)]) {
                    vocabulary[String(term)] = String(vocabIdx);
                    vocabIdx++;
                }
            });
        });
        return vocabulary;
    }
    /** Builds per-document term counts and the global document-frequency table. */
    processDocuments(tokenizedDocuments) {
        tokenizedDocuments.forEach(terms => {
            const document = {};
            terms.forEach(t => {
                const vocabIdx = this.vocabulary[t];
                if (document[vocabIdx]) {
                    document[vocabIdx] += 1;
                }
                else {
                    // first occurrence of this term in this document:
                    // count it toward the document frequency exactly once
                    if (this.documentFrequency[vocabIdx]) {
                        this.documentFrequency[vocabIdx] += 1;
                    }
                    else {
                        this.documentFrequency[vocabIdx] = 1;
                    }
                    document[vocabIdx] = 1;
                }
            });
            this.documents.push(document);
        });
    }
    /**
     * Prunes terms whose document frequency exceeds floor(nDocs * maxDF)
     * from every document, from the DF table, and from the vocabulary.
     */
    limit() {
        const nMaxDF = Math.floor(this.documents.length * this.maxDF);
        const vocabIdxsToDelete = [];
        this.documents.forEach(doc => {
            Object.keys(doc).forEach(vocabIdx => {
                if (this.documentFrequency[vocabIdx] > nMaxDF) {
                    delete doc[vocabIdx];
                    vocabIdxsToDelete.push(vocabIdx);
                }
            });
        });
        // the same index is pushed once per document that contained the term;
        // a Set dedupes and gives O(1) membership checks below
        const deletedIdxSet = new Set(vocabIdxsToDelete);
        deletedIdxSet.forEach(vocabIdx => {
            delete this.documentFrequency[vocabIdx];
        });
        // BUGFIX: `vocabulary` maps term -> vocabIdx, so pruned entries must
        // be removed by *value*. The previous `delete this.vocabulary[vocabIdx]`
        // deleted by key, leaving stale entries whose DF later became NaN in
        // smooth() (undefined + 1). The stale NaNs were never read back, so
        // tfidf output is unchanged by this fix.
        Object.keys(this.vocabulary).forEach(term => {
            if (deletedIdxSet.has(this.vocabulary[term])) {
                delete this.vocabulary[term];
            }
        });
    }
    /**
     * Smooth idf weights by adding 1 to document frequencies (DF), as if an extra
     * document was seen containing every term in the collection exactly once.
     * This prevents zero divisions.
     */
    smooth() {
        // bump DF for every surviving vocabulary entry
        Object.values(this.vocabulary).forEach(vocabIdx => (this.documentFrequency[vocabIdx] =
            this.documentFrequency[vocabIdx] + 1));
        // the synthetic document only raises documents.length for idf();
        // buildTfidfs() recognizes and skips it via SMOOTHING_KEY
        this.documents.push({ [SMOOTHING_KEY]: 1 });
    }
    /**
     * Computes tf * idf per surviving term of each real document and
     * L2-normalizes each resulting vector.
     * @returns array of {vocabIdx: normalizedWeight} objects
     */
    buildTfidfs() {
        const tfidfs = [];
        this.documents.forEach(document => {
            // this means all the terms in the document are the terms
            // that have high document frequency.
            // This will make all the docs with high DF to be clustered together.
            if (Object.keys(document).length === 0) {
                tfidfs.push({ [VOCAB_IDX_FOR_DOC_WITH_HIGH_DF]: 1 });
                return;
            }
            if (!document[SMOOTHING_KEY]) {
                const atfidf = Object.keys(document).map(vocabIdx => {
                    return [vocabIdx, this.tf(vocabIdx, document) * this.idf(vocabIdx)];
                });
                // L2 norm of the raw tfidf vector
                const dotSum = atfidf
                    .map(([_, tfidfValue]) => tfidfValue * tfidfValue)
                    .reduce((sum, tfidfValueSquared) => sum + tfidfValueSquared, 0);
                const dotSumSqrRoot = Math.sqrt(dotSum);
                // Normalizing tfidfs
                const atfidfVocabIdxValueObject = atfidf
                    .map(([vocabIdx, tfidfValue]) => [
                    vocabIdx,
                    tfidfValue / dotSumSqrRoot,
                ])
                    .reduce((obj, [vocabIdx, value]) => {
                    obj[vocabIdx] = value;
                    return obj;
                }, {});
                tfidfs.push(atfidfVocabIdxValueObject);
            }
        });
        return tfidfs;
    }
    /** Sublinear term frequency: 1 + ln(term count in the document). */
    tf(vocabIdx, document) {
        return 1 + Math.log(document[vocabIdx]);
    }
    /** Smoothed inverse document frequency: 1 + ln(nDocs / DF). */
    idf(vocabIdx) {
        return (1 + Math.log(this.documents.length / this.documentFrequency[vocabIdx]));
    }
}
exports.TfidfVectorizer = TfidfVectorizer;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@memlab/core",
3
- "version": "1.1.4",
3
+ "version": "1.1.7",
4
4
  "license": "MIT",
5
5
  "description": "memlab core libraries",
6
6
  "author": "Liang Gong <lgong@fb.com>",