lhcb-ntuple-wizard-test 1.1.7 → 1.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/worker.js +6 -0
- package/package.json +1 -1
- package/dist/components/semantic.js +0 -95
- package/dist/components/worker.js +0 -196
package/dist/lib/worker.js
CHANGED
|
@@ -1,5 +1,11 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
|
|
3
|
+
Object.defineProperty(exports, "__esModule", {
|
|
4
|
+
value: true
|
|
5
|
+
});
|
|
6
|
+
exports.calcCosSim = calcCosSim;
|
|
7
|
+
exports.embed = embed;
|
|
8
|
+
exports.readJsonFile = readJsonFile;
|
|
3
9
|
var _transformers = require("@xenova/transformers");
|
|
4
10
|
/*****************************************************************************\
|
|
5
11
|
* (c) Copyright 2024 CERN for the benefit of the LHCb Collaboration *
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "lhcb-ntuple-wizard-test",
|
|
3
|
-
"version": "1.1.7",
|
|
3
|
+
"version": "1.1.9",
|
|
4
4
|
"description": "An application to access large-scale open data from LHCb",
|
|
5
5
|
"url": "https://gitlab.cern.ch/lhcb-dpa/wp6-analysis-preservation-and-open-data/lhcb-ntuple-wizard-frontend/issues",
|
|
6
6
|
"private": false,
|
package/dist/components/semantic.js
|
@@ -1,95 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
|
|
3
|
-
Object.defineProperty(exports, "__esModule", {
|
|
4
|
-
value: true
|
|
5
|
-
});
|
|
6
|
-
exports.embedCorpus = embedCorpus;
|
|
7
|
-
exports.getSimilarity = getSimilarity;
|
|
8
|
-
exports.loadModel = loadModel;
|
|
9
|
-
var _transformers = require("@xenova/transformers");
|
|
10
|
-
/*****************************************************************************\
|
|
11
|
-
* (c) Copyright 2024 CERN for the benefit of the LHCb Collaboration *
|
|
12
|
-
* *
|
|
13
|
-
* This software is distributed under the terms of the GNU General Public *
|
|
14
|
-
* Licence version 3 (GPL Version 3), copied verbatim in the file "COPYING". *
|
|
15
|
-
* *
|
|
16
|
-
* In applying this licence, CERN does not waive the privileges and immunities *
|
|
17
|
-
* granted to it by virtue of its status as an Intergovernmental Organization *
|
|
18
|
-
* or submit itself to any jurisdiction. *
|
|
19
|
-
\*****************************************************************************/
|
|
20
|
-
|
|
21
|
-
// To always download the model from huggingface.com
|
|
22
|
-
_transformers.env.allowLocalModels = false;
|
|
23
|
-
_transformers.env.useBrowserCache = false;
|
|
24
|
-
// Create a new worker
|
|
25
|
-
const worker = new Worker(new URL("./worker.jsx", import.meta.url));
|
|
26
|
-
/**
|
|
27
|
-
* @type {function}
|
|
28
|
-
* Promise resolve function for loading the model
|
|
29
|
-
*/
|
|
30
|
-
let loadResolve;
|
|
31
|
-
/**
|
|
32
|
-
* @type {function}
|
|
33
|
-
*/
|
|
34
|
-
let queryResolve;
|
|
35
|
-
worker.onmessage = function (event) {
|
|
36
|
-
const message = event.data;
|
|
37
|
-
switch (message.type) {
|
|
38
|
-
case "progress":
|
|
39
|
-
if (message.progress.status === "ready") {
|
|
40
|
-
loadResolve();
|
|
41
|
-
}
|
|
42
|
-
break;
|
|
43
|
-
case "corpus":
|
|
44
|
-
// the corpus is embedded
|
|
45
|
-
queryResolve();
|
|
46
|
-
break;
|
|
47
|
-
case "result":
|
|
48
|
-
queryResolve(message.result);
|
|
49
|
-
break;
|
|
50
|
-
}
|
|
51
|
-
};
|
|
52
|
-
/**
|
|
53
|
-
*
|
|
54
|
-
* @param {string} modelname
|
|
55
|
-
* Load the model with the provided model name
|
|
56
|
-
* @returns
|
|
57
|
-
*/
|
|
58
|
-
async function loadModel(modelname) {
|
|
59
|
-
worker.postMessage({
|
|
60
|
-
type: "init",
|
|
61
|
-
model: modelname
|
|
62
|
-
});
|
|
63
|
-
return new Promise(resolve => {
|
|
64
|
-
loadResolve = resolve;
|
|
65
|
-
});
|
|
66
|
-
}
|
|
67
|
-
/**
|
|
68
|
-
* Passes corpus embedding from backend to the worker
|
|
69
|
-
* @returns
|
|
70
|
-
*/
|
|
71
|
-
async function embedCorpus(metadata) {
|
|
72
|
-
worker.postMessage({
|
|
73
|
-
type: "corpus",
|
|
74
|
-
kgdoc: metadata.metadata.kgdoc,
|
|
75
|
-
emb: metadata.metadata.embedding
|
|
76
|
-
});
|
|
77
|
-
return new Promise(resolve => {
|
|
78
|
-
queryResolve = resolve;
|
|
79
|
-
});
|
|
80
|
-
}
|
|
81
|
-
/**
|
|
82
|
-
*
|
|
83
|
-
* @param {string} query#
|
|
84
|
-
* Calculate the similarity between the query and the corpus
|
|
85
|
-
* @returns
|
|
86
|
-
*/
|
|
87
|
-
async function getSimilarity(query) {
|
|
88
|
-
worker.postMessage({
|
|
89
|
-
type: "similarity",
|
|
90
|
-
query: query
|
|
91
|
-
});
|
|
92
|
-
return new Promise(resolve => {
|
|
93
|
-
queryResolve = resolve;
|
|
94
|
-
});
|
|
95
|
-
}
|
package/dist/components/worker.js
|
@@ -1,196 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
|
|
3
|
-
var _transformers = require("@xenova/transformers");
|
|
4
|
-
/*****************************************************************************\
|
|
5
|
-
* (c) Copyright 2024 CERN for the benefit of the LHCb Collaboration *
|
|
6
|
-
* *
|
|
7
|
-
* This software is distributed under the terms of the GNU General Public *
|
|
8
|
-
* Licence version 3 (GPL Version 3), copied verbatim in the file "COPYING". *
|
|
9
|
-
* *
|
|
10
|
-
* In applying this licence, CERN does not waive the privileges and immunities *
|
|
11
|
-
* granted to it by virtue of its status as an Intergovernmental Organization *
|
|
12
|
-
* or submit itself to any jurisdiction. *
|
|
13
|
-
\*****************************************************************************/
|
|
14
|
-
|
|
15
|
-
// This is so that we will download the models from huggingface.co
|
|
16
|
-
_transformers.env.allowLocalModels = false;
|
|
17
|
-
_transformers.env.useBrowserCache = false;
|
|
18
|
-
// store the embeddings in a dict
|
|
19
|
-
let query_embedding;
|
|
20
|
-
/**
|
|
21
|
-
* @type {Object.<string, Array>}
|
|
22
|
-
*/
|
|
23
|
-
let embedding_dict = {};
|
|
24
|
-
/**
|
|
25
|
-
* @type {pipeline}
|
|
26
|
-
*/
|
|
27
|
-
let embedder;
|
|
28
|
-
|
|
29
|
-
// these will hold the information to tie variable explanations to TupleTools
|
|
30
|
-
let dataKeys;
|
|
31
|
-
let paths;
|
|
32
|
-
/**
|
|
33
|
-
* @type {Array}
|
|
34
|
-
* */
|
|
35
|
-
let vars;
|
|
36
|
-
/**
|
|
37
|
-
* This function calculates the embedding and stores it in a dict.
|
|
38
|
-
* If the text is already embedded, it will return the embedding from the dict.
|
|
39
|
-
* @param {string} text
|
|
40
|
-
* @param {bool} embedNew
|
|
41
|
-
* @returns embeddingVector
|
|
42
|
-
*/
|
|
43
|
-
async function embed(text) {
|
|
44
|
-
let embedNew = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : true;
|
|
45
|
-
if (text in embedding_dict) {
|
|
46
|
-
return embedding_dict[text];
|
|
47
|
-
}
|
|
48
|
-
const e0 = await embedder(text, {
|
|
49
|
-
pooling: "mean",
|
|
50
|
-
normalize: true
|
|
51
|
-
});
|
|
52
|
-
if (embedNew) {
|
|
53
|
-
embedding_dict[text] = e0.data;
|
|
54
|
-
}
|
|
55
|
-
return e0.data;
|
|
56
|
-
}
|
|
57
|
-
/**
|
|
58
|
-
* This function calculates the cosine similarity between two embeddings.
|
|
59
|
-
* @param {Array} corpus_embedding
|
|
60
|
-
* @param {Array} query_embedding
|
|
61
|
-
* @returns cosine similarity
|
|
62
|
-
* */
|
|
63
|
-
function calcCosSim(corpus_embedding, query_embedding) {
|
|
64
|
-
let dotProduct = 0;
|
|
65
|
-
let queryMag = 0;
|
|
66
|
-
let embMag = 0;
|
|
67
|
-
let loop_length = query_embedding.length;
|
|
68
|
-
// because the embeddings might have different dimensions
|
|
69
|
-
if (query_embedding.length > corpus_embedding.length) {
|
|
70
|
-
loop_length = corpus_embedding.length;
|
|
71
|
-
}
|
|
72
|
-
for (let i = 0; i < loop_length; i++) {
|
|
73
|
-
dotProduct += query_embedding[i] * corpus_embedding[i];
|
|
74
|
-
queryMag += query_embedding[i] * query_embedding[i];
|
|
75
|
-
embMag += corpus_embedding[i] * corpus_embedding[i];
|
|
76
|
-
}
|
|
77
|
-
const sim = dotProduct / (Math.sqrt(queryMag) * Math.sqrt(embMag));
|
|
78
|
-
return sim;
|
|
79
|
-
}
|
|
80
|
-
/**This is a helper function to read the nested json file containing the documentation.
|
|
81
|
-
* @returns [vars, paths, dataKeys]
|
|
82
|
-
*/
|
|
83
|
-
|
|
84
|
-
function readJsonFile(data) {
|
|
85
|
-
// go through nested object
|
|
86
|
-
vars = [];
|
|
87
|
-
paths = [];
|
|
88
|
-
const iterateObject = function (obj) {
|
|
89
|
-
let path = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : "";
|
|
90
|
-
Object.keys(obj).forEach(key => {
|
|
91
|
-
const currentPath = path ? `${path}.${key}` : key;
|
|
92
|
-
if (obj[key] instanceof Object) {
|
|
93
|
-
iterateObject(obj[key], currentPath);
|
|
94
|
-
} else {
|
|
95
|
-
vars.push(obj[key]);
|
|
96
|
-
paths.push(currentPath);
|
|
97
|
-
}
|
|
98
|
-
});
|
|
99
|
-
};
|
|
100
|
-
|
|
101
|
-
// go through data and put it into an array
|
|
102
|
-
const dataArray = Object.values(data);
|
|
103
|
-
dataKeys = Object.keys(data);
|
|
104
|
-
iterateObject(dataArray);
|
|
105
|
-
return [vars, paths, dataKeys];
|
|
106
|
-
}
|
|
107
|
-
|
|
108
|
-
// worker on message definition
|
|
109
|
-
self.onmessage = async event => {
|
|
110
|
-
const message = event.data;
|
|
111
|
-
switch (message.type) {
|
|
112
|
-
case "init":
|
|
113
|
-
// load the model and reset the embedding dict
|
|
114
|
-
embedding_dict = {};
|
|
115
|
-
embedder = await (0, _transformers.pipeline)("feature-extraction", message.model, {
|
|
116
|
-
progress_callback: progress => {
|
|
117
|
-
self.postMessage({
|
|
118
|
-
type: "progress",
|
|
119
|
-
progress: progress
|
|
120
|
-
});
|
|
121
|
-
}
|
|
122
|
-
});
|
|
123
|
-
break;
|
|
124
|
-
case "corpus":
|
|
125
|
-
{
|
|
126
|
-
// embed the corpus or load the embeddings
|
|
127
|
-
readJsonFile(message.kgdoc); // this is so we can later match variables to tupletools
|
|
128
|
-
|
|
129
|
-
Object.keys(message.emb).forEach(key => {
|
|
130
|
-
let embarr = [];
|
|
131
|
-
for (const fl of message.emb[key].split(",")) {
|
|
132
|
-
embarr.push(fl.replace("[", "").replace("]", "").replace("np.float32", "").replace("(", "").replace(")", "").replace(" ", ""));
|
|
133
|
-
}
|
|
134
|
-
embedding_dict[key] = embarr;
|
|
135
|
-
});
|
|
136
|
-
self.postMessage({
|
|
137
|
-
type: "corpus"
|
|
138
|
-
});
|
|
139
|
-
break;
|
|
140
|
-
}
|
|
141
|
-
case "similarity":
|
|
142
|
-
{
|
|
143
|
-
//calculate query embedding and then calculate similarity
|
|
144
|
-
query_embedding = await embed(message.query, false);
|
|
145
|
-
let sim_dict = {};
|
|
146
|
-
Object.keys(embedding_dict).forEach(key => {
|
|
147
|
-
sim_dict[key] = calcCosSim(embedding_dict[key], query_embedding);
|
|
148
|
-
});
|
|
149
|
-
// sort by highest similarity (closest to 1)
|
|
150
|
-
const sortedSimDict = Object.fromEntries(Object.entries(sim_dict).sort((_ref, _ref2) => {
|
|
151
|
-
let [, a] = _ref;
|
|
152
|
-
let [, b] = _ref2;
|
|
153
|
-
return b - a;
|
|
154
|
-
}));
|
|
155
|
-
const top5 = Object.fromEntries(Object.entries(sortedSimDict).slice(0, 20));
|
|
156
|
-
let loki_result = [];
|
|
157
|
-
let ttool_result = [];
|
|
158
|
-
let result = {
|
|
159
|
-
loki: loki_result,
|
|
160
|
-
ttool: ttool_result
|
|
161
|
-
};
|
|
162
|
-
Object.keys(top5).forEach(key => {
|
|
163
|
-
if (key === "basic,charged: : ") {
|
|
164
|
-
return;
|
|
165
|
-
} // This is a workaround until we implement a fix in Analysis (and corresponding DaVinci release)
|
|
166
|
-
let description = key.substring(key.indexOf(":") + 2);
|
|
167
|
-
let variable = key.substring(0, key.indexOf(":") - 1);
|
|
168
|
-
let varpath = paths[vars.indexOf(description)];
|
|
169
|
-
let tupletool = dataKeys[varpath.split(".")[0]];
|
|
170
|
-
if (tupletool === "LoKi_functors") {
|
|
171
|
-
if (top5[key] > 0.86) {
|
|
172
|
-
if (loki_result.length < 5) {
|
|
173
|
-
loki_result.push([description, variable, tupletool]);
|
|
174
|
-
}
|
|
175
|
-
}
|
|
176
|
-
} else {
|
|
177
|
-
if (top5[key] > 0.75) {
|
|
178
|
-
if (ttool_result.length < 5) {
|
|
179
|
-
ttool_result.push([description, variable, tupletool]);
|
|
180
|
-
}
|
|
181
|
-
}
|
|
182
|
-
}
|
|
183
|
-
});
|
|
184
|
-
if (loki_result.length === 0) {
|
|
185
|
-
loki_result.push(["No matches found", "No matches found", "No matches found"]);
|
|
186
|
-
}
|
|
187
|
-
if (ttool_result.length === 0) {
|
|
188
|
-
ttool_result.push(["No matches found", "No matches found", "No matches found"]);
|
|
189
|
-
}
|
|
190
|
-
self.postMessage({
|
|
191
|
-
type: "result",
|
|
192
|
-
result: result
|
|
193
|
-
});
|
|
194
|
-
}
|
|
195
|
-
}
|
|
196
|
-
};
|