lhcb-ntuple-wizard-test 1.1.7 → 1.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "lhcb-ntuple-wizard-test",
-   "version": "1.1.7",
+   "version": "1.1.8",
    "description": "An application to access large-scale open data from LHCb",
    "url": "https://gitlab.cern.ch/lhcb-dpa/wp6-analysis-preservation-and-open-data/lhcb-ntuple-wizard-frontend/issues",
    "private": false,
@@ -1,95 +0,0 @@
- "use strict";
-
- Object.defineProperty(exports, "__esModule", {
-   value: true
- });
- exports.embedCorpus = embedCorpus;
- exports.getSimilarity = getSimilarity;
- exports.loadModel = loadModel;
- var _transformers = require("@xenova/transformers");
- /*****************************************************************************\
- * (c) Copyright 2024 CERN for the benefit of the LHCb Collaboration           *
- *                                                                             *
- * This software is distributed under the terms of the GNU General Public      *
- * Licence version 3 (GPL Version 3), copied verbatim in the file "COPYING".   *
- *                                                                             *
- * In applying this licence, CERN does not waive the privileges and immunities *
- * granted to it by virtue of its status as an Intergovernmental Organization  *
- * or submit itself to any jurisdiction.                                       *
- \*****************************************************************************/
-
- // Always download the model from huggingface.co instead of using local copies
- _transformers.env.allowLocalModels = false;
- _transformers.env.useBrowserCache = false;
- // Create a new worker
- const worker = new Worker(new URL("./worker.jsx", import.meta.url));
- /**
-  * @type {function}
-  * Promise resolve function for loading the model
-  */
- let loadResolve;
- /**
-  * @type {function}
-  * Promise resolve function for corpus embedding and similarity queries
-  */
- let queryResolve;
- worker.onmessage = function (event) {
-   const message = event.data;
-   switch (message.type) {
-     case "progress":
-       if (message.progress.status === "ready") {
-         loadResolve();
-       }
-       break;
-     case "corpus":
-       // the corpus is embedded
-       queryResolve();
-       break;
-     case "result":
-       queryResolve(message.result);
-       break;
-   }
- };
- /**
-  * Load the model with the provided model name.
-  * @param {string} modelname
-  * @returns {Promise} resolved once the worker reports that the model is ready
-  */
- async function loadModel(modelname) {
-   worker.postMessage({
-     type: "init",
-     model: modelname
-   });
-   return new Promise(resolve => {
-     loadResolve = resolve;
-   });
- }
- /**
-  * Passes the corpus embedding from the backend to the worker.
-  * @returns {Promise} resolved once the worker confirms the corpus is embedded
-  */
- async function embedCorpus(metadata) {
-   worker.postMessage({
-     type: "corpus",
-     kgdoc: metadata.metadata.kgdoc,
-     emb: metadata.metadata.embedding
-   });
-   return new Promise(resolve => {
-     queryResolve = resolve;
-   });
- }
- /**
-  * Calculate the similarity between the query and the corpus.
-  * @param {string} query
-  * @returns {Promise} resolved with the similarity result from the worker
-  */
- async function getSimilarity(query) {
-   worker.postMessage({
-     type: "similarity",
-     query: query
-   });
-   return new Promise(resolve => {
-     queryResolve = resolve;
-   });
- }
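
The hunk above removes the frontend entry point for the semantic variable search: it starts a web worker, forwards each request to it, and resolves the matching promise from worker.onmessage. As a rough usage sketch of the removed API (the import path, model name, and metadata variable are assumptions, not taken from this diff):

    import { loadModel, embedCorpus, getSimilarity } from "./semanticSearch"; // hypothetical path

    await loadModel("Xenova/all-MiniLM-L6-v2"); // assumed model; any @xenova/transformers feature-extraction model would fit
    await embedCorpus(metadata);                // expects metadata.metadata.kgdoc and metadata.metadata.embedding
    const result = await getSimilarity("transverse momentum of the B candidate");
    console.log(result.loki, result.ttool);     // candidate [description, variable, TupleTool] triples

Because loadResolve and queryResolve are single slots, requests must be issued sequentially; a second call before the previous reply arrives would overwrite the pending resolver.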
@@ -1,196 +0,0 @@
- "use strict";
-
- var _transformers = require("@xenova/transformers");
- /*****************************************************************************\
- * (c) Copyright 2024 CERN for the benefit of the LHCb Collaboration           *
- *                                                                             *
- * This software is distributed under the terms of the GNU General Public      *
- * Licence version 3 (GPL Version 3), copied verbatim in the file "COPYING".   *
- *                                                                             *
- * In applying this licence, CERN does not waive the privileges and immunities *
- * granted to it by virtue of its status as an Intergovernmental Organization  *
- * or submit itself to any jurisdiction.                                       *
- \*****************************************************************************/
-
- // This ensures we download the models from huggingface.co
- _transformers.env.allowLocalModels = false;
- _transformers.env.useBrowserCache = false;
- // the query embedding, plus a dict that caches corpus embeddings by text
- let query_embedding;
- /**
-  * @type {Object.<string, Array>}
-  */
- let embedding_dict = {};
- /**
-  * @type {pipeline}
-  */
- let embedder;
-
- // these will hold the information to tie variable explanations to TupleTools
- let dataKeys;
- let paths;
- /**
-  * @type {Array}
-  */
- let vars;
- /**
-  * This function calculates the embedding and stores it in a dict.
-  * If the text is already embedded, it will return the embedding from the dict.
-  * @param {string} text
-  * @param {boolean} embedNew
-  * @returns the embedding vector
-  */
- async function embed(text) {
-   let embedNew = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : true;
-   if (text in embedding_dict) {
-     return embedding_dict[text];
-   }
-   const e0 = await embedder(text, {
-     pooling: "mean",
-     normalize: true
-   });
-   if (embedNew) {
-     embedding_dict[text] = e0.data;
-   }
-   return e0.data;
- }
- /**
-  * This function calculates the cosine similarity between two embeddings.
-  * @param {Array} corpus_embedding
-  * @param {Array} query_embedding
-  * @returns cosine similarity
-  */
- function calcCosSim(corpus_embedding, query_embedding) {
-   let dotProduct = 0;
-   let queryMag = 0;
-   let embMag = 0;
-   // iterate over the shorter vector, because the embeddings might have different dimensions
-   let loop_length = query_embedding.length;
-   if (query_embedding.length > corpus_embedding.length) {
-     loop_length = corpus_embedding.length;
-   }
-   for (let i = 0; i < loop_length; i++) {
-     dotProduct += query_embedding[i] * corpus_embedding[i];
-     queryMag += query_embedding[i] * query_embedding[i];
-     embMag += corpus_embedding[i] * corpus_embedding[i];
-   }
-   const sim = dotProduct / (Math.sqrt(queryMag) * Math.sqrt(embMag));
-   return sim;
- }
- /**
-  * This is a helper function to read the nested JSON object containing the documentation.
-  * @returns [vars, paths, dataKeys]
-  */
- function readJsonFile(data) {
-   // walk the nested object, collecting leaf values and their dotted paths
-   vars = [];
-   paths = [];
-   const iterateObject = function (obj) {
-     let path = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : "";
-     Object.keys(obj).forEach(key => {
-       const currentPath = path ? `${path}.${key}` : key;
-       if (obj[key] instanceof Object) {
-         iterateObject(obj[key], currentPath);
-       } else {
-         vars.push(obj[key]);
-         paths.push(currentPath);
-       }
-     });
-   };
-
-   // go through the data and put it into an array
-   const dataArray = Object.values(data);
-   dataKeys = Object.keys(data);
-   iterateObject(dataArray);
-   return [vars, paths, dataKeys];
- }
-
- // worker onmessage definition
- self.onmessage = async event => {
-   const message = event.data;
-   switch (message.type) {
-     case "init":
-       // load the model and reset the embedding dict
-       embedding_dict = {};
-       embedder = await (0, _transformers.pipeline)("feature-extraction", message.model, {
-         progress_callback: progress => {
-           self.postMessage({
-             type: "progress",
-             progress: progress
-           });
-         }
-       });
-       break;
-     case "corpus":
-       {
-         // load the precomputed embeddings and index the documentation
-         readJsonFile(message.kgdoc); // this is so we can later match variables to TupleTools
-
-         Object.keys(message.emb).forEach(key => {
-           // each embedding arrives as a stringified Python list; strip the
-           // wrapping and keep the numeric strings (coerced during arithmetic)
-           let embarr = [];
-           for (const fl of message.emb[key].split(",")) {
-             embarr.push(fl.replace("[", "").replace("]", "").replace("np.float32", "").replace("(", "").replace(")", "").replace(" ", ""));
-           }
-           embedding_dict[key] = embarr;
-         });
-         self.postMessage({
-           type: "corpus"
-         });
-         break;
-       }
-     case "similarity":
-       {
-         // calculate the query embedding, then its similarity to every corpus entry
-         query_embedding = await embed(message.query, false);
-         let sim_dict = {};
-         Object.keys(embedding_dict).forEach(key => {
-           sim_dict[key] = calcCosSim(embedding_dict[key], query_embedding);
-         });
-         // sort by highest similarity (closest to 1)
-         const sortedSimDict = Object.fromEntries(Object.entries(sim_dict).sort((_ref, _ref2) => {
-           let [, a] = _ref;
-           let [, b] = _ref2;
-           return b - a;
-         }));
-         // keep the 20 most similar entries as candidates
-         const top5 = Object.fromEntries(Object.entries(sortedSimDict).slice(0, 20));
-         let loki_result = [];
-         let ttool_result = [];
-         let result = {
-           loki: loki_result,
-           ttool: ttool_result
-         };
-         Object.keys(top5).forEach(key => {
-           // This is a workaround until we implement a fix in Analysis (and a corresponding DaVinci release)
-           if (key === "basic,charged: : ") {
-             return;
-           }
-           let description = key.substring(key.indexOf(":") + 2);
-           let variable = key.substring(0, key.indexOf(":") - 1);
-           let varpath = paths[vars.indexOf(description)];
-           let tupletool = dataKeys[varpath.split(".")[0]];
-           if (tupletool === "LoKi_functors") {
-             if (top5[key] > 0.86 && loki_result.length < 5) {
-               loki_result.push([description, variable, tupletool]);
-             }
-           } else {
-             if (top5[key] > 0.75 && ttool_result.length < 5) {
-               ttool_result.push([description, variable, tupletool]);
-             }
-           }
-         });
-         if (loki_result.length === 0) {
-           loki_result.push(["No matches found", "No matches found", "No matches found"]);
-         }
-         if (ttool_result.length === 0) {
-           ttool_result.push(["No matches found", "No matches found", "No matches found"]);
-         }
-         self.postMessage({
-           type: "result",
-           result: result
-         });
-       }
-   }
- };
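
The second removed file is evidently the compiled worker that the module above loads from "./worker.jsx". Its message protocol, reconstructed from the two onmessage handlers (a sketch; the payload values are illustrative):

    // main thread -> worker
    worker.postMessage({ type: "init", model: modelName });  // load the feature-extraction pipeline
    worker.postMessage({ type: "corpus", kgdoc, emb });      // documentation plus precomputed embedding strings
    worker.postMessage({ type: "similarity", query: text }); // rank the corpus against a free-text query

    // worker -> main thread
    // { type: "progress", progress } while the model downloads; progress.status === "ready" when done
    // { type: "corpus" }             once the embeddings are parsed into embedding_dict
    // { type: "result", result }     where result = { loki: [...], ttool: [...] }

Each message should be sent only after the reply to the previous one arrives: the handler awaits inside onmessage, so a "similarity" request that lands while "init" is still loading the pipeline would call an undefined embedder.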