@socialgouv/cdtn-elasticsearch 2.45.0 → 2.46.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +10 -0
- package/package.json +2 -5
- package/src/index.d.ts +0 -2
- package/src/mapping/document.mapping.js +1 -4
- package/src/vectorizer/index.js +1 -56
- package/src/vectorizer/index.test.js +1 -41
package/CHANGELOG.md
CHANGED
|
@@ -3,6 +3,16 @@
|
|
|
3
3
|
All notable changes to this project will be documented in this file.
|
|
4
4
|
See [Conventional Commits](https://conventionalcommits.org) for commit guidelines.
|
|
5
5
|
|
|
6
|
+
## [2.46.1](https://github.com/SocialGouv/cdtn-admin/compare/v2.46.0...v2.46.1) (2024-06-27)
|
|
7
|
+
|
|
8
|
+
**Note:** Version bump only for package @socialgouv/cdtn-elasticsearch
|
|
9
|
+
|
|
10
|
+
# [2.46.0](https://github.com/SocialGouv/cdtn-admin/compare/v2.45.0...v2.46.0) (2024-06-24)
|
|
11
|
+
|
|
12
|
+
### Features
|
|
13
|
+
|
|
14
|
+
- **search:** remove usage of NLP to search document ([#1426](https://github.com/SocialGouv/cdtn-admin/issues/1426)) ([4f76e37](https://github.com/SocialGouv/cdtn-admin/commit/4f76e37d9d3ac34a70f041aa5946b72e8cc3fd36))
|
|
15
|
+
|
|
6
16
|
# [2.45.0](https://github.com/SocialGouv/cdtn-admin/compare/v2.44.2...v2.45.0) (2024-06-18)
|
|
7
17
|
|
|
8
18
|
**Note:** Version bump only for package @socialgouv/cdtn-elasticsearch
|
package/package.json
CHANGED
|
@@ -1,15 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@socialgouv/cdtn-elasticsearch",
|
|
3
3
|
"description": "SocialGouv - Code du travail numerique - Infrastructure - Elasticsearch",
|
|
4
|
-
"version": "2.45.0",
|
|
4
|
+
"version": "2.46.1",
|
|
5
5
|
"babel": {
|
|
6
6
|
"plugins": [
|
|
7
7
|
"@babel/plugin-transform-modules-commonjs"
|
|
8
8
|
]
|
|
9
9
|
},
|
|
10
|
-
"dependencies": {
|
|
11
|
-
"got": "^11.8.2"
|
|
12
|
-
},
|
|
13
10
|
"license": "Apache-2.0",
|
|
14
11
|
"main": "src/index.js",
|
|
15
12
|
"publishConfig": {
|
|
@@ -40,5 +37,5 @@
|
|
|
40
37
|
},
|
|
41
38
|
"sideEffects": false,
|
|
42
39
|
"types": "src/index.d.ts",
|
|
43
|
-
"gitHead": "
|
|
40
|
+
"gitHead": "470a848cc73f632ae894c78b2c369fce61fd42c2"
|
|
44
41
|
}
|
package/src/index.d.ts
CHANGED
|
@@ -4,7 +4,5 @@ export const documentMapping: any;
|
|
|
4
4
|
export const DOCUMENTS: string;
|
|
5
5
|
export const indexDocumentsBatched: any;
|
|
6
6
|
export const SUGGESTIONS: string;
|
|
7
|
-
export const vectorizeDocument: any;
|
|
8
7
|
export const version: any;
|
|
9
8
|
export const suggestionMapping: any;
|
|
10
|
-
export const vectorizeQuery: any;
|
package/src/mapping/document.mapping.js
CHANGED
|
@@ -236,14 +236,11 @@ exports.documentMapping = {
|
|
|
236
236
|
type: "text",
|
|
237
237
|
},
|
|
238
238
|
|
|
239
|
-
title_vector: {
|
|
240
|
-
dims: 512,
|
|
241
|
-
type: "dense_vector",
|
|
242
|
-
},
|
|
243
239
|
// The source URL
|
|
244
240
|
url: {
|
|
245
241
|
type: "keyword",
|
|
246
242
|
},
|
|
243
|
+
|
|
247
244
|
// used in prequalifieds
|
|
248
245
|
variants: {
|
|
249
246
|
type: "text",
|
package/src/vectorizer/index.js
CHANGED
|
@@ -1,14 +1,5 @@
|
|
|
1
|
-
// vectorizer is imported by code-du-travail-api which is using CommonJS, and throwing an exception
|
|
2
|
-
// when requiring code-du-travail-data ES module, thus we keep using CommonJS import here
|
|
3
|
-
const got = require("got");
|
|
4
1
|
const { stopwords: semantic_stopwords } = require("../dataset/stop_words");
|
|
5
2
|
|
|
6
|
-
// URL of the TF serve deployment
|
|
7
|
-
const NLP_URL =
|
|
8
|
-
process.env.NLP_URL || "https://serving-ml.fabrique.social.gouv.fr";
|
|
9
|
-
console.log("NLP URL:", NLP_URL);
|
|
10
|
-
const tfServeURL = NLP_URL + "/v1/models/sentqam:predict";
|
|
11
|
-
|
|
12
3
|
function stripAccents(text) {
|
|
13
4
|
// strip accents
|
|
14
5
|
return text.normalize("NFD").replace(/[\u0300-\u036f]/g, "");
|
|
@@ -16,8 +7,6 @@ function stripAccents(text) {
|
|
|
16
7
|
|
|
17
8
|
const stopWords = new Set(semantic_stopwords.map(stripAccents));
|
|
18
9
|
|
|
19
|
-
const cache = new Map();
|
|
20
|
-
|
|
21
10
|
function preprocess(text) {
|
|
22
11
|
const stripped = stripAccents(text);
|
|
23
12
|
|
|
@@ -31,48 +20,4 @@ function preprocess(text) {
|
|
|
31
20
|
return noStopWords.join(" ");
|
|
32
21
|
}
|
|
33
22
|
|
|
34
|
-
async function callTFServe(json) {
|
35
|
-
const response = await got.post(tfServeURL, {
|
|
36
|
-
cache,
|
|
37
|
-
json,
|
|
38
|
-
responseType: "json",
|
|
39
|
-
retry: {
|
|
40
|
-
limit: 15,
|
|
41
|
-
methods: ["POST"],
|
|
42
|
-
},
|
|
43
|
-
});
|
|
44
|
-
return response.body["outputs"];
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
async function vectorizeDocument(title, content) {
|
|
48
|
-
if (title == undefined || title == "") {
|
|
49
|
-
throw new Error("Cannot vectorize document with empty title.");
|
|
50
|
-
}
|
|
51
|
-
|
|
52
|
-
const input = [preprocess(title)];
|
|
53
|
-
const context = content ? [preprocess(content)] : "";
|
|
54
|
-
|
|
55
|
-
const body = {
|
|
56
|
-
inputs: { context, input },
|
|
57
|
-
signature_name: "response_encoder",
|
|
58
|
-
};
|
|
59
|
-
const vectors = await callTFServe(body);
|
|
60
|
-
|
|
61
|
-
return vectors[0];
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
async function vectorizeQuery(query) {
|
|
65
|
-
if (!query) {
|
|
66
|
-
throw new Error("Cannot vectorize empty query.");
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
const inputs = [preprocess(query)];
|
|
70
|
-
const body = {
|
|
71
|
-
inputs,
|
|
72
|
-
signature_name: "question_encoder",
|
|
73
|
-
};
|
|
74
|
-
const vectors = await callTFServe(body);
|
|
75
|
-
return vectors[0];
|
|
76
|
-
}
|
|
77
|
-
|
|
78
|
-
module.exports = { preprocess, vectorizeDocument, vectorizeQuery };
|
|
23
|
+
module.exports = { preprocess };
|
package/src/vectorizer/index.test.js
CHANGED
|
@@ -1,44 +1,4 @@
|
|
|
1
|
-
const { preprocess, vectorizeDocument, vectorizeQuery } = require("./index");
|
|
2
|
-
|
|
3
|
-
const timeout = 10000;
|
|
4
|
-
|
|
5
|
-
test(
|
|
6
|
-
"Should vectorize document",
|
|
7
|
-
async () => {
|
|
8
|
-
const vector1 = await vectorizeDocument("titre", "contenu");
|
|
9
|
-
expect(vector1).toBeDefined();
|
|
10
|
-
// FIXME Should return the same result but don't. See with remi and fabien.
|
|
11
|
-
// expect(vector1).toMatchSnapshot();
|
|
12
|
-
|
|
13
|
-
// preprocessing should make those embeddings equal
|
|
14
|
-
// FIXME Should return the same result but don't. See with remi and fabien.
|
|
15
|
-
// const vector2 = await vectorizeDocument("le titre", "et le contènu");
|
|
16
|
-
// expect(vector2).toEqual(vector1);
|
|
17
|
-
},
|
|
18
|
-
timeout
|
|
19
|
-
);
|
|
20
|
-
|
|
21
|
-
test(
|
|
22
|
-
"Should vectorize query",
|
|
23
|
-
async () => {
|
|
24
|
-
// FIXME Résultat aléatoire, voir pourquoi on n'obtient pas toujours la même réponse
|
|
25
|
-
// const vector1 = await vectorizeQuery("requete");
|
|
26
|
-
// expect(vector1).toMatchSnapshot();
|
|
27
|
-
// const vector2 = await vectorizeQuery("la requête");
|
|
28
|
-
// expect(vector2).toEqual(vector1);
|
|
29
|
-
},
|
|
30
|
-
timeout
|
|
31
|
-
);
|
|
32
|
-
|
|
33
|
-
test(
|
|
34
|
-
"Should fail when no content passed",
|
|
35
|
-
async () => {
|
|
36
|
-
await expect(vectorizeQuery()).rejects.toThrow(
|
|
37
|
-
new Error("Cannot vectorize empty query.")
|
|
38
|
-
);
|
|
39
|
-
},
|
|
40
|
-
timeout
|
|
41
|
-
);
|
|
1
|
+
const { preprocess } = require("./index");
|
|
42
2
|
|
|
43
3
|
test("Should preprocess text", async () => {
|
|
44
4
|
expect(preprocess("à la nôtre")).toEqual("");
|