@socialgouv/fiches-travail-data-types 4.374.0 → 4.376.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +20 -0
- package/build/checkRefs.d.ts +1 -0
- package/build/checkRefs.js +24 -0
- package/build/email.d.ts +3 -0
- package/build/email.js +12 -0
- package/build/fetch-data/generateHeaders.d.ts +4 -0
- package/build/fetch-data/generateHeaders.js +16 -0
- package/build/fetch-data/index.d.ts +2 -0
- package/build/fetch-data/index.js +66 -0
- package/build/fetch-data/parseDom.d.ts +14 -0
- package/build/fetch-data/parseDom.js +211 -0
- package/build/fetch-data/postProcess.d.ts +1 -0
- package/build/fetch-data/postProcess.js +60 -0
- package/build/fetch-data/referenceExtractor.d.ts +18 -0
- package/build/fetch-data/referenceExtractor.js +239 -0
- package/build/fetch-data/referenceResolver.d.ts +1 -0
- package/build/fetch-data/referenceResolver.js +176 -0
- package/build/fetch-data/scrapUrl.d.ts +14 -0
- package/build/fetch-data/scrapUrl.js +52 -0
- package/build/index.d.ts +3 -0
- package/build/index.js +20 -0
- package/build/types.d.ts +29 -0
- package/build/types.js +2 -0
- package/package.json +17 -40
- package/index.d.ts +0 -32
- package/index.esm.js +0 -7
- package/index.js +0 -11
package/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,23 @@
|
|
|
1
|
+
# [4.376.0](https://github.com/SocialGouv/fiches-travail-data/compare/v4.375.0...v4.376.0) (2022-12-06)
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
### Bug Fixes
|
|
5
|
+
|
|
6
|
+
* **build:** ajout de la gestion build full typescript ✨ ([#386](https://github.com/SocialGouv/fiches-travail-data/issues/386)) ([46e5ade](https://github.com/SocialGouv/fiches-travail-data/commit/46e5ade139b31b3e4ffd220dff1136248cd3aeaa))
|
|
7
|
+
* **picture:** ajout du scénario pour la balise picture + typescript / eslint ([#385](https://github.com/SocialGouv/fiches-travail-data/issues/385)) ([5dce100](https://github.com/SocialGouv/fiches-travail-data/commit/5dce100607fbfc938a96468ac6774e3924df559b))
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
### Features
|
|
11
|
+
|
|
12
|
+
* **data:** 20221206_1018 update ([8ee594e](https://github.com/SocialGouv/fiches-travail-data/commit/8ee594e4e29b30300a501030f37dbeeac83980a8))
|
|
13
|
+
|
|
14
|
+
# [4.375.0](https://github.com/SocialGouv/fiches-travail-data/compare/v4.374.0...v4.375.0) (2022-12-01)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
### Features
|
|
18
|
+
|
|
19
|
+
* **data:** 20221201_2211 update ([9bf8887](https://github.com/SocialGouv/fiches-travail-data/commit/9bf888781d07d7c8d1bdc87af5dc038fa1dcf9b3))
|
|
20
|
+
|
|
1
21
|
# [4.374.0](https://github.com/SocialGouv/fiches-travail-data/compare/v4.373.0...v4.374.0) (2022-11-30)
|
|
2
22
|
|
|
3
23
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Report script: prints, as markdown, every fiche that still carries legal
// references that could not be resolved (stored under the "UNDEFINED" key of a
// section's `references` map).
const fiches = require("../data/fiches-travail.json");
const UNDEFINED_KEY = "UNDEFINED";

// Truthy when the section holds at least one unresolved reference.
const hasUndefinedRef = (section) =>
    section.references && UNDEFINED_KEY in section.references;

// Fiches with at least one section containing unresolved references.
const undefinedReferences = fiches.filter((fiche) => {
    // `filter` rather than `some`: every section without a `references` field
    // must be visited so that it gets logged.
    const refErrors = fiche.sections.filter((section) => {
        if (!section.references) {
            console.log("no refs in " + fiche.title);
        }
        return hasUndefinedRef(section);
    });
    return refErrors.length > 0;
});

// Print one fiche as a markdown heading followed by, for each faulty section,
// its anchor and the de-duplicated texts of the unresolved references.
const printMissingRef = (fiche) => {
    console.log(`#### [${fiche.title}](${fiche.url})`);
    fiche.sections.forEach((section) => {
        if (hasUndefinedRef(section)) {
            console.log(`- ${section.anchor}`);
            const fmt = section.references[UNDEFINED_KEY].articles.map((ref) => ref.text);
            console.log(`> ${Array.from(new Set(fmt)).join(" / ")}`);
        }
    });
};

console.log(`### ${undefinedReferences.length}/${fiches.length} fiches aux références non résolues.`);
// forEach, not map: we only want the printing side effect.
undefinedReferences.forEach((fiche) => printMissingRef(fiche));
|
package/build/email.d.ts
ADDED
package/build/email.js
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.encode = exports.decode = void 0;
|
|
4
|
+
const char = "_";
|
|
5
|
+
/**
 * Obfuscate e-mail addresses by inserting the marker character in front of
 * every "@" of the given string.
 */
function encode(str) {
    const protectedAt = `${char}@`;
    return str.split("@").join(protectedAt);
}
|
|
8
|
+
exports.encode = encode;
|
|
9
|
+
/**
 * Reverse of `encode`: every marker character immediately followed by "@"
 * is collapsed back to a plain "@".
 */
function decode(str) {
    const protectedAt = `${char}@`;
    return str.split(protectedAt).join("@");
}
|
|
12
|
+
exports.decode = decode;
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.generateHeaders = void 0;
|
|
4
|
+
/**
 * Build the headers for a request, adding the `cgtoken` cookie used to bypass
 * the bot protection.
 *
 * @param {Object} [extras] - additional headers to send; a `Cookie` entry in
 *   `extras` is overridden by the token cookie.
 * @returns {Object} the extra headers plus the `Cookie` header.
 * @throws {Error} when the `TOKEN_MT` environment variable is unset or empty.
 */
function generateHeaders(extras) {
    const token = process.env.TOKEN_MT;
    if (!token) {
        throw new Error("Token (cgtoken) is required to fetch the data. This token is provided by the travail-emploi.gouv.fr team.");
    }
    return {
        ...extras,
        Cookie: `cgtoken=${token};`,
    };
}
|
|
16
|
+
exports.generateHeaders = generateHeaders;
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.scrap = exports.fetchFeed = void 0;
|
|
7
|
+
const fs_1 = __importDefault(require("fs"));
|
|
8
|
+
const got_1 = __importDefault(require("got"));
|
|
9
|
+
const p_limit_1 = __importDefault(require("p-limit"));
|
|
10
|
+
const path_1 = __importDefault(require("path"));
|
|
11
|
+
const generateHeaders_1 = require("./generateHeaders");
|
|
12
|
+
const scrapUrl_1 = require("./scrapUrl");
|
|
13
|
+
const FEED_URL = "https://travail-emploi.gouv.fr/?page=oseo_json";
|
|
14
|
+
const limit = (0, p_limit_1.default)(10);
|
|
15
|
+
/**
 * POST to the feed endpoint and return the `fiches` array found in the JSON
 * body of the response.
 */
async function fetchFeed(url) {
    const headers = (0, generateHeaders_1.generateHeaders)({
        "Content-Type": "application/json",
    });
    const { body } = await got_1.default.post(url, { headers, http2: true, retry: 3 });
    const parsed = JSON.parse(body);
    return parsed.fiches;
}
|
|
26
|
+
exports.fetchFeed = fetchFeed;
|
|
27
|
+
/**
 * Scrape every `{ id, url }` entry (concurrency bounded by `limit`) and return
 * the parsed fiches.
 *
 * @param {Array<{id: string, url: string}>} urls - feed entries to scrape.
 * @returns {Promise<Array>} one parsed fiche per entry, in input order.
 * @throws {Error} when at least one page failed, or when several fiches share
 *   the same `pubId`.
 */
async function scrap(urls) {
    const inputs = urls.map(({ id, url }) => limit(() => (0, scrapUrl_1.scrapUrl)(id, url)));
    const results = await Promise.allSettled(inputs);
    const failedPromise = results.filter(({ status }) => status === "rejected");
    if (failedPromise.length > 0) {
        console.error("scrap fail", failedPromise.map(({ reason }) => reason));
        throw new Error("Error - fetching pages fail. Some pages are missing");
    }
    const resolvedPromise = results.flatMap(({ status, value }) => status === "fulfilled" ? [value] : []);
    // Ensure there is no duplicated fiche. Occurrences are counted once up
    // front (fiches without a pubId are ignored) instead of re-scanning the
    // whole list for every fiche.
    const occurrences = new Map();
    for (const { pubId } of resolvedPromise) {
        if (pubId !== undefined) {
            occurrences.set(pubId, (occurrences.get(pubId) ?? 0) + 1);
        }
    }
    let hasDuplicate = false;
    for (const { pubId, url } of resolvedPromise) {
        const count = occurrences.get(pubId) ?? 0;
        if (count > 1) {
            hasDuplicate = true;
            console.error(`[error] la fiche ${url} est présente ${count} fois. Veuillez supprimer le doublon du datafiller`);
        }
    }
    if (hasDuplicate) {
        throw new Error(`[error] fiches en doublons. Veuillez supprimer les doublons du datafiller`);
    }
    return resolvedPromise;
}
|
|
50
|
+
exports.scrap = scrap;
|
|
51
|
+
// CLI entry point: when this file is run directly, fetch the feed, scrape every
// page and write the result to data/fiches-travail.json.
if (module === require.main) {
    const startedAt = Date.now();
    const elapsedSeconds = () => Math.round((Date.now() - startedAt) / 1000);
    const saveFiches = (fiches) => {
        console.log(`done in ${elapsedSeconds()} sec`);
        const dataFilePath = path_1.default.join(__dirname, "..", "..", "data", "fiches-travail.json");
        fs_1.default.mkdirSync(path_1.default.dirname(dataFilePath), { recursive: true });
        fs_1.default.writeFileSync(dataFilePath, JSON.stringify(fiches, null, 2));
    };
    const abort = (error) => {
        console.error(error);
        console.error(`fail in ${elapsedSeconds()} sec`);
        process.exit(1);
    };
    fetchFeed(FEED_URL).then(scrap).then(saveFiches).catch(abort);
}
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.parseDom = void 0;
|
|
7
|
+
const cdtn_slugify_1 = __importDefault(require("@socialgouv/cdtn-slugify"));
|
|
8
|
+
const got_1 = require("got");
|
|
9
|
+
const email_1 = require("../email");
|
|
10
|
+
const postProcess_1 = require("./postProcess");
|
|
11
|
+
const referenceExtractor_1 = require("./referenceExtractor");
|
|
12
|
+
const referenceResolver_1 = require("./referenceResolver");
|
|
13
|
+
// querySelectorAll shortcut returning a real array of the matching elements.
const $$ = function (node, selector) {
    const matches = node.querySelectorAll(selector);
    return Array.from(matches);
};
// querySelector shortcut: first matching element, or whatever querySelector
// returns when nothing matches.
const $ = function (node, selector) {
    return node.querySelector(selector);
};
|
|
15
|
+
/**
 * Decode an obfuscated e-mail payload (as found in `data-cfemail` attributes
 * and `email-protection#...` links) and return it passed through `encode`.
 *
 * The payload is a hex string: the first byte is an XOR key, the remaining
 * bytes are the UTF-8 bytes of the address XOR-ed with that key.
 *
 * @param {string|null} [data] - hex payload; `null`/`undefined`/"" decode to "".
 * @returns {string} the decoded address, with "@" protected by `encode`.
 */
function unwrapEmail(data = "") {
    // getAttribute() yields null (not undefined) for a missing attribute, which
    // the default parameter does not cover.
    const hex = data ?? "";
    const bytes = [];
    for (let i = 0; i + 1 < hex.length; i += 2) {
        bytes.push(parseInt(hex.slice(i, i + 2), 16));
    }
    const [key, ...cipher] = bytes;
    const plain = Uint8Array.from(cipher, (byte) => byte ^ key);
    // UTF-8 decode without the deprecated escape()/decodeURIComponent() trick.
    // Invalid sequences become U+FFFD instead of throwing a URIError, so one
    // broken address cannot abort the parsing of a whole page.
    const decoder = new TextDecoder("utf-8", { ignoreBOM: true });
    return (0, email_1.encode)(decoder.decode(plain));
}
|
|
20
|
+
// Replace the content of an element carrying a `data-cfemail` attribute by the
// decoded address, and drop the attribute.
const formatEmail = (node) => {
    const attribute = "data-cfemail";
    const decoded = unwrapEmail(node.getAttribute(attribute));
    node.removeAttribute(attribute);
    node.textContent = decoded;
};
|
|
25
|
+
/**
 * Handle <picture> elements whose parent starts with a comment embedding an
 * `src="..."` attribute: the comment is removed and an equivalent <source>
 * element (absolute URL, query string stripped) is appended to the picture.
 *
 * @param {Element} node - the <picture> element, modified in place.
 * @returns {Element|undefined} the node when a <source> was added, otherwise
 *   undefined (nothing is modified in that case).
 */
const formatPicture = (node) => {
    const comment = node.parentElement?.childNodes[0];
    if (!comment || comment.nodeName !== "#comment") {
        // the first node of the parent is not a comment: not a case we handle
        return;
    }
    // match() returns null when the comment has no src attribute
    const [, src = ""] = comment.data.match(/src=["']([^'"]*)["']/) ?? [];
    if (src.length === 0) {
        return;
    }
    let [srcClean] = src.split("?");
    if (!srcClean.match(/^https?:\/\//)) {
        if (srcClean.slice(0, 1) !== "/") {
            srcClean = "/" + srcClean;
        }
        srcClean = `https://travail-emploi.gouv.fr${srcClean}`;
    }
    // remove the comment, whose url contains a timestamp
    comment.remove();
    // add a <source> element pointing to the cleaned url
    const sourceNode = node.ownerDocument.createElement("source");
    sourceNode.setAttribute("srcset", srcClean);
    sourceNode.setAttribute("media", "(min-width: 300px)");
    node.appendChild(sourceNode);
    return node;
};
|
|
51
|
+
/**
 * Normalise one <a> element of a scraped page, in place:
 * - empty links are removed;
 * - links containing an image get the "no-after" class;
 * - the `onclick` attribute is dropped;
 * - `javascript:` links are replaced by their text;
 * - "email-protection" links become `mailto:` links;
 * - relative links are made absolute on travail-emploi.gouv.fr and open in a
 *   new tab.
 *
 * @param {Element} node - the anchor element.
 * @returns {void}
 */
const formatAnchor = (node) => {
    if (node.innerHTML.trim() === "") {
        node.remove();
        return;
    }
    if (node.getElementsByTagName("img").length) {
        node.classList.add("no-after");
    }
    let href = node.getAttribute("href");
    // remove ATTAg(...) on pdf link
    node.removeAttribute("onclick");
    if (!href) {
        return;
    }
    // unwrap link with href="javascript:": only this link is swapped for a
    // text node, so the siblings are kept and the text is not re-parsed as
    // HTML (which assigning the parent's innerHTML would do)
    if (/^javascript:/.test(href)) {
        node.replaceWith(node.ownerDocument.createTextNode(node.textContent));
        return;
    }
    if (/email-protection/.test(href)) {
        const [, data = ""] = href.split("#");
        const value = unwrapEmail(data);
        node.setAttribute("href", `mailto:${value}`);
        return;
    }
    if (!href.match(/^https?:\/\//)) {
        if (href.slice(0, 1) !== "/") {
            href = "/" + href;
        }
        node.setAttribute("href", `https://travail-emploi.gouv.fr${href}`);
        node.setAttribute("target", "_blank");
        // rel is a space-separated token list; a comma would corrupt the tokens
        node.setAttribute("rel", "nofollow noopener");
    }
};
|
|
83
|
+
// Unwrap a ".cs_blocs" container: its inner markup is re-inserted right after
// it, then the container itself is detached from its parent.
const flattenCsBlocs = (node) => {
    const innerMarkup = node.innerHTML;
    node.insertAdjacentHTML("afterend", innerMarkup);
    const parent = node.parentNode;
    parent.removeChild(node);
};
|
|
87
|
+
// Pick the heading level used to split the article into sections: the first of
// h3, h4, h5 found as a direct child of ".main-article__texte", or the
// placeholder "sectionTag" when there is none.
const getSectionTag = (article) => {
    const candidates = ["h3", "h4", "h5"];
    const firstUsed = candidates.find((tag) => $$(article, `.main-article__texte > ${tag}`).length > 0);
    return firstUsed || "sectionTag";
};
|
|
93
|
+
// Two-step pipeline: extract the tokens referencing articles from the text,
// then try to resolve the actual article ids using legi-data.
const getReferences = (text) => {
    const { extractReferences } = referenceExtractor_1;
    const { resolveReferences } = referenceResolver_1;
    return resolveReferences(extractReferences(text));
};
|
|
99
|
+
/**
 * Turn the DOM of a scraped page into a fiche object.
 *
 * The <main> element is cleaned in place (links, pictures, protected e-mails,
 * ".cs_blocs" wrappers, images), then split into sections on the heading level
 * returned by `getSectionTag`. Content found before the first heading becomes
 * an untitled section carrying the page title.
 *
 * @param dom - parsed page; `dom.window.document` is read and modified.
 * @param id - publication id, returned as `pubId`.
 * @param url - page url, returned as is.
 * @returns {{date: string, description: string, intro: string, pubId: *, sections: Array, title: string, url: *}}
 * @throws {ParseError} when <main>, the title, the id, the date or description
 *   meta tags, or any section is missing.
 */
function parseDom(dom, id, url) {
    const { document } = dom.window;
    const article = $(document, "main");
    if (!article) {
        throw new got_1.ParseError("no <main>");
    }
    $$(article, "a").forEach(formatAnchor);
    $$(article, "picture").forEach(formatPicture);
    $$(article, "[data-cfemail]").forEach(formatEmail);
    $$(article, ".cs_blocs").forEach(flattenCsBlocs);
    const imgs = $$(article, "img");
    imgs.forEach((node) => {
        // remove adaptImgFix(this) on hero img
        node.removeAttribute("onmousedown");
    });
    imgs.forEach((node) => {
        const src = node.getAttribute("src");
        // images without src or with inline data are left untouched
        if (src === null || src.indexOf("data:image") !== -1) {
            return;
        }
        if (!src.match(/^https?:\/\//)) {
            const path = src.slice(0, 1) !== "/" ? "/" + src : src;
            node.setAttribute("src", `https://travail-emploi.gouv.fr${path}`);
        }
    });
    const titleElement = $(article, "h1") || $(article, "h2");
    if (!titleElement) {
        throw new got_1.ParseError("No <h1> or <h2> element");
    }
    const title = titleElement.textContent.trim();
    if (!id) {
        throw new got_1.ParseError(`No id`);
    }
    const dateRaw = $(document, "meta[property*=modified_time]") ||
        $(document, "meta[property$=published_time]");
    const dateContent = dateRaw ? dateRaw.getAttribute("content") : null;
    if (dateContent === null) {
        throw new got_1.ParseError("No modified_time or published_time meta");
    }
    const [year, month, day] = dateContent.split("-");
    let intro = $(article, ".main-article__chapo") || "";
    intro =
        intro && intro.innerHTML.replace(/\n/g, "").replace(/\s+/g, " ").trim();
    const descriptionMeta = $(document, "meta[name=description]");
    if (!descriptionMeta) {
        throw new got_1.ParseError("No description meta");
    }
    const description = descriptionMeta.getAttribute("content");
    const sections = [];
    const sectionTag = getSectionTag(article);
    // First pass is only to get a potential untitled section at the top of the article
    // This section has neither anchor nor title
    let nextArticleElement = $(article, ".main-article__texte > *");
    const untitledSection = {
        anchor: "",
        html: "",
        text: "",
        title: title,
    };
    let hasUntitledContent = false;
    while (nextArticleElement &&
        nextArticleElement.tagName.toLowerCase() !== sectionTag) {
        if (nextArticleElement.textContent) {
            hasUntitledContent = true;
            untitledSection.html += nextArticleElement.outerHTML
                .replace(/\n+/g, "")
                .replace(/>\s+</g, "><")
                .replace(/\s+/g, " ");
            untitledSection.text +=
                " " + nextArticleElement.textContent.replace(/\s+/g, " ").trim();
        }
        nextArticleElement = nextArticleElement.nextElementSibling;
    }
    if (hasUntitledContent) {
        // trim() returns a new string: the result has to be stored
        untitledSection.text = untitledSection.text.trim();
        untitledSection.description = untitledSection.text.slice(0, 200).trim();
        untitledSection.references = getReferences(untitledSection.text);
        sections.push(untitledSection);
    }
    // Gets all the titled content
    const articleChildren = $$(article, `.main-article__texte > ${sectionTag}`);
    articleChildren.forEach(function (el) {
        if (el.tagName.toLowerCase() === sectionTag) {
            // a section is made of every sibling up to the next heading
            let nextEl = el.nextElementSibling;
            let html = "";
            while (nextEl && nextEl.tagName.toLowerCase() !== sectionTag) {
                html += nextEl.outerHTML;
                nextEl = nextEl.nextElementSibling;
            }
            const section = document.createElement("div");
            section.innerHTML = html;
            const sectionText = section.textContent.replace(/\s+/g, " ").trim();
            sections.push({
                anchor: el.getAttribute("id") || (0, cdtn_slugify_1.default)(el.textContent),
                description: sectionText.slice(0, 200).trim(),
                html: (0, postProcess_1.htmlPostParser)(html.replace(/\n+/g, "").replace(/>\s+</g, "><").replace(/\s+/g, " ")),
                references: getReferences(sectionText),
                text: sectionText,
                title: el.textContent.trim(),
            });
        }
    });
    if (sections.length === 0) {
        throw new got_1.ParseError(`No sections`);
    }
    return {
        date: `${day}/${month}/${year}`,
        description,
        intro,
        pubId: id,
        sections,
        title,
        url,
    };
}
|
|
211
|
+
exports.parseDom = parseDom;
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export declare const htmlPostParser: (html: string) => string;
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || function (mod) {
|
|
19
|
+
if (mod && mod.__esModule) return mod;
|
|
20
|
+
var result = {};
|
|
21
|
+
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
|
|
22
|
+
__setModuleDefault(result, mod);
|
|
23
|
+
return result;
|
|
24
|
+
};
|
|
25
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
26
|
+
exports.htmlPostParser = void 0;
|
|
27
|
+
const cheerio = __importStar(require("cheerio"));
|
|
28
|
+
/**
 * Clean the html of a section:
 * - <style>, <button> and ".oembed-source" elements are removed;
 * - each <picture> is replaced by an <img> whose source comes from the
 *   `src="..."` attributes found in html comments, paired by order of
 *   appearance; a picture without a matching source is replaced by its own
 *   inner html.
 *
 * @param {string} html - section markup (parsed as a fragment).
 * @returns {string} the cleaned markup.
 */
const htmlPostParser = (html) => {
    const $ = cheerio.load(html, null, false);
    const arrImgSrc = [];
    let currentIndex = 0;
    // absolute urls are kept as is, others are joined with a single "/"
    const toAbsoluteUrl = (src) => /^https?:\/\//.test(src)
        ? src
        : `https://travail-emploi.gouv.fr/${src.replace(/^\/+/, "")}`;
    $("style").remove();
    // https://travail-emploi.gouv.fr/le-ministere-en-action/coronavirus-covid-19/questions-reponses-par-theme/article/mesures-de-prevention-dans-l-entreprise-contre-la-covid-19
    $("button").remove();
    $(".oembed-source").remove();
    // https://travail-emploi.gouv.fr/emploi-et-insertion/accompagnement-des-mutations-economiques/activite-partielle-chomage-partiel/article/activite-partielle-chomage-partiel
    $("*")
        .contents()
        .each((_index, node) => {
            // nodeType 8 is a comment node
            if (node.nodeType !== 8) {
                return;
            }
            const [, src] = /src=['"](.*?)['"]/.exec(node.nodeValue) ?? [];
            if (src) {
                arrImgSrc.push(src);
            }
        });
    // The element is taken from the callback arguments: inside an arrow
    // function `this` is not the current <picture>.
    $("picture").replaceWith((_index, picture) => {
        const src = arrImgSrc[currentIndex];
        if (src) {
            currentIndex++;
            return `<img src="${toAbsoluteUrl(src)}" style="width:100%;height:auto;"/>`;
        }
        return $(picture).html() ?? "";
    });
    return $.html();
};
|
|
60
|
+
exports.htmlPostParser = htmlPostParser;
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
export function classifyTokens(tokens: any): any;
|
|
2
|
+
export namespace CODE_SECU {
|
|
3
|
+
const id: string;
|
|
4
|
+
const name: string;
|
|
5
|
+
}
|
|
6
|
+
export namespace CODE_TRAVAIL {
|
|
7
|
+
const id_1: string;
|
|
8
|
+
export { id_1 as id };
|
|
9
|
+
const name_1: string;
|
|
10
|
+
export { name_1 as name };
|
|
11
|
+
}
|
|
12
|
+
export const codesFullNames: {
|
|
13
|
+
[x: string]: {
|
|
14
|
+
id: string;
|
|
15
|
+
name: string;
|
|
16
|
+
};
|
|
17
|
+
};
|
|
18
|
+
export function extractReferences(text: any): any;
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/*
|
|
3
|
+
Extracting references is done in several steps :
|
|
4
|
+
1) classifyTokens : we identify valid article references (start with l/r/d then token of shape like 1234-12) => split text into sequence of tokens and give a label to each token
|
|
5
|
+
2) identifyCodes : we search for the associated code after the ref tokens (valid options are : code du travail / code de la sécurité sociale)
|
|
6
|
+
3) we group those to constitute structured reference of shape :
|
|
7
|
+
{
|
|
8
|
+
"article": "L. 2313-8",
|
|
9
|
+
"code": Object {
|
|
10
|
+
"id": "LEGITEXT000006072050",
|
|
11
|
+
"name": "code du travail",
|
|
12
|
+
},
|
|
13
|
+
}
|
|
14
|
+
*/
|
|
15
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
16
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
17
|
+
};
|
|
18
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
19
|
+
exports.extractReferences = exports.codesFullNames = exports.CODE_TRAVAIL = exports.CODE_SECU = exports.classifyTokens = void 0;
|
|
20
|
+
const treebank_1 = __importDefault(require("talisman/tokenizers/words/treebank"));
|
|
21
|
+
const NEGATIVE = "O";
|
|
22
|
+
const ARTICLE = "B-ART";
|
|
23
|
+
const CODE_PREFIX = "B-COD";
|
|
24
|
+
// code du travail
|
|
25
|
+
const CODE_TRA = CODE_PREFIX + "_TRA";
|
|
26
|
+
// code sécurité sociale
|
|
27
|
+
const CODE_SS = CODE_PREFIX + "_SS";
|
|
28
|
+
// code any other
|
|
29
|
+
const CODE_OTHER = CODE_PREFIX + "_O";
|
|
30
|
+
const UNRECOGNIZED = "unrecognized";
|
|
31
|
+
const CODE_TRAVAIL = {
|
|
32
|
+
id: "LEGITEXT000006072050",
|
|
33
|
+
name: "code du travail",
|
|
34
|
+
};
|
|
35
|
+
exports.CODE_TRAVAIL = CODE_TRAVAIL;
|
|
36
|
+
const CODE_SECU = {
|
|
37
|
+
id: "LEGITEXT000006073189",
|
|
38
|
+
name: "code de la sécurité sociale",
|
|
39
|
+
};
|
|
40
|
+
exports.CODE_SECU = CODE_SECU;
|
|
41
|
+
const codesFullNames = {
|
|
42
|
+
[CODE_SS]: CODE_SECU,
|
|
43
|
+
[CODE_TRA]: CODE_TRAVAIL,
|
|
44
|
+
};
|
|
45
|
+
exports.codesFullNames = codesFullNames;
|
|
46
|
+
// maximum distance between code tokens and corresponding article ref
|
|
47
|
+
const range = 20;
|
|
48
|
+
const articleRegEx = new RegExp("^(\\d{1,4}(-\\d+){0,3})\\b"); // nums 123 123-45 123-45-6 123-45-6-7
|
|
49
|
+
// Match result (or null) of the article-number pattern against the start of
// the token, e.g. "123-45".
function articleMatcher(token) {
    const found = token.match(articleRegEx);
    return found;
}
|
|
52
|
+
const validPrefix = ["l", "r", "d"];
|
|
53
|
+
/**
 * Tell whether a token starts an article reference.
 *
 * @param {string} token
 * @returns {number}
 *   0 if not matching,
 *   1 if matching a prefix only ("L", "L."),
 *   2 if matching a prefix and a valid article number ("L123-12", "L.123-12").
 */
function prefixMatcher(token) {
    const lowToken = token.toLowerCase();
    if (!validPrefix.some((p) => lowToken.startsWith(p))) {
        return 0;
    }
    const residual = lowToken.slice(1);
    // case "L" or "L."
    if (residual === "" || residual === ".") {
        return 1;
    }
    // case "L123-12": the number directly follows the prefix, which also
    // covers one-digit articles such as "L1"
    if (articleMatcher(residual)) {
        return 2;
    }
    // case "L.123-12": one separator character between prefix and number
    if (articleMatcher(residual.slice(1))) {
        return 2;
    }
    // no match
    return 0;
}
|
|
87
|
+
/**
 * Tell whether a token is the range marker "à" (as in "L. 123-1 à L. 123-5").
 *
 * "à" can be written as one precomposed code point (U+00E0) or as "a" followed
 * by a combining grave accent (U+0300). Both render identically, so the token
 * is normalised to NFC rather than compared to look-alike literals.
 * NOTE(review): assumes the two original literals were these two forms — confirm.
 *
 * @param {string} token
 * @returns {boolean}
 */
function infixMatcher(token) {
    return typeof token === "string" && token.normalize("NFC") === "\u00e0";
}
|
|
91
|
+
// Label every token as part of an article reference (ARTICLE) or not
// (NEGATIVE); the returned array is aligned with `tokens`.
function classifyTokens(tokens) {
    // step 1 : score each token
    //   1 = prefix only, 2 = prefix + number, 3 = number only, 4 = infix, 0 = other
    const scores = tokens.map((token) => {
        const prefix = prefixMatcher(token);
        const infix = infixMatcher(token);
        const article = articleMatcher(token);
        if (prefix > 0) {
            return prefix;
        }
        if (article) {
            return 3;
        }
        return infix ? 4 : 0;
    });
    // step 2 : confirm valid sequences. A run of consecutive non-zero scores
    // must start with a prefix (1 or 2); it is a reference only when its last
    // score is greater than 1.
    const labels = [];
    let run = [];
    const closeRun = () => {
        const label = run.length > 0 && run[run.length - 1] > 1 ? ARTICLE : NEGATIVE;
        run.forEach(() => labels.push(label));
        run = [];
    };
    for (const score of scores) {
        const inRun = run.length > 0;
        const continuesRun = inRun && score >= 1;
        const startsRun = !inRun && score > 0 && score < 3;
        if (continuesRun || startsRun) {
            run.push(score);
        }
        else {
            // the current token ends the run (if any) and is itself negative
            closeRun();
            labels.push(NEGATIVE);
        }
    }
    // conclude : label the run still open at the end of the text
    closeRun();
    return labels;
}
|
|
154
|
+
exports.classifyTokens = classifyTokens;
|
|
155
|
+
// Overlay code labels on the token predictions: each "code" token is labelled
// with the code it introduces (sécurité sociale, travail or other); every other
// token keeps its incoming prediction.
function identifyCodes(tokens, predicitions) {
    const codeLabelAt = (position) => {
        // the code name is searched in the 5 tokens starting at "code"
        const phrase = tokens
            .slice(position, position + 5)
            .join(" ")
            .toLowerCase();
        if (phrase.startsWith(codesFullNames[CODE_SS].name)) {
            return CODE_SS;
        }
        if (phrase.startsWith(codesFullNames[CODE_TRA].name)) {
            return CODE_TRA;
        }
        return CODE_OTHER;
    };
    // first pass : mark the "code" tokens (starting a code reference)
    const marked = tokens.map((token, i) => token.toLowerCase() === "code" ? CODE_PREFIX : predicitions[i]);
    // second pass : resolve the marked tokens into entire code references
    return marked.map((label, i) => (label === CODE_PREFIX ? codeLabelAt(i) : label));
}
|
|
183
|
+
// Extract references from free text: tokenize, classify, then group.
// Returns a list of `{ code, text }`, where `text` is the merged run of article
// tokens and `code` the code found after it within `range` tokens (undefined
// when none). References attached to an unrecognized code are dropped.
function extractReferences(text) {
    const tokens = (0, treebank_1.default)(text);
    const predictions = identifyCodes(tokens, classifyTokens(tokens));
    const matches = [];
    tokens.forEach((token, index) => {
        const pred = predictions[index];
        // case article : merge with the previous match when contiguous, else start one
        if (pred === ARTICLE) {
            const last = matches[matches.length - 1];
            if (last && last.index + 1 === index) {
                last.token = `${last.token} ${token}`;
                last.index = index;
            }
            else {
                matches.push({ index, token });
            }
            return;
        }
        // case code : associate it to the code-less articles within range
        if (pred.startsWith(CODE_PREFIX) && matches.length > 0) {
            for (const match of matches) {
                if (!match.code && match.index + range >= index) {
                    match.code = pred in codesFullNames ? codesFullNames[pred] : UNRECOGNIZED;
                }
            }
        }
    });
    // valid cases are no code, or a code different than UNRECOGNIZED
    // (which stands for other codes : rural, education...)
    return matches
        .filter(({ code }) => code !== UNRECOGNIZED)
        .map(({ token, code }) => ({ code, text: token }));
}
|
|
239
|
+
exports.extractReferences = extractReferences;
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export function resolveReferences(refs: any): any;
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/*
|
|
3
|
+
Here we resolve the references :
|
|
4
|
+
Given an article (or a range) and its code (code du travail ou securite sociale), we search for its
|
|
5
|
+
actual id in the legi data corpus.
|
|
6
|
+
*/
|
|
7
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
8
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
9
|
+
};
|
|
10
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
11
|
+
exports.resolveReferences = void 0;
|
|
12
|
+
const unist_util_find_1 = __importDefault(require("unist-util-find"));
|
|
13
|
+
const unist_util_visit_1 = __importDefault(require("unist-util-visit"));
|
|
14
|
+
const referenceExtractor_1 = require("./referenceExtractor");
|
|
15
|
+
const codes = {};
|
|
16
|
+
Object.values(referenceExtractor_1.codesFullNames).forEach(({ id }) => {
|
|
17
|
+
const code = require(`@socialgouv/legi-data/data/${id}.json`);
|
|
18
|
+
codes[id] = code;
|
|
19
|
+
});
|
|
20
|
+
// duplicated in reference Extractor
|
|
21
|
+
const rangeMarkers = ["à", "à"];
|
|
22
|
+
const CODE_UNKNOWN = { id: "UNDEFINED" };
|
|
23
|
+
// shall we use "code du travail" by default ?
|
|
24
|
+
const DEFAULT_CODE = referenceExtractor_1.CODE_TRAVAIL;
|
|
25
|
+
// Article numbers (article.data.num, e.g. "L432-1-1-1") are converted into a
// single comparable integer: every dash-separated part is left-padded to
// PAD_LENGTH digits and the number is completed with zero parts up to MAX_DEPTH.
const PAD_LENGTH = 5; // left pad numbers to X chars
const MAX_DEPTH = 5; // max number of parts, as in L432-1-1-1

/**
 * Left-pads a value with zeros up to PAD_LENGTH characters: 2 -> "00002".
 * Values already PAD_LENGTH characters or longer are returned unchanged.
 *
 * @param {number|string} num value to pad
 * @returns {string} the zero-padded string
 */
const leftPad = (num) => String(num).padStart(PAD_LENGTH, "0");

/**
 * Transforms an article number into a comparable integer.
 *
 * Everything except digits and dashes is dropped, so the letter prefix
 * (L, R, D...) is ignored here and must be compared separately.
 *
 * The padded number has PAD_LENGTH * MAX_DEPTH (25) digits, which is far
 * beyond Number.MAX_SAFE_INTEGER: a BigInt is returned so that the deepest
 * parts still take part in the comparison instead of being rounded away.
 *
 * @param {string} num article number, e.g. "L4733-9"
 * @returns {bigint} value usable with <, <=, >, >= against other asInt results
 */
const asInt = (num) => {
    const parts = num
        .replace(/[^\d-]/g, "")
        .split("-")
        .map(leftPad);
    while (parts.length < MAX_DEPTH) {
        parts.push(leftPad(0));
    }
    return BigInt(parts.join(""));
};
|
|
49
|
+
/**
 * Collects every "article" node of a code tree whose number lies between
 * `start` and `end` (both inclusive).
 *
 * An article is kept only when its first character (the L/R/D prefix) is the
 * same as the first character of both bounds, and its numeric value (see
 * asInt) is within the bounds.
 *
 * @param {object} code unist tree of a code, traversed with unist-util-visit
 * @param {string} start formatted number of the first article, e.g. "L4733-9"
 * @param {string} end formatted number of the last article, e.g. "L4733-11"
 * @returns {object[]} matching article nodes, in traversal order
 */
function getLegiDataRange(code, start, end) {
    // the bounds do not change during the traversal: convert them only once
    const startValue = asInt(start);
    const endValue = asInt(end);
    const startPrefix = start.charAt(0);
    const endPrefix = end.charAt(0);
    const articles = [];
    (0, unist_util_visit_1.default)(code, "article", (node) => {
        const { num } = node.data;
        const prefix = num.charAt(0);
        // cheap prefix check first, numeric conversion only when needed
        if (prefix !== startPrefix || prefix !== endPrefix) {
            return;
        }
        const value = asInt(num);
        if (value >= startValue && value <= endValue) {
            articles.push(node);
        }
    });
    return articles;
}
|
|
64
|
+
/**
 * Normalizes the two raw bounds of a range into formatted article numbers.
 *
 * Handles the special case where the end is relative to the start
 * (e.g. "L. 4733-9 à 11"): when the end is a single, purely numeric part, it
 * replaces the last part of the start ("L4733-9" / "L4733-11").
 * The letter prefix of the start is applied to both bounds.
 *
 * @param {string} startRaw raw text of the first article, e.g. "L. 4733-9"
 * @param {string} endRaw raw text of the last article, e.g. "11" or "L. 4733-11"
 * @returns {[string, string]} formatted [start, end]
 */
function formatStartEnd(startRaw, endRaw) {
    // unicode non-breaking hyphen (U+2011) is treated as a regular dash
    const toParts = (raw) => raw
        .replace(/\u2011/g, "-")
        .split("-")
        .map((part) => part.trim());
    const startParts = toParts(startRaw);
    const endParts = toParts(endRaw);
    const letter = startParts[0].slice(0, 1);
    // first part carries the prefix ("L. 4733"): keep its digits only
    const startNums = [startParts[0].replace(/\D/g, ""), ...startParts.slice(1)];
    let endNums = [endParts[0].replace(/\D/g, ""), ...endParts.slice(1)];
    if (endParts.length === 1 && /^\d+$/.test(endParts[0])) {
        // relative end: reuse every part of the start but the last one
        endNums = [...startNums.slice(0, -1), endNums[0]];
    }
    return [`${letter}${startNums.join("-")}`, `${letter}${endNums.join("-")}`];
}
|
|
84
|
+
/**
 * Expands a range reference (like "L. 4733-9 à 4733-11") into the list of
 * articles implicitly included within the range.
 *
 * The text is split on the first range marker it contains; both bounds are
 * formatted and looked up in the code (range.code, or DEFAULT_CODE when the
 * reference has none). Found articles are returned as { text?, code, fmt },
 * where the original text is kept for the two bounds only.
 *
 * When the range cannot be unraveled, each piece of the text is returned with
 * its code explicitly set to CODE_UNKNOWN, in order to identify range errors.
 */
function unravelRange(range) {
    const marker = rangeMarkers.find((candidate) => range.text.includes(candidate));
    const pieces = range.text.split(marker).map((piece) => piece.trim());
    const code = range.code || DEFAULT_CODE;
    // default in case of error
    const asUnknown = () => pieces.map((text) => ({ code: CODE_UNKNOWN, text }));
    if (pieces.length != 2 || code == CODE_UNKNOWN) {
        return asUnknown();
    }
    // identify starting and ending articles (with the legi data correct format)
    // then do a legi-data lookup
    const [startRaw, endRaw] = pieces;
    const [startFMT, endFMT] = formatStartEnd(startRaw, endRaw);
    const found = getLegiDataRange(codes[code.id], startFMT, endFMT);
    if (found.length === 0) {
        return asUnknown();
    }
    return found.map((node) => {
        const fmt = node.data.num;
        // keep original text for beginning and end
        let text;
        if (startFMT == fmt) {
            text = startRaw;
        }
        else if (endFMT == fmt) {
            text = endRaw;
        }
        return text ? { text, code, fmt } : { code, fmt };
    });
}
|
|
117
|
+
/**
 * Formats a raw article reference into the legi-data number format
 * (e.g. "l. 4733‑9 du" -> "L4733-9").
 *
 * Steps: upper-case, remove every dot and every whitespace character
 * (including non-breaking spaces), remove trailing non-digit characters and
 * replace the unicode non-breaking hyphen (U+2011) with a standard dash.
 *
 * @param {string} article raw article text
 * @returns {string} formatted article number
 */
function formatArticle(article) {
    return article
        .toUpperCase()
        // a string pattern would only replace the first occurrence
        .replace(/\./g, "")
        .replace(/\s/g, "")
        .replace(/\D*$/, "")
        .replace(/\u2011/g, "-");
}
|
|
126
|
+
/**
 * Resolves a single reference into one or several articles.
 *
 * A reference whose text contains a range marker is first expanded with
 * unravelRange. Each resulting item then gets a `fmt` (formatted article
 * number) if it has none, and is looked up in its code (DEFAULT_CODE when the
 * code is missing or unknown). When found, `cid`, `id` and `code` are set on
 * the item; otherwise its code is set to CODE_UNKNOWN.
 *
 * Items are updated in place: for a non-range reference, the returned item is
 * the `ref` object itself.
 *
 * @param {{text: string, code?: object}} ref extracted reference
 * @returns {object[]} resolved items
 */
function resolveReference(ref) {
    const isRange = rangeMarkers.some((marker) => ref.text.includes(marker));
    const toResolve = isRange ? unravelRange(ref) : [ref];
    return toResolve.map((a) => {
        // use default code if not defined (logical OR, not bitwise)
        const code = a.code === CODE_UNKNOWN || a.code == null ? DEFAULT_CODE : a.code;
        if (!a.fmt) {
            a.fmt = formatArticle(a.text);
        }
        if (code && code !== CODE_UNKNOWN) {
            const article = (0, unist_util_find_1.default)(codes[code.id], (node) => node.type === "article" && node.data.num === a.fmt);
            if (article) {
                a.cid = article.data.cid;
                a.id = article.data.id;
                a.code = code;
            }
            else {
                // not found in code
                a.code = CODE_UNKNOWN;
            }
        }
        return a;
    });
}
|
|
151
|
+
/**
 * Resolves a list of references, drops duplicates and groups the result by
 * code id.
 *
 * Two articles are duplicates when the `text` or the `fmt` of one equals the
 * `text` or the `fmt` of an article already kept; the first one wins.
 * Articles without a code are grouped under CODE_UNKNOWN.
 *
 * @param {object[]} refs extracted references
 * @returns {Object<string, {articles: object[], name: string}>} articles
 *          (without their `code` property) grouped by code id
 */
function resolveReferences(refs) {
    const resolvedRefs = refs.flatMap((ref) => resolveReference(ref));
    // every text / fmt already kept, for constant-time duplicate detection
    const seen = new Set();
    const deduplicated = [];
    for (const art of resolvedRefs) {
        if (seen.has(art.fmt) || seen.has(art.text)) {
            // drop duplicated references
            continue;
        }
        deduplicated.push(art);
        // empty / missing values never count as a match
        if (art.text) {
            seen.add(art.text);
        }
        if (art.fmt) {
            seen.add(art.fmt);
        }
    }
    // group by code
    const grouped = {};
    for (const art of deduplicated) {
        const { code, ...rawArticle } = art;
        const parsedCode = code ? code : CODE_UNKNOWN;
        if (!Object.prototype.hasOwnProperty.call(grouped, parsedCode.id)) {
            grouped[parsedCode.id] = { articles: [], name: parsedCode.name };
        }
        grouped[parsedCode.id].articles.push(rawArticle);
    }
    return grouped;
}
|
|
176
|
+
exports.resolveReferences = resolveReferences;
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.scrapUrl = void 0;
|
|
7
|
+
const got_1 = __importDefault(require("got"));
|
|
8
|
+
const jsdom_1 = require("jsdom");
|
|
9
|
+
const generateHeaders_1 = require("./generateHeaders");
|
|
10
|
+
const parseDom_1 = require("./parseDom");
|
|
11
|
+
/**
 * Downloads a page and parses it with parseDom.
 *
 * When the body of the response is itself a redirect notice ("HTTP 30x"),
 * the url found in its first `href="..."` attribute is fetched instead.
 *
 * @param {string} id identifier forwarded to parseDom
 * @param {string} url address of the page to download
 * @returns {Promise<*>} the result of parseDom
 * @throws {Error} on any failure; the error carries the requested `url` and
 *         the original error as `cause`
 */
async function scrapUrl(id, url) {
    const headers = (0, generateHeaders_1.generateHeaders)();
    // a fresh options object for each request
    const requestOptions = () => ({
        followRedirect: true,
        headers,
        http2: true,
        retry: 3,
    });
    try {
        let response = await (0, got_1.default)(url, requestOptions());
        if (/HTTP 30\d/.test(response.body)) {
            // stop at the first closing quote: a greedy match would swallow
            // the following attributes of the tag
            const match = response.body.match(/href="([^"]*)"/);
            if (!match) {
                throw new Error(`Wrong redirectUrl: ${url} => no href found`);
            }
            const [, redirectUrl] = match;
            try {
                response = await (0, got_1.default)(redirectUrl, requestOptions());
            }
            catch (error) {
                const redirectError = new Error(`Wrong redirectUrl: ${url} => ${redirectUrl}`);
                redirectError.cause = error;
                throw redirectError;
            }
        }
        const dom = new jsdom_1.JSDOM(response.body, { url });
        return (0, parseDom_1.parseDom)(dom, id, url);
    }
    catch (error) {
        let err;
        if (error instanceof got_1.default.ParseError) {
            err = new Error(`Parsing Error: ${error.message}`);
        }
        else if (error instanceof got_1.default.HTTPError) {
            err = new Error(`HTTP Error: ${error.response.statusCode} - ${error.options.url.href} - ${error.message}`);
        }
        else {
            err = new Error(error.message);
        }
        // keep the original error (and its stack) reachable for debugging
        err.cause = error;
        err.url = url;
        throw err;
    }
}
|
|
52
|
+
exports.scrapUrl = scrapUrl;
|
package/build/index.d.ts
ADDED
package/build/index.js
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
14
|
+
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
15
|
+
};
|
|
16
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
17
|
+
exports.decode = void 0;
|
|
18
|
+
const email_1 = require("./email");
|
|
19
|
+
Object.defineProperty(exports, "decode", { enumerable: true, get: function () { return email_1.decode; } });
|
|
20
|
+
__exportStar(require("./types"), exports);
|
package/build/types.d.ts
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
export type FicheTravailEmploi = {
|
|
2
|
+
date: string;
|
|
3
|
+
description: string;
|
|
4
|
+
intro: string;
|
|
5
|
+
pubId: string;
|
|
6
|
+
sections: Section[];
|
|
7
|
+
title: string;
|
|
8
|
+
url: string;
|
|
9
|
+
};
|
|
10
|
+
export type Section = {
|
|
11
|
+
anchor: string;
|
|
12
|
+
description: string;
|
|
13
|
+
html: string;
|
|
14
|
+
references: ReferencesMap;
|
|
15
|
+
text: string;
|
|
16
|
+
title: string;
|
|
17
|
+
};
|
|
18
|
+
export type ReferencesMap = {
|
|
19
|
+
[key: string]: {
|
|
20
|
+
name: string;
|
|
21
|
+
articles: ReferenceFTE[];
|
|
22
|
+
};
|
|
23
|
+
};
|
|
24
|
+
export type ReferenceFTE = {
|
|
25
|
+
id: string;
|
|
26
|
+
cid: string;
|
|
27
|
+
fmt: string;
|
|
28
|
+
text: string;
|
|
29
|
+
};
|
package/build/types.js
ADDED
package/package.json
CHANGED
|
@@ -1,16 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@socialgouv/fiches-travail-data-types",
|
|
3
|
-
"version": "4.
|
|
4
|
-
"main": "index.js",
|
|
5
|
-
"module": "index.
|
|
3
|
+
"version": "4.376.0",
|
|
4
|
+
"main": "build/index.js",
|
|
5
|
+
"module": "build/index.js",
|
|
6
6
|
"files": [
|
|
7
|
-
"
|
|
8
|
-
"index.js",
|
|
9
|
-
"index.esm.js",
|
|
10
|
-
"index.d.ts",
|
|
11
|
-
"README.md"
|
|
7
|
+
"build"
|
|
12
8
|
],
|
|
13
|
-
"types": "index.d.ts",
|
|
9
|
+
"types": "build/index.d.ts",
|
|
14
10
|
"repository": {
|
|
15
11
|
"type": "git",
|
|
16
12
|
"url": "https://github.com/SocialGouv/fiches-travail-data.git"
|
|
@@ -21,45 +17,26 @@
|
|
|
21
17
|
"access": "public"
|
|
22
18
|
},
|
|
23
19
|
"scripts": {
|
|
24
|
-
"build": "
|
|
25
|
-
"start": "node
|
|
26
|
-
"checkRefs": "node
|
|
27
|
-
"lint": "eslint src
|
|
20
|
+
"build": "tsc",
|
|
21
|
+
"start": "node build/fetch-data",
|
|
22
|
+
"checkRefs": "node build/checkRefs",
|
|
23
|
+
"lint": "eslint \"./src/**/*.{js,ts}\"",
|
|
28
24
|
"test": "jest"
|
|
29
25
|
},
|
|
30
26
|
"devDependencies": {
|
|
31
27
|
"@babel/core": "^7.16.0",
|
|
32
28
|
"@babel/preset-env": "^7.16.4",
|
|
33
|
-
"@
|
|
34
|
-
"@
|
|
35
|
-
"@
|
|
29
|
+
"@swc/core": "^1.3.21",
|
|
30
|
+
"@swc/jest": "^0.2.23",
|
|
31
|
+
"@typescript-eslint/eslint-plugin": "^5.45.0",
|
|
32
|
+
"@typescript-eslint/parser": "^5.45.0",
|
|
36
33
|
"babel-jest": "^27.4.4",
|
|
37
|
-
"eslint": "^8.
|
|
38
|
-
"
|
|
39
|
-
"got": "^11.8.3",
|
|
34
|
+
"eslint": "^8.28.0",
|
|
35
|
+
"eslint-plugin-jest": "^27.1.6",
|
|
40
36
|
"husky": "^7.0.4",
|
|
41
|
-
"jest": "^
|
|
37
|
+
"jest": "^29.3.1",
|
|
42
38
|
"jsdom": "^17.0.0",
|
|
43
|
-
"npm-run-all": "^4.1.5",
|
|
44
|
-
"p-limit": "^3.1.0",
|
|
45
39
|
"prettier": "^2.5.1",
|
|
46
|
-
"
|
|
47
|
-
"superagent": "^6.1.0",
|
|
48
|
-
"talisman": "^1.1.4",
|
|
49
|
-
"unist-util-find": "^1.0.2",
|
|
50
|
-
"unist-util-visit": "^2.0.3"
|
|
51
|
-
},
|
|
52
|
-
"jest": {
|
|
53
|
-
"roots": [
|
|
54
|
-
"<rootDir>/src"
|
|
55
|
-
],
|
|
56
|
-
"transform": {
|
|
57
|
-
"^.+\\.jsx?$": "babel-jest"
|
|
58
|
-
},
|
|
59
|
-
"testRegex": "(/__tests__/.*|(\\.|/)(test|spec))\\.jsx?$",
|
|
60
|
-
"moduleFileExtensions": [
|
|
61
|
-
"js",
|
|
62
|
-
"json"
|
|
63
|
-
]
|
|
40
|
+
"typescript": "^4.9.3"
|
|
64
41
|
}
|
|
65
42
|
}
|
package/index.d.ts
DELETED
|
@@ -1,32 +0,0 @@
|
|
|
1
|
-
export type FicheTravailEmploi = {
|
|
2
|
-
date: string
|
|
3
|
-
description: string
|
|
4
|
-
intro: string
|
|
5
|
-
pubId: string
|
|
6
|
-
sections: Section[]
|
|
7
|
-
title: string
|
|
8
|
-
url: string
|
|
9
|
-
}
|
|
10
|
-
|
|
11
|
-
export type Section = {
|
|
12
|
-
anchor: string
|
|
13
|
-
description: string
|
|
14
|
-
html: string
|
|
15
|
-
references: ReferencesMap
|
|
16
|
-
text: string
|
|
17
|
-
title: string
|
|
18
|
-
}
|
|
19
|
-
|
|
20
|
-
export type ReferencesMap = {
|
|
21
|
-
[key: string]: {
|
|
22
|
-
name: string
|
|
23
|
-
articles: ReferenceFTE[]
|
|
24
|
-
}
|
|
25
|
-
}
|
|
26
|
-
|
|
27
|
-
export type ReferenceFTE = {
|
|
28
|
-
id: string
|
|
29
|
-
cid: string
|
|
30
|
-
fmt: string
|
|
31
|
-
text: string
|
|
32
|
-
}
|
package/index.esm.js
DELETED