@socialgouv/fiches-travail-data-types 4.374.0 → 4.376.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,3 +1,23 @@
+ # [4.376.0](https://github.com/SocialGouv/fiches-travail-data/compare/v4.375.0...v4.376.0) (2022-12-06)
+
+
+ ### Bug Fixes
+
+ * **build:** ajout de la gestion build full typescript ✨ ([#386](https://github.com/SocialGouv/fiches-travail-data/issues/386)) ([46e5ade](https://github.com/SocialGouv/fiches-travail-data/commit/46e5ade139b31b3e4ffd220dff1136248cd3aeaa))
+ * **picture:** ajout du scénario pour la balise picture + typescript / eslint ([#385](https://github.com/SocialGouv/fiches-travail-data/issues/385)) ([5dce100](https://github.com/SocialGouv/fiches-travail-data/commit/5dce100607fbfc938a96468ac6774e3924df559b))
+
+
+ ### Features
+
+ * **data:** 20221206_1018 update ([8ee594e](https://github.com/SocialGouv/fiches-travail-data/commit/8ee594e4e29b30300a501030f37dbeeac83980a8))
+
+ # [4.375.0](https://github.com/SocialGouv/fiches-travail-data/compare/v4.374.0...v4.375.0) (2022-12-01)
+
+
+ ### Features
+
+ * **data:** 20221201_2211 update ([9bf8887](https://github.com/SocialGouv/fiches-travail-data/commit/9bf888781d07d7c8d1bdc87af5dc038fa1dcf9b3))
+
  # [4.374.0](https://github.com/SocialGouv/fiches-travail-data/compare/v4.373.0...v4.374.0) (2022-11-30)


@@ -0,0 +1 @@
+ export {};
@@ -0,0 +1,24 @@
+ "use strict";
+ const fiches = require("../data/fiches-travail.json");
+ const UNDEFINED_KEY = "UNDEFINED";
+ const undefinedReferences = fiches.filter((fiche) => {
+ const refErrors = fiche.sections.filter((section) => {
+ if (!section.references) {
+ console.log("no refs in " + fiche.title);
+ }
+ return section.references && UNDEFINED_KEY in section.references;
+ });
+ return refErrors.length > 0;
+ });
+ const printMissingRef = (fiche) => {
+ console.log(`#### [${fiche.title}](${fiche.url})`);
+ fiche.sections.forEach((section) => {
+ if (section.references && UNDEFINED_KEY in section.references) {
+ console.log(`- ${section.anchor}`);
+ const fmt = section.references[UNDEFINED_KEY].articles.map((ref) => ref.text);
+ console.log(`> ${Array.from(new Set(fmt)).join(" / ")}`);
+ }
+ });
+ };
+ console.log(`### ${undefinedReferences.length}/${fiches.length} fiches aux références non résolues.`);
+ undefinedReferences.map((fiche) => printMissingRef(fiche));
@@ -0,0 +1,3 @@
+ declare function encode(str: string): string;
+ declare function decode(str: string): string;
+ export { decode, encode };
package/build/email.js ADDED
@@ -0,0 +1,12 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.encode = exports.decode = void 0;
+ const char = "_";
+ function encode(str) {
+ return str.replace(/@/g, `${char}@`);
+ }
+ exports.encode = encode;
+ function decode(str) {
+ return str.replace(new RegExp(`${char}@`, "g"), "@");
+ }
+ exports.decode = decode;
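
A minimal round-trip sketch of the helpers above (not from the package docs; the deep build/email import path is an assumption — only decode is re-exported from the package root):

// encode prefixes every "@" with "_" so e-mail addresses in the published data stay obfuscated;
// decode reverses it on the consumer side. The address below is invented.
import { decode, encode } from "@socialgouv/fiches-travail-data-types/build/email";

const obfuscated = encode("contact@travail-emploi.gouv.fr"); // "contact_@travail-emploi.gouv.fr"
const restored = decode(obfuscated); // "contact@travail-emploi.gouv.fr"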
@@ -0,0 +1,4 @@
+ /**
+ * Build the header for request with a specific token to bypass bot protection
+ */
+ export function generateHeaders(extras: any): any;
@@ -0,0 +1,16 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.generateHeaders = void 0;
+ /**
+ * Build the header for request with a specific token to bypass bot protection
+ */
+ function generateHeaders(extras) {
+ if (!process.env.TOKEN_MT) {
+ throw Error("Token (cgtoken) is required to fetch the data. This token is provided by the travail-emploi.gouv.fr team.");
+ }
+ return {
+ ...extras,
+ Cookie: `cgtoken=${process.env.TOKEN_MT};`,
+ };
+ }
+ exports.generateHeaders = generateHeaders;
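
A hedged sketch of what the helper above returns; the token value is invented and the relative import mirrors how the sibling fetch-data modules require it:

import { generateHeaders } from "./generateHeaders"; // hypothetical relative import

// Throws unless TOKEN_MT is set in the environment.
process.env.TOKEN_MT = "example-token"; // hypothetical value
const headers = generateHeaders({ "Content-Type": "application/json" });
// => { "Content-Type": "application/json", Cookie: "cgtoken=example-token;" }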
@@ -0,0 +1,2 @@
+ export function fetchFeed(url: any): Promise<any>;
+ export function scrap(urls: any): Promise<any[]>;
@@ -0,0 +1,66 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.scrap = exports.fetchFeed = void 0;
+ const fs_1 = __importDefault(require("fs"));
+ const got_1 = __importDefault(require("got"));
+ const p_limit_1 = __importDefault(require("p-limit"));
+ const path_1 = __importDefault(require("path"));
+ const generateHeaders_1 = require("./generateHeaders");
+ const scrapUrl_1 = require("./scrapUrl");
+ const FEED_URL = "https://travail-emploi.gouv.fr/?page=oseo_json";
+ const limit = (0, p_limit_1.default)(10);
+ async function fetchFeed(url) {
+ const response = await got_1.default.post(url, {
+ headers: (0, generateHeaders_1.generateHeaders)({
+ "Content-Type": "application/json",
+ }),
+ http2: true,
+ retry: 3,
+ });
+ const { fiches: feed } = JSON.parse(response.body);
+ return feed;
+ }
+ exports.fetchFeed = fetchFeed;
+ async function scrap(urls) {
+ const inputs = urls.map(({ id, url }) => limit(() => (0, scrapUrl_1.scrapUrl)(id, url)));
+ const results = await Promise.allSettled(inputs);
+ const failedPromise = results.filter(({ status }) => status === "rejected");
+ if (failedPromise.length > 0) {
+ console.error("scrap fail", failedPromise.map(({ reason }) => reason));
+ throw new Error("Error - fetching pages fail. Some pages are missing");
+ }
+ const resolvedPromise = results.flatMap(({ status, value }) => status === "fulfilled" ? [value] : []);
+ // ensure we do not have duplicate urls
+ let hasDuplicate = false;
+ for (const { pubId, url } of resolvedPromise) {
+ const count = resolvedPromise.filter((fiche) => fiche.pubId === pubId && pubId !== undefined).length;
+ if (count > 1) {
+ hasDuplicate = true;
+ console.error(`[error] la fiche ${url} est présente ${count} fois. Veuillez supprimer le doublon du datafiller`);
+ }
+ }
+ if (hasDuplicate) {
+ throw new Error(`[error] fiches en doublons. Veuillez supprimer les doublons du datafiller`);
+ }
+ return resolvedPromise;
+ }
+ exports.scrap = scrap;
+ if (module === require.main) {
+ const t0 = Date.now();
+ fetchFeed(FEED_URL)
+ .then(scrap)
+ .then((fiches) => {
+ console.log(`done in ${Math.round((Date.now() - t0) / 1000)} sec`);
+ const dataFilePath = path_1.default.join(__dirname, "..", "..", "data", "fiches-travail.json");
+ fs_1.default.mkdirSync(path_1.default.dirname(dataFilePath), { recursive: true });
+ fs_1.default.writeFileSync(dataFilePath, JSON.stringify(fiches, null, 2));
+ })
+ .catch((error) => {
+ console.error(error);
+ console.error(`fail in ${Math.round((Date.now() - t0) / 1000)} sec`);
+ process.exit(1);
+ });
+ }
@@ -0,0 +1,14 @@
+ export function parseDom(dom: any, id: any, url: any): {
+ date: string;
+ description: any;
+ intro: any;
+ pubId: any;
+ sections: {
+ anchor: string;
+ html: string;
+ text: string;
+ title: any;
+ }[];
+ title: any;
+ url: any;
+ };
@@ -0,0 +1,211 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.parseDom = void 0;
+ const cdtn_slugify_1 = __importDefault(require("@socialgouv/cdtn-slugify"));
+ const got_1 = require("got");
+ const email_1 = require("../email");
+ const postProcess_1 = require("./postProcess");
+ const referenceExtractor_1 = require("./referenceExtractor");
+ const referenceResolver_1 = require("./referenceResolver");
+ const $$ = (node, selector) => Array.from(node.querySelectorAll(selector));
+ const $ = (node, selector) => node.querySelector(selector);
+ function unwrapEmail(data = "") {
+ const [k, ...tokens] = Array.from({ length: data.length / 2 }, (_, i) => i * 2).map((val) => parseInt(data.slice(val, val + 2), 16));
+ const rawValue = tokens.map((v) => String.fromCharCode(v ^ k)).join("");
+ return (0, email_1.encode)(decodeURIComponent(escape(rawValue)));
+ }
+ const formatEmail = (node) => {
+ const value = unwrapEmail(node.getAttribute("data-cfemail"));
+ node.removeAttribute("data-cfemail");
+ node.textContent = value;
+ };
+ const formatPicture = (node) => {
+ const comment = node.parentElement.childNodes[0];
+ if (comment.nodeName !== "#comment") {
+ // upper sibling node is not a comment so it's not a case we handle
+ return;
+ }
+ const [, src = ""] = comment.data.match(/src=["']([^'"]*)["']/);
+ if (src.length === 0) {
+ return;
+ }
+ let [srcClean] = src.split("?");
+ if (!srcClean.match(/^https?:\/\//)) {
+ if (srcClean.slice(0, 1) !== "/") {
+ srcClean = "/" + srcClean;
+ }
+ srcClean = `https://travail-emploi.gouv.fr${srcClean}`;
+ }
+ // we remove the IE comment that has a timestamp in the url
+ comment.remove();
+ // we add a <source> element pointing at the cleaned src
+ const sourceNode = node.ownerDocument.createElement("source");
+ sourceNode.setAttribute("srcset", srcClean);
+ sourceNode.setAttribute("media", "(min-width: 300px)");
+ node.appendChild(sourceNode);
+ return node;
+ };
+ const formatAnchor = (node) => {
+ if (node.innerHTML.trim() === "") {
+ node.remove();
+ return;
+ }
+ if (node.getElementsByTagName("img").length) {
+ node.classList.add("no-after");
+ }
+ let href = node.getAttribute("href");
+ // remove ATTAg(...) on pdf link
+ node.removeAttribute("onclick");
+ if (!href)
+ return;
+ // unwrap link with href="javascript:"
+ if (/^javascript:/.test(href)) {
+ node.parentNode.innerHTML = node.textContent;
+ }
+ if (/email-protection/.test(href)) {
+ const [, data = ""] = href.split("#");
+ const value = unwrapEmail(data);
+ node.setAttribute("href", `mailto:${value}`);
+ return;
+ }
+ if (!href.match(/^https?:\/\//)) {
+ if (href.slice(0, 1) !== "/") {
+ href = "/" + href;
+ }
+ node.setAttribute("href", `https://travail-emploi.gouv.fr${href}`);
+ node.setAttribute("target", "_blank");
+ node.setAttribute("rel", "nofollow, noopener");
+ }
+ };
+ const flattenCsBlocs = (node) => {
+ node.insertAdjacentHTML("afterend", node.innerHTML);
+ node.parentNode.removeChild(node);
+ };
+ const getSectionTag = (article) => {
+ const h3 = $$(article, ".main-article__texte > h3").length && "h3";
+ const h4 = $$(article, ".main-article__texte > h4").length && "h4";
+ const h5 = $$(article, ".main-article__texte > h5").length && "h5";
+ return h3 || h4 || h5 || "sectionTag";
+ };
+ const getReferences = (text) => {
+ // first we extract the tokens referencing articles
+ const references = (0, referenceExtractor_1.extractReferences)(text);
+ // then we try to resolve the actual articles ids using legi-data
+ return (0, referenceResolver_1.resolveReferences)(references);
+ };
+ function parseDom(dom, id, url) {
+ const article = $(dom.window.document, "main");
+ if (!article) {
+ throw new got_1.ParseError("no <main>");
+ }
+ $$(article, "a").forEach(formatAnchor);
+ $$(article, "picture").forEach(formatPicture);
+ $$(article, "[data-cfemail]").forEach(formatEmail);
+ $$(article, ".cs_blocs").forEach(flattenCsBlocs);
+ const imgs = $$(article, "img");
+ imgs.forEach((node) => {
+ // remove adaptImgFix(this) on hero img
+ node.removeAttribute("onmousedown");
+ });
+ imgs
+ .filter((node) => node.getAttribute("src").indexOf("data:image") === -1)
+ .forEach((node) => {
+ let src = node.getAttribute("src");
+ if (!src.match(/^https?:\/\//)) {
+ if (src.slice(0, 1) !== "/") {
+ src = "/" + src;
+ }
+ src = `https://travail-emploi.gouv.fr${src}`;
+ node.setAttribute("src", src);
+ }
+ });
+ let titleElement = $(article, "h1");
+ if (!titleElement) {
+ titleElement = $(article, "h2");
+ if (!titleElement) {
+ throw new got_1.ParseError("No <h1> or <h2> element");
+ }
+ }
+ const title = titleElement.textContent.trim();
+ if (!id) {
+ throw new got_1.ParseError(`No id`);
+ }
+ const dateRaw = $(dom.window.document, "meta[property*=modified_time]") ||
+ $(dom.window.document, "meta[property$=published_time]");
+ const [year, month, day] = dateRaw.getAttribute("content").split("-");
+ let intro = $(article, ".main-article__chapo") || "";
+ intro =
+ intro && intro.innerHTML.replace(/\n/g, "").replace(/\s+/g, " ").trim();
+ const description = $(dom.window.document, "meta[name=description]").getAttribute("content");
+ const sections = [];
+ const sectionTag = getSectionTag(article);
+ // First pass is only to get a potential untitled section at the top of the article
+ // This section has neither anchor nor title
+ let nextArticleElement = $(article, ".main-article__texte > *");
+ const untitledSection = {
+ anchor: "",
+ html: "",
+ text: "",
+ title: title,
+ };
+ while (nextArticleElement &&
+ nextArticleElement.tagName.toLowerCase() !== sectionTag) {
+ if (nextArticleElement.textContent) {
+ if (!untitledSection.description) {
+ untitledSection.description = "temp description";
+ }
+ untitledSection.html += nextArticleElement.outerHTML
+ .replace(/\n+/g, "")
+ .replace(/>\s+</g, "><")
+ .replace(/\s+/g, " ");
+ untitledSection.text +=
+ " " + nextArticleElement.textContent.replace(/\s+/g, " ").trim();
+ }
+ nextArticleElement = nextArticleElement.nextElementSibling;
+ }
+ if (untitledSection.description) {
+ untitledSection.text.trim();
+ untitledSection.description = untitledSection.text.slice(0, 200).trim();
+ untitledSection.references = getReferences(untitledSection.text);
+ sections.push(untitledSection);
+ }
+ // Gets all the titled content
+ const articleChildren = $$(article, `.main-article__texte > ${sectionTag}`);
+ articleChildren.forEach(function (el) {
+ if (el.tagName.toLowerCase() === sectionTag) {
+ let nextEl = el.nextElementSibling;
+ let html = "";
+ while (nextEl && nextEl.tagName.toLowerCase() !== sectionTag) {
+ html += nextEl.outerHTML;
+ nextEl = nextEl.nextElementSibling;
+ }
+ const section = dom.window.document.createElement("div");
+ section.innerHTML = html;
+ const sectionText = section.textContent.replace(/\s+/g, " ").trim();
+ sections.push({
+ anchor: el.getAttribute("id") || (0, cdtn_slugify_1.default)(el.textContent),
+ description: sectionText.slice(0, 200).trim(),
+ html: (0, postProcess_1.htmlPostParser)(html.replace(/\n+/g, "").replace(/>\s+</g, "><").replace(/\s+/g, " ")),
+ references: getReferences(sectionText),
+ text: sectionText,
+ title: el.textContent.trim(),
+ });
+ }
+ });
+ if (sections.length === 0) {
+ throw new got_1.ParseError(`No sections`);
+ }
+ return {
+ date: `${day}/${month}/${year}`,
+ description,
+ intro,
+ pubId: id,
+ sections,
+ title,
+ url,
+ };
+ }
+ exports.parseDom = parseDom;
@@ -0,0 +1 @@
+ export declare const htmlPostParser: (html: string) => string;
@@ -0,0 +1,60 @@
+ "use strict";
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+ if (k2 === undefined) k2 = k;
+ var desc = Object.getOwnPropertyDescriptor(m, k);
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+ desc = { enumerable: true, get: function() { return m[k]; } };
+ }
+ Object.defineProperty(o, k2, desc);
+ }) : (function(o, m, k, k2) {
+ if (k2 === undefined) k2 = k;
+ o[k2] = m[k];
+ }));
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
+ }) : function(o, v) {
+ o["default"] = v;
+ });
+ var __importStar = (this && this.__importStar) || function (mod) {
+ if (mod && mod.__esModule) return mod;
+ var result = {};
+ if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
+ __setModuleDefault(result, mod);
+ return result;
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.htmlPostParser = void 0;
+ const cheerio = __importStar(require("cheerio"));
+ const htmlPostParser = (html) => {
+ const $ = cheerio.load(html, null, false);
+ const arrImgSrc = [];
+ let currentIndex = 0;
+ $("style").remove();
+ // https://travail-emploi.gouv.fr/le-ministere-en-action/coronavirus-covid-19/questions-reponses-par-theme/article/mesures-de-prevention-dans-l-entreprise-contre-la-covid-19
+ $("button").remove();
+ $(".oembed-source").remove();
+ // https://travail-emploi.gouv.fr/emploi-et-insertion/accompagnement-des-mutations-economiques/activite-partielle-chomage-partiel/article/activite-partielle-chomage-partiel
+ $("*")
+ .contents()
+ .each(function () {
+ if (this.nodeType === 8) {
+ const regex = /src=['"](.*?)['"]/;
+ const result = regex.exec(this.nodeValue);
+ if (result) {
+ const src = result[0].slice(5, -1);
+ if (src)
+ arrImgSrc.push(src);
+ }
+ }
+ });
+ $("picture").replaceWith(() => {
+ const src = arrImgSrc[currentIndex];
+ if (src) {
+ currentIndex++;
+ return `<img src="https://travail-emploi.gouv.fr/${src}" style="width:100%;height:auto;"/>`;
+ }
+ return $(this).html()?.toString() ?? "";
+ });
+ return $.html();
+ };
+ exports.htmlPostParser = htmlPostParser;
@@ -0,0 +1,18 @@
+ export function classifyTokens(tokens: any): any;
+ export namespace CODE_SECU {
+ const id: string;
+ const name: string;
+ }
+ export namespace CODE_TRAVAIL {
+ const id_1: string;
+ export { id_1 as id };
+ const name_1: string;
+ export { name_1 as name };
+ }
+ export const codesFullNames: {
+ [x: string]: {
+ id: string;
+ name: string;
+ };
+ };
+ export function extractReferences(text: any): any;
@@ -0,0 +1,239 @@
+ "use strict";
+ /*
+ Extracting references is done in several steps :
+ 1) classifyTokens : we identify valid article references (start with l/r/d then token of shape like 1234-12) => split text into sequence of tokens and give a label to each token
+ 2) identifyCode : we search for the associated code after the ref tokens (valid options are : code du travail / code de la sécurité sociale)
+ 3) we group those to constitute structured reference of shape :
+ {
+ "article": "L. 2313-8",
+ "code": Object {
+ "id": "LEGITEXT000006072050",
+ "name": "code du travail",
+ },
+ }
+ */
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.extractReferences = exports.codesFullNames = exports.CODE_TRAVAIL = exports.CODE_SECU = exports.classifyTokens = void 0;
+ const treebank_1 = __importDefault(require("talisman/tokenizers/words/treebank"));
+ const NEGATIVE = "O";
+ const ARTICLE = "B-ART";
+ const CODE_PREFIX = "B-COD";
+ // code du travail
+ const CODE_TRA = CODE_PREFIX + "_TRA";
+ // code sécurité sociale
+ const CODE_SS = CODE_PREFIX + "_SS";
+ // code any other
+ const CODE_OTHER = CODE_PREFIX + "_O";
+ const UNRECOGNIZED = "unrecognized";
+ const CODE_TRAVAIL = {
+ id: "LEGITEXT000006072050",
+ name: "code du travail",
+ };
+ exports.CODE_TRAVAIL = CODE_TRAVAIL;
+ const CODE_SECU = {
+ id: "LEGITEXT000006073189",
+ name: "code de la sécurité sociale",
+ };
+ exports.CODE_SECU = CODE_SECU;
+ const codesFullNames = {
+ [CODE_SS]: CODE_SECU,
+ [CODE_TRA]: CODE_TRAVAIL,
+ };
+ exports.codesFullNames = codesFullNames;
+ // maximum distance between code tokens and corresponding article ref
+ const range = 20;
+ const articleRegEx = new RegExp("^(\\d{1,4}(-\\d+){0,3})\\b"); // nums 123 123-45 123-45-6 123-45-6-7
+ function articleMatcher(token) {
+ return token.match(articleRegEx);
+ }
+ const validPrefix = ["l", "r", "d"];
+ // returns :
+ // 0 if not matching
+ // 1 if matching prefix only (L.)
+ // 2 if matching prefix and valid ref (L123.12)
+ function prefixMatcher(token) {
+ const lowToken = token.toLowerCase();
+ // if starts with possible prefix
+ const matchingPrefix = validPrefix.filter((p) => lowToken.startsWith(p)).length > 0;
+ if (matchingPrefix) {
+ const residual = lowToken.slice(1);
+ // case only L
+ if (!residual.length) {
+ return 1;
+ }
+ else {
+ // case L.
+ if (residual == ".") {
+ return 1;
+ }
+ else {
+ // case L.123-12
+ if (residual.slice(0, 1) == "." && articleMatcher(residual.slice(1))) {
+ return 2;
+ }
+ // case L123-12
+ else if (articleMatcher(residual.slice(1))) {
+ return 2;
+ }
+ }
+ }
+ }
+ // no match
+ return 0;
+ }
+ function infixMatcher(token) {
+ // this is quite subtle...
+ return ["à", "à"].includes(token);
+ }
+ // classify sequence of tokens to identify references to articles
+ function classifyTokens(tokens) {
+ // step 1 : check for prefix matches or articles
+ const step1 = tokens.map((token) => {
+ const prefix = prefixMatcher(token);
+ const infix = infixMatcher(token);
+ const article = articleMatcher(token);
+ if (prefix > 0) {
+ return prefix;
+ }
+ else if (article) {
+ return 3;
+ }
+ else if (infix) {
+ return 4;
+ }
+ else {
+ return 0;
+ }
+ });
+ // step 2 : confirm valid sequences
+ // hack : we keep a buffer as last element of the accumulator
+ const predictions = step1.reduce((acc, e) => {
+ const buffer = acc[acc.length - 1];
+ const inSequence = buffer.length > 0;
+ const lastElement = buffer[buffer.length - 1];
+ // case continue existing
+ if (e >= 1 && inSequence) {
+ buffer.push(e);
+ }
+ // case finish existing
+ else if (e == 0 && inSequence && lastElement > 1) {
+ acc.pop();
+ // push buffer
+ buffer.forEach(() => acc.push(true));
+ // push for current
+ acc.push(false);
+ acc.push([]);
+ }
+ // case start (valid start are 1 or 2, as 3 is number only without prefix)
+ else if (e > 0 && e < 3 && !inSequence) {
+ buffer.push(e);
+ }
+ // other cases, flush buffer and append current
+ else {
+ acc.pop();
+ acc.push(...buffer.map(() => false));
+ acc.push(false);
+ acc.push([]);
+ }
+ return acc;
+ }, [[]]);
+ // conclude
+ const residual = predictions.pop();
+ // if ends with bigger than 1, then add residual as true
+ if (residual.length > 0 && residual[residual.length - 1] > 1) {
+ predictions.push(...residual.map(() => true));
+ }
+ else {
+ predictions.push(...residual.map(() => false));
+ }
+ return predictions.map((p) => (p ? ARTICLE : NEGATIVE));
+ }
+ exports.classifyTokens = classifyTokens;
+ function identifyCodes(tokens, predicitions) {
+ // we look for "code" tokens (starting a code reference)
+ const matchCode = tokens.map((token, i) => {
+ return token.toLowerCase() == "code" ? CODE_PREFIX : predicitions[i];
+ });
+ // we search for entire code references
+ const resolvedCodePreds = matchCode.map((pred, i) => {
+ if (pred == CODE_PREFIX) {
+ const joinedNextTokens = tokens
+ .slice(i, i + 5)
+ .join(" ")
+ .toLowerCase();
+ if (joinedNextTokens.startsWith(codesFullNames[CODE_SS].name)) {
+ return CODE_SS;
+ }
+ else if (joinedNextTokens.startsWith(codesFullNames[CODE_TRA].name)) {
+ return CODE_TRA;
+ }
+ else {
+ return CODE_OTHER;
+ }
+ }
+ else {
+ return pred;
+ }
+ });
+ return resolvedCodePreds;
+ }
+ // extract references from free text : tokenize and classify
+ function extractReferences(text) {
+ const tokens = (0, treebank_1.default)(text);
+ let predictions = classifyTokens(tokens);
+ predictions = identifyCodes(tokens, predictions);
+ // console.log(tokens);
+ // console.log(predictions);
+ // group continuous positives tokens and set code
+ // while continuous match, merge
+ // if code, then associate it to articles within range
+ return tokens
+ .map((token, index) => {
+ return { index, pred: predictions[index], token };
+ })
+ .reduce((acc, { token, index, pred }) => {
+ // case article : we start or merge
+ if (pred == ARTICLE) {
+ if (acc.length == 0) {
+ acc.push({ index, token });
+ }
+ else {
+ const last = acc[acc.length - 1];
+ // case continuous : we merge
+ if (last.index + 1 == index) {
+ last.token = `${last.token} ${token}`;
+ last.index = index;
+ }
+ else {
+ acc.push({ index, token });
+ }
+ }
+ }
+ // case code, we associate it to articles within range
+ else if (pred.startsWith(CODE_PREFIX) && acc.length > 0) {
+ acc.forEach((match) => {
+ // if no code yet and in range
+ if (!match.code && match.index + range >= index) {
+ if (pred in codesFullNames) {
+ match.code = codesFullNames[pred];
+ }
+ else {
+ match.code = UNRECOGNIZED;
+ }
+ }
+ });
+ }
+ return acc;
+ }, [])
+ .filter(({ code }) => {
+ // valid cases are no code or code different than UNRECOGNIZED (for other codes : rural, education...)
+ return !code || (code && code != UNRECOGNIZED);
+ })
+ .map(({ token, code }) => {
+ return { code, text: token };
+ });
+ }
+ exports.extractReferences = extractReferences;
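
A hedged sketch of what extractReferences produces (the sentence is invented and the exact tokenisation depends on talisman's treebank tokenizer):

import { extractReferences } from "./referenceExtractor"; // relative path as used by parseDom.js above

const refs = extractReferences(
  "Conformément à l'article L. 3121-1 du code du travail, la durée du travail effectif est définie..."
);
// refs ≈ [{ code: { id: "LEGITEXT000006072050", name: "code du travail" }, text: "L. 3121-1" }]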
@@ -0,0 +1 @@
+ export function resolveReferences(refs: any): any;
@@ -0,0 +1,176 @@
+ "use strict";
+ /*
+ Here we resolve the references :
+ Given an article (or a range) and its code (code du travail ou sécurité sociale), we search for its
+ actual id in the legi data corpus.
+ */
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.resolveReferences = void 0;
+ const unist_util_find_1 = __importDefault(require("unist-util-find"));
+ const unist_util_visit_1 = __importDefault(require("unist-util-visit"));
+ const referenceExtractor_1 = require("./referenceExtractor");
+ const codes = {};
+ Object.values(referenceExtractor_1.codesFullNames).forEach(({ id }) => {
+ const code = require(`@socialgouv/legi-data/data/${id}.json`);
+ codes[id] = code;
+ });
+ // duplicated in referenceExtractor
+ const rangeMarkers = ["à", "à"];
+ const CODE_UNKNOWN = { id: "UNDEFINED" };
+ // shall we use "code du travail" by default ?
+ const DEFAULT_CODE = referenceExtractor_1.CODE_TRAVAIL;
+ // dumb convert article.data.num as integer for comparison
+ // each part up to MAX_DEPTH is padded with PAD_LENGTH
+ const PAD_LENGTH = 5; // left pad numbers to X chars
+ const MAX_DEPTH = 5; // max number of L432-1-1-1
+ // padding numbers : 2 -> "0002"
+ const leftPad = (num) => {
+ let padded = "" + num;
+ while (padded.length < PAD_LENGTH) {
+ padded = "0" + padded;
+ }
+ return padded;
+ };
+ // transform articles into comparable integers
+ const asInt = (num) => {
+ const parts = num
+ .replace(/[^\d-]/g, "")
+ .split("-")
+ .map(leftPad);
+ while (parts.length < MAX_DEPTH) {
+ parts.push(leftPad(0));
+ }
+ const int = parseInt(parts.join(""));
+ return int;
+ };
+ function getLegiDataRange(code, start, end) {
+ // check if num is numerically after start. also check LRD prefix
+ const isAfterStart = (node) => asInt(node.data.num) >= asInt(start) &&
+ node.data.num.charAt(0) === start.charAt(0);
+ // check if num is numerically before end. also check LRD prefix
+ const isBeforeEnd = (node) => asInt(node.data.num) <= asInt(end) &&
+ node.data.num.charAt(0) === end.charAt(0);
+ const articles = [];
+ (0, unist_util_visit_1.default)(code, "article", (node) => {
+ if (isAfterStart(node) && isBeforeEnd(node)) {
+ articles.push(node);
+ }
+ });
+ return articles;
+ }
+ function formatStartEnd(startRaw, endRaw) {
+ // we need to identify special case where end ref is relative to start ref (e.g. L. 4733-9 à 11)
+ // if there's nothing in common between end and start, we consider being in this special case
+ const [startParts, endParts] = [startRaw, endRaw].map((a) => a
+ .replace(/\u2011/g, "-")
+ .replace()
+ .split("-")
+ .map((p) => p.trim()));
+ const letter = startParts[0].slice(0, 1);
+ const startNums = Array.from(startParts);
+ startNums[0] = startNums[0].replace(/\D/g, "");
+ let endNums = Array.from(endParts);
+ endNums[0] = endNums[0].replace(/\D/g, "");
+ if (endNums.length == 1 && /^\d+$/.test(endParts[0])) {
+ const endRange = endNums[0];
+ endNums = Array.from(startNums.slice(0, -1));
+ endNums.push(endRange);
+ }
+ return [letter + startNums.join("-"), letter + endNums.join("-")];
+ }
+ // in case of a range (like "L. 4733-9 à 4733-11"), we try to identify
+ // the articles implicitly included within the range
+ function unravelRange(range) {
+ const mark = rangeMarkers.filter((a) => range.text.includes(a))[0];
+ const rawParts = range.text.split(mark).map((p) => p.trim());
+ const code = range.code ? range.code : DEFAULT_CODE;
+ if (rawParts.length == 2 && code != CODE_UNKNOWN) {
+ // objective is to identify starting and ending articles (with the legi data correct format)
+ // then we can do a legi-data lookup
+ const [startRaw, endRaw] = rawParts;
+ const [startFMT, endFMT] = formatStartEnd(startRaw, endRaw);
+ const unraveled = getLegiDataRange(codes[code.id], startFMT, endFMT).map((a) => {
+ const fmt = a.data.num;
+ // keep original text for beginning and end
+ let text;
+ if (startFMT == fmt) {
+ text = startRaw;
+ }
+ else if (endFMT == fmt) {
+ text = endRaw;
+ }
+ return { ...(text && { text }), code, fmt };
+ });
+ if (unraveled.length > 0) {
+ return unraveled;
+ }
+ }
+ // default in case of error, note that we explicitly set code to unknown
+ // in order to identify range errors
+ return range.text.split(mark).map((a) => {
+ return { code: CODE_UNKNOWN, text: a.trim() };
+ });
+ }
+ function formatArticle(article) {
+ // remove dot and spaces + remove non digit trailing chars + replace unicode dash ‑ to standard -
+ return article
+ .toUpperCase()
+ .replace(".", "")
+ .replace(" ", "")
+ .replace(/\D*$/, "")
+ .replace(/\u2011/g, "-");
+ }
+ function resolveReference(ref) {
+ let toResolve = [ref];
+ if (rangeMarkers.filter((a) => ref.text.includes(a)).length != 0) {
+ toResolve = unravelRange(ref);
+ }
+ return toResolve.map((a) => {
+ // use default code if none is defined
+ const code = (a.code == CODE_UNKNOWN) | (a.code == undefined) ? DEFAULT_CODE : a.code;
+ if (!a.fmt)
+ a.fmt = formatArticle(a.text);
+ if (code && code != CODE_UNKNOWN) {
+ const article = (0, unist_util_find_1.default)(codes[code.id], (node) => node.type === "article" && node.data.num === a.fmt);
+ if (article) {
+ a.cid = article.data.cid;
+ a.id = article.data.id;
+ a.code = code;
+ }
+ else {
+ // not found in code
+ a.code = CODE_UNKNOWN;
+ }
+ }
+ return a;
+ });
+ }
+ function resolveReferences(refs) {
+ const resolvedRefs = refs.map((ref) => resolveReference(ref)).flat();
+ const deduplicated = resolvedRefs.reduce((acc, art) => {
+ // drop duplicated references
+ const existing = acc
+ .map((a) => [a.text, a.fmt])
+ .flat()
+ .filter((v) => v);
+ if (!(existing.includes(art.fmt) || existing.includes(art.text))) {
+ acc.push(art);
+ }
+ return acc;
+ }, []);
+ // group by code
+ const grouped = deduplicated.reduce((acc, art) => {
+ const { code, ...rawArticle } = art;
+ const parsedCode = code ? code : CODE_UNKNOWN;
+ if (!Object.keys(acc).includes(parsedCode.id)) {
+ acc[parsedCode.id] = { articles: [], name: parsedCode.name };
+ }
+ acc[parsedCode.id].articles.push(rawArticle);
+ return acc;
+ }, {});
+ return grouped;
+ }
+ exports.resolveReferences = resolveReferences;
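
A hedged sketch of the grouped shape returned above, continuing the extractReferences example (ids are placeholders; real values come from the @socialgouv/legi-data corpus):

import { resolveReferences } from "./referenceResolver"; // relative path as used by parseDom.js above

const grouped = resolveReferences(refs);
// grouped ≈ {
//   LEGITEXT000006072050: {
//     name: "code du travail",
//     articles: [{ text: "L. 3121-1", fmt: "L3121-1", id: "LEGIARTI…", cid: "LEGIARTI…" }],
//   },
// }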
@@ -0,0 +1,14 @@
+ export function scrapUrl(id: any, url: any): Promise<{
+ date: string;
+ description: any;
+ intro: any;
+ pubId: any;
+ sections: {
+ anchor: string;
+ html: string;
+ text: string;
+ title: any;
+ }[];
+ title: any;
+ url: any;
+ }>;
@@ -0,0 +1,52 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.scrapUrl = void 0;
+ const got_1 = __importDefault(require("got"));
+ const jsdom_1 = require("jsdom");
+ const generateHeaders_1 = require("./generateHeaders");
+ const parseDom_1 = require("./parseDom");
+ async function scrapUrl(id, url) {
+ const headers = (0, generateHeaders_1.generateHeaders)();
+ try {
+ let response = await (0, got_1.default)(url, {
+ followRedirect: true,
+ headers,
+ http2: true,
+ retry: 3,
+ });
+ if (/HTTP 30\d/.test(response.body)) {
+ const [, redirectUrl] = response.body.match(/href="(.*)"/);
+ try {
+ response = await (0, got_1.default)(redirectUrl, {
+ followRedirect: true,
+ headers,
+ http2: true,
+ retry: 3,
+ });
+ }
+ catch (error) {
+ throw new Error(`Wrong redirectUrl: ${url} => ${redirectUrl}`);
+ }
+ }
+ const dom = new jsdom_1.JSDOM(response.body, { url });
+ return (0, parseDom_1.parseDom)(dom, id, url);
+ }
+ catch (error) {
+ let err;
+ if (error instanceof got_1.default.ParseError) {
+ err = new Error(`Parsing Error: ${error.message}`);
+ }
+ else if (error instanceof got_1.default.HTTPError) {
+ err = new Error(`HTTP Error: ${error.response.statusCode} - ${error.options.url.href} - ${error.message}`);
+ }
+ else {
+ err = new Error(error.message);
+ }
+ err.url = url;
+ throw err;
+ }
+ }
+ exports.scrapUrl = scrapUrl;
@@ -0,0 +1,3 @@
+ import { decode } from "./email";
+ export { decode };
+ export * from "./types";
package/build/index.js ADDED
@@ -0,0 +1,20 @@
+ "use strict";
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+ if (k2 === undefined) k2 = k;
+ var desc = Object.getOwnPropertyDescriptor(m, k);
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+ desc = { enumerable: true, get: function() { return m[k]; } };
+ }
+ Object.defineProperty(o, k2, desc);
+ }) : (function(o, m, k, k2) {
+ if (k2 === undefined) k2 = k;
+ o[k2] = m[k];
+ }));
+ var __exportStar = (this && this.__exportStar) || function(m, exports) {
+ for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.decode = void 0;
+ const email_1 = require("./email");
+ Object.defineProperty(exports, "decode", { enumerable: true, get: function () { return email_1.decode; } });
+ __exportStar(require("./types"), exports);
@@ -0,0 +1,29 @@
+ export type FicheTravailEmploi = {
+ date: string;
+ description: string;
+ intro: string;
+ pubId: string;
+ sections: Section[];
+ title: string;
+ url: string;
+ };
+ export type Section = {
+ anchor: string;
+ description: string;
+ html: string;
+ references: ReferencesMap;
+ text: string;
+ title: string;
+ };
+ export type ReferencesMap = {
+ [key: string]: {
+ name: string;
+ articles: ReferenceFTE[];
+ };
+ };
+ export type ReferenceFTE = {
+ id: string;
+ cid: string;
+ fmt: string;
+ text: string;
+ };
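
A hand-written example of the shape these declarations describe (all values are invented; real entries live in the repository's data/fiches-travail.json, and parseDom formats dates as dd/mm/yyyy):

import type { FicheTravailEmploi } from "@socialgouv/fiches-travail-data-types";

const exemple: FicheTravailEmploi = {
  date: "06/12/2022",
  description: "Exemple de description",
  intro: "<p>Exemple d'intro</p>",
  pubId: "article-000000", // placeholder id
  sections: [
    {
      anchor: "exemple-de-section",
      description: "Début du texte de la section",
      html: "<p>Début du texte de la section</p>",
      references: {
        LEGITEXT000006072050: {
          name: "code du travail",
          articles: [{ id: "LEGIARTI…", cid: "LEGIARTI…", fmt: "L3121-1", text: "L. 3121-1" }], // placeholder ids
        },
      },
      text: "Début du texte de la section",
      title: "Exemple de section",
    },
  ],
  title: "Exemple de fiche",
  url: "https://travail-emploi.gouv.fr/exemple", // placeholder URL
};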
package/build/types.js ADDED
@@ -0,0 +1,2 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
package/package.json CHANGED
@@ -1,16 +1,12 @@
  {
  "name": "@socialgouv/fiches-travail-data-types",
- "version": "4.374.0",
- "main": "index.js",
- "module": "index.esm.js",
+ "version": "4.376.0",
+ "main": "build/index.js",
+ "module": "build/index.js",
  "files": [
- "data",
- "index.js",
- "index.esm.js",
- "index.d.ts",
- "README.md"
+ "build"
  ],
- "types": "index.d.ts",
+ "types": "build/index.d.ts",
  "repository": {
  "type": "git",
  "url": "https://github.com/SocialGouv/fiches-travail-data.git"
@@ -21,45 +17,26 @@
  "access": "public"
  },
  "scripts": {
- "build": "rollup -c",
- "start": "node -r esm src/fetch-data",
- "checkRefs": "node -r esm src/checkRefs",
- "lint": "eslint src --ext .js",
+ "build": "tsc",
+ "start": "node build/fetch-data",
+ "checkRefs": "node build/checkRefs",
+ "lint": "eslint \"./src/**/*.{js,ts}\"",
  "test": "jest"
  },
  "devDependencies": {
  "@babel/core": "^7.16.0",
  "@babel/preset-env": "^7.16.4",
- "@socialgouv/cdtn-slugify": "^4.60.3",
- "@socialgouv/eslint-config-recommended": "^1.100.0",
- "@socialgouv/legi-data": "^2.100.0",
+ "@swc/core": "^1.3.21",
+ "@swc/jest": "^0.2.23",
+ "@typescript-eslint/eslint-plugin": "^5.45.0",
+ "@typescript-eslint/parser": "^5.45.0",
  "babel-jest": "^27.4.4",
- "eslint": "^8.0.0",
- "esm": "^3.2.25",
- "got": "^11.8.3",
+ "eslint": "^8.28.0",
+ "eslint-plugin-jest": "^27.1.6",
  "husky": "^7.0.4",
- "jest": "^27.4.4",
+ "jest": "^29.3.1",
  "jsdom": "^17.0.0",
- "npm-run-all": "^4.1.5",
- "p-limit": "^3.1.0",
  "prettier": "^2.5.1",
- "rollup": "^2.60.2",
- "superagent": "^6.1.0",
- "talisman": "^1.1.4",
- "unist-util-find": "^1.0.2",
- "unist-util-visit": "^2.0.3"
- },
- "jest": {
- "roots": [
- "<rootDir>/src"
- ],
- "transform": {
- "^.+\\.jsx?$": "babel-jest"
- },
- "testRegex": "(/__tests__/.*|(\\.|/)(test|spec))\\.jsx?$",
- "moduleFileExtensions": [
- "js",
- "json"
- ]
+ "typescript": "^4.9.3"
  }
  }
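
The entry points move from the hand-written index.js / index.esm.js to the compiled build/ output, but the public surface stays the same: decode plus the type declarations. A hedged consumer-side sketch (the fiche value is a stand-in):

import { decode, type FicheTravailEmploi } from "@socialgouv/fiches-travail-data-types";

declare const fiche: FicheTravailEmploi; // assume a fiche loaded from the published data
const plainIntro = decode(fiche.intro); // restores any obfuscated e-mail addresses ("_@" -> "@")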
package/index.d.ts DELETED
@@ -1,32 +0,0 @@
- export type FicheTravailEmploi = {
- date: string
- description: string
- intro: string
- pubId: string
- sections: Section[]
- title: string
- url: string
- }
-
- export type Section = {
- anchor: string
- description: string
- html: string
- references: ReferencesMap
- text: string
- title: string
- }
-
- export type ReferencesMap = {
- [key: string]: {
- name: string
- articles: ReferenceFTE[]
- }
- }
-
- export type ReferenceFTE = {
- id: string
- cid: string
- fmt: string
- text: string
- }
package/index.esm.js DELETED
@@ -1,7 +0,0 @@
- const char = "_";
-
- function decode(str) {
- return str.replace(new RegExp(`${char}@`, "g"), "@");
- }
-
- export { decode };
package/index.js DELETED
@@ -1,11 +0,0 @@
- 'use strict';
-
- Object.defineProperty(exports, '__esModule', { value: true });
-
- const char = "_";
-
- function decode(str) {
- return str.replace(new RegExp(`${char}@`, "g"), "@");
- }
-
- exports.decode = decode;