@socialgouv/fiches-travail-data 4.375.0 → 4.377.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,3 +1,23 @@
1
+ # [4.377.0](https://github.com/SocialGouv/fiches-travail-data/compare/v4.376.0...v4.377.0) (2022-12-06)
2
+
3
+
4
+ ### Features
5
+
6
+ * **data:** 20221206_2210 update ([5f6bf40](https://github.com/SocialGouv/fiches-travail-data/commit/5f6bf4023141cd21cecfcdfa2abeaca695aac3a9))
7
+
8
+ # [4.376.0](https://github.com/SocialGouv/fiches-travail-data/compare/v4.375.0...v4.376.0) (2022-12-06)
9
+
10
+
11
+ ### Bug Fixes
12
+
13
+ * **build:** ajout de la gestion build full typescript ✨ ([#386](https://github.com/SocialGouv/fiches-travail-data/issues/386)) ([46e5ade](https://github.com/SocialGouv/fiches-travail-data/commit/46e5ade139b31b3e4ffd220dff1136248cd3aeaa))
14
+ * **picture:** ajout du scénario pour la balise picture + typescript / eslint ([#385](https://github.com/SocialGouv/fiches-travail-data/issues/385)) ([5dce100](https://github.com/SocialGouv/fiches-travail-data/commit/5dce100607fbfc938a96468ac6774e3924df559b))
15
+
16
+
17
+ ### Features
18
+
19
+ * **data:** 20221206_1018 update ([8ee594e](https://github.com/SocialGouv/fiches-travail-data/commit/8ee594e4e29b30300a501030f37dbeeac83980a8))
20
+
1
21
  # [4.375.0](https://github.com/SocialGouv/fiches-travail-data/compare/v4.374.0...v4.375.0) (2022-12-01)
2
22
 
3
23
 
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,24 @@
1
+ "use strict";
2
+ const fiches = require("../data/fiches-travail.json");
3
+ const UNDEFINED_KEY = "UNDEFINED";
4
// Fiches holding at least one section whose `references` contain the
// UNDEFINED_KEY entry. Every section is visited (no short-circuit) so that each
// section without a `references` field is reported on stdout.
const undefinedReferences = fiches.filter(({ sections, title }) => {
    let unresolvedSections = 0;
    for (const { references } of sections) {
        if (!references) {
            console.log("no refs in " + title);
            continue;
        }
        if (UNDEFINED_KEY in references) {
            unresolvedSections += 1;
        }
    }
    return unresolvedSections > 0;
});
13
/**
 * Prints a markdown report of the unresolved references of one fiche: a
 * heading linking to the fiche, then, for each section whose references hold
 * the UNDEFINED_KEY entry, the section anchor followed by the distinct article
 * texts joined with " / ".
 *
 * @param fiche object providing `title`, `url` and `sections`
 */
const printMissingRef = (fiche) => {
    const { title, url, sections } = fiche;
    console.log(`#### [${title}](${url})`);
    for (const section of sections) {
        const { references } = section;
        if (!references || !(UNDEFINED_KEY in references)) {
            continue;
        }
        console.log(`- ${section.anchor}`);
        // a Set keeps the first occurrence of each text, in insertion order
        const uniqueTexts = new Set();
        for (const article of references[UNDEFINED_KEY].articles) {
            uniqueTexts.add(article.text);
        }
        console.log(`> ${Array.from(uniqueTexts).join(" / ")}`);
    }
};
23
// Summary line first, then the detail of every fiche with unresolved references.
console.log(`### ${undefinedReferences.length}/${fiches.length} fiches aux références non résolues.`);
for (const fiche of undefinedReferences) {
    printMissingRef(fiche);
}
@@ -0,0 +1,3 @@
1
+ declare function encode(str: string): string; // per the email.js implementation: returns `str` with every "@" replaced by "_@"
2
+ declare function decode(str: string): string; // per the email.js implementation: returns `str` with every "_@" replaced by "@"
3
+ export { decode, encode };
package/build/email.js ADDED
@@ -0,0 +1,12 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.encode = exports.decode = void 0;
4
+ const char = "_";
5
/**
 * Marks every "@" of `str` by prefixing it with the module-level `char`.
 *
 * @param str text to transform
 * @returns the transformed text
 */
function encode(str) {
    const marked = `${char}@`;
    return str.split("@").join(marked);
}
8
+ exports.encode = encode;
9
/**
 * Removes the module-level `char` marker found right before each "@" of `str`.
 *
 * @param str text to transform
 * @returns the transformed text
 */
function decode(str) {
    const marked = `${char}@`;
    return str.split(marked).join("@");
}
12
+ exports.decode = decode;
@@ -0,0 +1,4 @@
1
+ /**
2
+ * Builds the request headers: the entries of `extras` plus a `Cookie` header carrying the `cgtoken` token used to bypass the bot protection.
3
+ */
4
+ export function generateHeaders(extras: any): any; // the implementation throws when process.env.TOKEN_MT is not set
@@ -0,0 +1,16 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.generateHeaders = void 0;
4
/**
 * Returns a copy of `extras` completed with the `Cookie` header that carries
 * the token needed to bypass the remote bot protection.
 *
 * @param extras headers to merge into the result; its own `Cookie` entry, if any, is overridden
 * @returns a new object holding the entries of `extras` plus `Cookie`
 * @throws Error when the TOKEN_MT environment variable is missing or empty
 */
function generateHeaders(extras) {
    const token = process.env.TOKEN_MT;
    if (token) {
        const cookie = `cgtoken=${token};`;
        return { ...extras, Cookie: cookie };
    }
    throw Error("Token (cgtoken) is required to fetch the data. This token is provided by the travail-emploi.gouv.fr team.");
}
16
+ exports.generateHeaders = generateHeaders;
@@ -0,0 +1,2 @@
1
+ export function fetchFeed(url: any): Promise<any>; // the implementation POSTs to `url` and resolves with the `fiches` field of the JSON response body
2
+ export function scrap(urls: any): Promise<any[]>; // the implementation scrapes each `{ id, url }` entry and rejects when a page fails or a pubId appears more than once
@@ -0,0 +1,66 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.scrap = exports.fetchFeed = void 0;
7
+ const fs_1 = __importDefault(require("fs"));
8
+ const got_1 = __importDefault(require("got"));
9
+ const p_limit_1 = __importDefault(require("p-limit"));
10
+ const path_1 = __importDefault(require("path"));
11
+ const generateHeaders_1 = require("./generateHeaders");
12
+ const scrapUrl_1 = require("./scrapUrl");
13
+ const FEED_URL = "https://travail-emploi.gouv.fr/?page=oseo_json";
14
+ const limit = (0, p_limit_1.default)(10);
15
/**
 * Downloads the feed listing the fiches.
 *
 * Sends a POST request to `url` (HTTP/2 enabled, `retry` set to 3) with a JSON
 * content type and the headers produced by `generateHeaders`, then parses the
 * response body as JSON.
 *
 * @param url address of the feed
 * @returns the `fiches` property of the parsed response body
 */
async function fetchFeed(url) {
    const headers = (0, generateHeaders_1.generateHeaders)({
        "Content-Type": "application/json",
    });
    const requestOptions = { headers, http2: true, retry: 3 };
    const { body } = await got_1.default.post(url, requestOptions);
    return JSON.parse(body).fiches;
}
26
+ exports.fetchFeed = fetchFeed;
27
/**
 * Scrapes every fiche of the feed and checks the result for duplicates.
 *
 * Each entry is handed to `scrapUrl` through the module-level `limit` wrapper.
 * All the scraping promises are awaited, even when some of them fail, so that
 * every failure can be reported at once.
 *
 * @param urls feed entries; each one provides the `id` and `url` passed to `scrapUrl`
 * @returns the scraped fiches, in the order of `urls`
 * @throws Error when at least one page could not be scraped, or when several
 *   scraped fiches share the same `pubId` (an undefined `pubId` is ignored)
 */
async function scrap(urls) {
    const inputs = urls.map(({ id, url }) => limit(() => (0, scrapUrl_1.scrapUrl)(id, url)));
    const results = await Promise.allSettled(inputs);
    const fiches = [];
    const failures = [];
    for (const result of results) {
        if (result.status === "fulfilled") {
            fiches.push(result.value);
        }
        else {
            failures.push(result.reason);
        }
    }
    if (failures.length > 0) {
        console.error("scrap fail", failures);
        throw new Error("Error - fetching pages fail. Some pages are missing");
    }
    // ensure we do not have duplicate fiches: count the occurrences of each
    // pubId in a single pass instead of re-scanning the whole list for every
    // fiche (which was quadratic in the number of fiches)
    const occurrences = new Map();
    for (const { pubId } of fiches) {
        if (pubId !== undefined) {
            occurrences.set(pubId, (occurrences.get(pubId) ?? 0) + 1);
        }
    }
    let hasDuplicate = false;
    for (const { pubId, url } of fiches) {
        const count = occurrences.get(pubId) ?? 0;
        if (count > 1) {
            hasDuplicate = true;
            console.error(`[error] la fiche ${url} est présente ${count} fois. Veuillez supprimer le doublon du datafiller`);
        }
    }
    if (hasDuplicate) {
        throw new Error(`[error] fiches en doublons. Veuillez supprimer les doublons du datafiller`);
    }
    return fiches;
}
50
+ exports.scrap = scrap;
51
// Entry point, only when this file is executed directly (not when required):
// downloads the feed, scrapes every fiche and writes the result, pretty-printed,
// to data/fiches-travail.json two directories above this file. Any failure is
// logged and ends the process with exit code 1.
if (module === require.main) {
    const startedAt = Date.now();
    const elapsedSeconds = () => Math.round((Date.now() - startedAt) / 1000);
    (async () => {
        try {
            const feed = await fetchFeed(FEED_URL);
            const fiches = await scrap(feed);
            console.log(`done in ${elapsedSeconds()} sec`);
            const dataFilePath = path_1.default.join(__dirname, "..", "..", "data", "fiches-travail.json");
            fs_1.default.mkdirSync(path_1.default.dirname(dataFilePath), { recursive: true });
            fs_1.default.writeFileSync(dataFilePath, JSON.stringify(fiches, null, 2));
        }
        catch (error) {
            console.error(error);
            console.error(`fail in ${elapsedSeconds()} sec`);
            process.exit(1);
        }
    })();
}
@@ -0,0 +1,14 @@
1
+ export function parseDom(dom: any, id: any, url: any): { // builds one fiche from a parsed page; field notes below describe the accompanying JS implementation
2
+ date: string; // `${day}/${month}/${year}`, taken from the modified_time (else published_time) meta tag
3
+ description: any; // content attribute of the meta[name=description] tag
4
+ intro: any; // whitespace-normalised innerHTML of ".main-article__chapo", or "" when absent
5
+ pubId: any; // the `id` argument, returned as is
6
+ sections: { // NOTE(review): the implementation also sets `description` and `references` on each section; both are missing from this generated type
7
+ anchor: string;
8
+ html: string;
9
+ text: string;
10
+ title: any;
11
+ }[];
12
+ title: any; // trimmed text of the first <h1> of <main>, else of its first <h2>
13
+ url: any; // the `url` argument, returned as is
14
+ };
@@ -0,0 +1,211 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.parseDom = void 0;
7
+ const cdtn_slugify_1 = __importDefault(require("@socialgouv/cdtn-slugify"));
8
+ const got_1 = require("got");
9
+ const email_1 = require("../email");
10
+ const postProcess_1 = require("./postProcess");
11
+ const referenceExtractor_1 = require("./referenceExtractor");
12
+ const referenceResolver_1 = require("./referenceResolver");
13
/** Returns every descendant of `node` matching `selector`, copied into a real array. */
const $$ = (node, selector) => {
    const matches = node.querySelectorAll(selector);
    return Array.from(matches);
};
/** Returns whatever `node.querySelector(selector)` yields (the first matching descendant for a DOM node). */
const $ = (node, selector) => {
    return node.querySelector(selector);
};
15
/**
 * Decodes an obfuscated e-mail payload and passes the result to `encode` from
 * the email module.
 *
 * `data` is read as a sequence of two-digit hexadecimal numbers: the first one
 * is the key, and each following one is XOR-ed with that key to obtain a byte.
 * The resulting bytes are then interpreted as UTF-8.
 *
 * @param data hexadecimal payload (defaults to an empty string)
 * @returns the decoded address, processed by `encode`
 */
function unwrapEmail(data = "") {
    const bytes = [];
    const pairCount = Math.floor(data.length / 2);
    for (let pair = 0; pair < pairCount; pair += 1) {
        const start = pair * 2;
        bytes.push(parseInt(data.slice(start, start + 2), 16));
    }
    const [key, ...masked] = bytes;
    let rawValue = "";
    for (const byte of masked) {
        rawValue += String.fromCharCode(byte ^ key);
    }
    // escape + decodeURIComponent turns the byte string into UTF-8 decoded text
    const decoded = decodeURIComponent(escape(rawValue));
    return (0, email_1.encode)(decoded);
}
20
/**
 * Replaces the text of a node carrying a `data-cfemail` attribute with the
 * value decoded by `unwrapEmail`, and drops that attribute.
 *
 * @param node element holding the obfuscated address in `data-cfemail`
 */
const formatEmail = (node) => {
    const attributeName = "data-cfemail";
    const readable = unwrapEmail(node.getAttribute(attributeName));
    node.removeAttribute(attributeName);
    node.textContent = readable;
};
25
/**
 * Completes a `<picture>` element with a `<source>` built from the image URL
 * found in the comment that opens its parent element.
 *
 * Nothing happens when the picture has no parent element, when the first child
 * of that parent is not a comment, or when the comment holds no non-empty
 * `src="…"` attribute. Otherwise the query string of the URL is dropped, a
 * relative URL is made absolute on https://travail-emploi.gouv.fr, the comment
 * is removed and a `<source>` with that URL as `srcset` is appended to the
 * picture.
 *
 * @param node the `<picture>` element
 * @returns `node` when a `<source>` was added, `undefined` otherwise
 */
const formatPicture = (node) => {
    const comment = node.parentElement?.childNodes[0];
    if (!comment || comment.nodeName !== "#comment") {
        // the first child of the parent is not a comment: not a case we handle
        return;
    }
    // `match` returns null when the comment holds no src attribute; fall back
    // to an empty array so that the destructuring default applies
    const [, src = ""] = comment.data.match(/src=["']([^'"]*)["']/) ?? [];
    if (src.length === 0) {
        return;
    }
    let [srcClean] = src.split("?");
    if (!/^https?:\/\//.test(srcClean)) {
        if (!srcClean.startsWith("/")) {
            srcClean = "/" + srcClean;
        }
        srcClean = `https://travail-emploi.gouv.fr${srcClean}`;
    }
    // we remove the IE comment, whose url carries a timestamp
    comment.remove();
    // we add a <source> element pointing to the cleaned url
    const sourceNode = node.ownerDocument.createElement("source");
    sourceNode.setAttribute("srcset", srcClean);
    sourceNode.setAttribute("media", "(min-width: 300px)");
    node.appendChild(sourceNode);
    return node;
};
51
/**
 * Normalises a link of the scraped article, in place.
 *
 * - a link whose inner HTML is blank is removed;
 * - a link containing an image gets the "no-after" class;
 * - the `onclick` attribute is always dropped;
 * - a link whose href starts with "javascript:" is replaced by its own text;
 * - a link whose href contains "email-protection" becomes a `mailto:` link
 *   built with `unwrapEmail` from the part of the href following "#";
 * - any other href that does not start with http:// or https:// is prefixed
 *   with https://travail-emploi.gouv.fr, and the link receives
 *   `target="_blank"` and `rel="nofollow noopener"`.
 *
 * @param node the `<a>` element
 */
const formatAnchor = (node) => {
    if (node.innerHTML.trim() === "") {
        node.remove();
        return;
    }
    if (node.getElementsByTagName("img").length) {
        node.classList.add("no-after");
    }
    let href = node.getAttribute("href");
    // remove ATTAg(...) on pdf link
    node.removeAttribute("onclick");
    if (!href) {
        return;
    }
    // unwrap link with href="javascript:": only the link itself is replaced by
    // its text. Assigning the parent's innerHTML would also wipe every sibling
    // of the link, and the node is detached afterwards so there is nothing
    // left to rewrite.
    if (/^javascript:/.test(href)) {
        node.replaceWith(node.textContent);
        return;
    }
    if (/email-protection/.test(href)) {
        const [, data = ""] = href.split("#");
        const value = unwrapEmail(data);
        node.setAttribute("href", `mailto:${value}`);
        return;
    }
    if (!/^https?:\/\//.test(href)) {
        if (!href.startsWith("/")) {
            href = "/" + href;
        }
        node.setAttribute("href", `https://travail-emploi.gouv.fr${href}`);
        node.setAttribute("target", "_blank");
        // rel is a space-separated list of tokens: with a comma, "nofollow,"
        // is not a recognised keyword
        node.setAttribute("rel", "nofollow noopener");
    }
};
83
/**
 * Unwraps `node`: its inner HTML is inserted right after it, then the node
 * itself is removed from its parent.
 *
 * @param node the wrapper element to flatten
 */
const flattenCsBlocs = (node) => {
    const { innerHTML: content } = node;
    node.insertAdjacentHTML("afterend", content);
    const { parentNode: parent } = node;
    parent.removeChild(node);
};
87
/**
 * Finds which heading level delimits the sections of an article: the first of
 * h3, h4, h5 having at least one element directly under ".main-article__texte".
 *
 * @param article root element of the article
 * @returns "h3", "h4" or "h5", or the placeholder "sectionTag" when none is found
 */
const getSectionTag = (article) => {
    for (const tag of ["h3", "h4", "h5"]) {
        const headings = $$(article, `.main-article__texte > ${tag}`);
        if (headings.length) {
            return tag;
        }
    }
    return "sectionTag";
};
93
/**
 * Computes the references of a section text in two steps: `extractReferences`
 * collects the tokens referencing articles, then `resolveReferences` tries to
 * resolve the actual article ids using legi-data.
 *
 * @param text plain text of a section
 * @returns the value returned by `resolveReferences`
 */
const getReferences = (text) => {
    const { extractReferences } = referenceExtractor_1;
    const { resolveReferences } = referenceResolver_1;
    return resolveReferences(extractReferences(text));
};
99
/**
 * Turns the parsed page of a fiche into the fiche data.
 *
 * The `<main>` element is cleaned in place (links, pictures, obfuscated
 * e-mails, ".cs_blocs" wrappers, image attributes), then split into sections
 * at the heading level returned by `getSectionTag`. Content located before the
 * first heading, if any, becomes a first section with an empty anchor and the
 * page title as title.
 *
 * @param dom object exposing the page document as `dom.window.document`
 * @param id publication id of the fiche, returned as `pubId`
 * @param url address of the page, returned as is
 * @returns the fiche: `date` (day/month/year), `description`, `intro`,
 *   `pubId`, `sections`, `title` and `url`
 * @throws ParseError when the page has no `<main>`, no `<h1>`/`<h2>` title, no
 *   section, or when `id` is falsy
 */
function parseDom(dom, id, url) {
    const { document } = dom.window;
    const article = $(document, "main");
    if (!article) {
        throw new got_1.ParseError("no <main>");
    }
    $$(article, "a").forEach(formatAnchor);
    $$(article, "picture").forEach(formatPicture);
    $$(article, "[data-cfemail]").forEach(formatEmail);
    $$(article, ".cs_blocs").forEach(flattenCsBlocs);
    for (const img of $$(article, "img")) {
        // remove adaptImgFix(this) on hero img
        img.removeAttribute("onmousedown");
        const src = img.getAttribute("src");
        // an <img> without src attribute is left untouched (it used to crash
        // the whole parsing); inline data images and absolute urls are kept
        if (src === null || src.includes("data:image") || /^https?:\/\//.test(src)) {
            continue;
        }
        const path = src.startsWith("/") ? src : "/" + src;
        img.setAttribute("src", `https://travail-emploi.gouv.fr${path}`);
    }
    const titleElement = $(article, "h1") || $(article, "h2");
    if (!titleElement) {
        throw new got_1.ParseError("No <h1> or <h2> element");
    }
    const title = titleElement.textContent.trim();
    if (!id) {
        throw new got_1.ParseError(`No id`);
    }
    // NOTE(review): a TypeError is thrown below when neither meta tag exists,
    // and likewise when the description meta tag is missing
    const dateRaw = $(document, "meta[property*=modified_time]") ||
        $(document, "meta[property$=published_time]");
    const [year, month, day] = dateRaw.getAttribute("content").split("-");
    const introElement = $(article, ".main-article__chapo");
    const intro = introElement
        ? introElement.innerHTML.replace(/\n/g, "").replace(/\s+/g, " ").trim()
        : "";
    const description = $(document, "meta[name=description]").getAttribute("content");
    const sections = [];
    const sectionTag = getSectionTag(article);
    // First pass is only to get a potential untitled section at the top of the article
    // This section has neither anchor nor title
    let untitledHtml = "";
    let untitledText = "";
    let hasUntitledContent = false;
    for (let element = $(article, ".main-article__texte > *"); element && element.tagName.toLowerCase() !== sectionTag; element = element.nextElementSibling) {
        if (!element.textContent) {
            continue;
        }
        hasUntitledContent = true;
        untitledHtml += element.outerHTML
            .replace(/\n+/g, "")
            .replace(/>\s+</g, "><")
            .replace(/\s+/g, " ");
        untitledText += " " + element.textContent.replace(/\s+/g, " ").trim();
    }
    if (hasUntitledContent) {
        // the accumulated text starts with the separator added before the
        // first chunk: trim it (the result of trim() used to be discarded)
        const text = untitledText.trim();
        sections.push({
            anchor: "",
            html: untitledHtml,
            text,
            title,
            description: text.slice(0, 200).trim(),
            references: getReferences(text),
        });
    }
    // Gets all the titled content
    for (const heading of $$(article, `.main-article__texte > ${sectionTag}`)) {
        // a section is made of every sibling following the heading, up to the
        // next heading of the same level
        let html = "";
        for (let sibling = heading.nextElementSibling; sibling && sibling.tagName.toLowerCase() !== sectionTag; sibling = sibling.nextElementSibling) {
            html += sibling.outerHTML;
        }
        // a detached <div> is used to compute the plain text of the section
        const container = document.createElement("div");
        container.innerHTML = html;
        const sectionText = container.textContent.replace(/\s+/g, " ").trim();
        sections.push({
            anchor: heading.getAttribute("id") || (0, cdtn_slugify_1.default)(heading.textContent),
            description: sectionText.slice(0, 200).trim(),
            html: (0, postProcess_1.htmlPostParser)(html.replace(/\n+/g, "").replace(/>\s+</g, "><").replace(/\s+/g, " ")),
            references: getReferences(sectionText),
            text: sectionText,
            title: heading.textContent.trim(),
        });
    }
    if (sections.length === 0) {
        throw new got_1.ParseError(`No sections`);
    }
    return {
        date: `${day}/${month}/${year}`,
        description,
        intro,
        pubId: id,
        sections,
        title,
        url,
    };
}
211
+ exports.parseDom = parseDom;
@@ -0,0 +1 @@
1
+ export declare const htmlPostParser: (html: string) => string; // takes an HTML fragment and returns the serialized HTML after the clean-up done by the implementation
@@ -0,0 +1,60 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || function (mod) {
19
+ if (mod && mod.__esModule) return mod;
20
+ var result = {};
21
+ if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
22
+ __setModuleDefault(result, mod);
23
+ return result;
24
+ };
25
+ Object.defineProperty(exports, "__esModule", { value: true });
26
+ exports.htmlPostParser = void 0;
27
+ const cheerio = __importStar(require("cheerio"));
28
/**
 * Cleans the HTML of a section.
 *
 * `<style>`, `<button>` and ".oembed-source" elements are removed. The image
 * urls found in the `src` attribute of the comments nested in the fragment are
 * collected in document order; each `<picture>` is then replaced, in order, by
 * an `<img>` pointing to the next collected url. A `<picture>` left without
 * url is replaced by its own inner HTML.
 *
 * @param html HTML fragment of the section
 * @returns the cleaned HTML fragment
 */
const htmlPostParser = (html) => {
    const $ = cheerio.load(html, null, false);
    const arrImgSrc = [];
    let currentIndex = 0;
    $("style").remove();
    // https://travail-emploi.gouv.fr/le-ministere-en-action/coronavirus-covid-19/questions-reponses-par-theme/article/mesures-de-prevention-dans-l-entreprise-contre-la-covid-19
    $("button").remove();
    $(".oembed-source").remove();
    // https://travail-emploi.gouv.fr/emploi-et-insertion/accompagnement-des-mutations-economiques/activite-partielle-chomage-partiel/article/activite-partielle-chomage-partiel
    $("*")
        .contents()
        .each((_index, node) => {
        // nodeType 8 is a comment node
        if (node.nodeType !== 8) {
            return;
        }
        const result = /src=['"](.*?)['"]/.exec(node.nodeValue);
        if (result && result[1]) {
            arrImgSrc.push(result[1]);
        }
    });
    // The element is taken from the callback arguments: inside an arrow
    // function `this` is not the <picture>, so the fallback used to return an
    // empty string and the picture was deleted instead of being unwrapped.
    $("picture").replaceWith((_index, picture) => {
        const src = arrImgSrc[currentIndex];
        if (src) {
            currentIndex++;
            return `<img src="https://travail-emploi.gouv.fr/${src}" style="width:100%;height:auto;"/>`;
        }
        return $(picture).html() ?? "";
    });
    return $.html();
};
60
+ exports.htmlPostParser = htmlPostParser;
@@ -0,0 +1,18 @@
1
+ export function classifyTokens(tokens: any): any; // untyped in this generated declaration (any in, any out)
2
+ export namespace CODE_SECU { // exposes two string constants: id and name
3
+ const id: string;
4
+ const name: string;
5
+ }
6
+ export namespace CODE_TRAVAIL { // same { id, name } shape as CODE_SECU; id_1 and name_1 are local names re-exported as id and name
7
+ const id_1: string;
8
+ export { id_1 as id };
9
+ const name_1: string;
10
+ export { name_1 as name };
11
+ }
12
+ export const codesFullNames: { // string-keyed map whose values have the { id, name } shape
13
+ [x: string]: {
14
+ id: string;
15
+ name: string;
16
+ };
17
+ };
18
+ export function extractReferences(text: any): any; // presumably the function parseDom's getReferences calls with a section text before resolveReferences — confirm against referenceExtractor.js