@socialgouv/fiches-travail-data-types 4.438.0 → 4.439.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -7,7 +7,6 @@ exports.parseDom = void 0;
|
|
|
7
7
|
const cdtn_slugify_1 = __importDefault(require("@socialgouv/cdtn-slugify"));
|
|
8
8
|
const got_1 = require("got");
|
|
9
9
|
const email_1 = require("../email");
|
|
10
|
-
const postProcess_1 = require("./postProcess");
|
|
11
10
|
const referenceExtractor_1 = require("./referenceExtractor");
|
|
12
11
|
const referenceResolver_1 = require("./referenceResolver");
|
|
13
12
|
const $$ = (node, selector) => Array.from(node.querySelectorAll(selector));
|
|
@@ -22,31 +21,50 @@ const formatEmail = (node) => {
|
|
|
22
21
|
node.removeAttribute("data-cfemail");
|
|
23
22
|
node.textContent = value;
|
|
24
23
|
};
|
|
24
|
+
// Matches a quoted src attribute (src="…" or src='…') and captures its value in group 1.
// Unanchored and without the g flag, so it finds the first occurrence anywhere in the string.
const SRC_REGEX = /src=["']([^'"]*)["']/;
|
|
25
|
+
/**
 * Normalizes an image URL: drops the query string and makes site-relative
 * paths absolute on https://travail-emploi.gouv.fr.
 *
 * @param {string} src - Raw URL or path (absolute, protocol-relative or relative).
 * @returns {string} Absolute URL without its query string.
 */
function getCleanSrc(src) {
    // Everything from the first "?" on is discarded
    // (presumably cache-busting timestamps — confirm against the scraped pages).
    const [path] = src.split("?");
    if (/^https?:\/\//.test(path)) {
        return path;
    }
    // A protocol-relative URL already names its host; only the scheme is missing.
    // Without this branch it would be glued onto the site origin as a bogus path.
    if (path.startsWith("//")) {
        return `https:${path}`;
    }
    const absolutePath = path.startsWith("/") ? path : `/${path}`;
    return `https://travail-emploi.gouv.fr${absolutePath}`;
}
|
|
25
35
|
/**
 * Replaces the whole content of a <picture>'s parent with a single <img>
 * whose URL is read from a comment node found among the parent's children.
 *
 * Does nothing when the picture has no parent element, when no child comment
 * contains a src="…" attribute, or when the captured src is empty.
 *
 * @param {Element} node - The <picture> element to process.
 * @returns {void}
 */
const formatPicture = (node) => {
    const parent = node.parentElement;
    if (!parent) {
        // Detached picture: an earlier call may already have rewritten the
        // innerHTML of a parent shared by several pictures.
        return;
    }
    let src;
    for (const child of Array.from(parent.childNodes)) {
        if (child.nodeName !== "#comment" && child.nodeType !== 8) {
            continue;
        }
        const found = child.data.match(SRC_REGEX);
        if (found) {
            // Keep overwriting: the last matching comment wins.
            [, src] = found;
        }
    }
    if (!src) {
        // no sibling comment carrying a usable src, so it's not a case we handle
        return;
    }
    const srcClean = getCleanSrc(src);
    parent.innerHTML = `<img src="${srcClean}" style="width:100%;height:auto;" />`;
};
|
|
57
|
+
/**
 * Cleans up an <img>: removes its inline onmousedown handler and, when its src
 * is neither a data URI nor an absolute http(s) URL, rewrites the src through
 * getCleanSrc and removes srcset/sizes.
 *
 * @param {Element} node - The <img> element to process.
 * @returns {void}
 */
const formatImage = (node) => {
    node.removeAttribute("onmousedown");
    const src = node.getAttribute("src");
    // getAttribute returns null when the attribute is absent; nothing to rewrite then.
    if (src === null || src.includes("data:image") || /^https?:\/\//.test(src)) {
        return;
    }
    node.setAttribute("src", getCleanSrc(src));
    // Remove the responsive candidates, which are not rewritten along with src.
    node.removeAttribute("srcset");
    node.removeAttribute("sizes");
};
|
|
51
69
|
const formatAnchor = (node) => {
|
|
52
70
|
if (node.innerHTML.trim() === "") {
|
|
@@ -80,6 +98,9 @@ const formatAnchor = (node) => {
|
|
|
80
98
|
node.setAttribute("rel", "nofollow, noopener");
|
|
81
99
|
}
|
|
82
100
|
};
|
|
101
|
+
/**
 * Detaches the given node by calling its remove() method.
 * Takes a single argument so it can be passed directly to forEach.
 *
 * @param {ChildNode} node - Node to remove.
 * @returns {void}
 */
function removeNode(node) {
    node.remove();
}
|
|
83
104
|
const flattenCsBlocs = (node) => {
|
|
84
105
|
node.insertAdjacentHTML("afterend", node.innerHTML);
|
|
85
106
|
node.parentNode.removeChild(node);
|
|
@@ -101,27 +122,17 @@ function parseDom(dom, id, url) {
|
|
|
101
122
|
if (!article) {
|
|
102
123
|
throw new got_1.ParseError("no <main>");
|
|
103
124
|
}
|
|
125
|
+
if (!id) {
|
|
126
|
+
throw new got_1.ParseError(`No id`);
|
|
127
|
+
}
|
|
104
128
|
$$(article, "a").forEach(formatAnchor);
|
|
105
129
|
$$(article, "picture").forEach(formatPicture);
|
|
106
130
|
$$(article, "[data-cfemail]").forEach(formatEmail);
|
|
107
131
|
$$(article, ".cs_blocs").forEach(flattenCsBlocs);
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
});
|
|
113
|
-
imgs
|
|
114
|
-
.filter((node) => node.getAttribute("src").indexOf("data:image") === -1)
|
|
115
|
-
.forEach((node) => {
|
|
116
|
-
let src = node.getAttribute("src");
|
|
117
|
-
if (!src.match(/^https?:\/\//)) {
|
|
118
|
-
if (src.slice(0, 1) !== "/") {
|
|
119
|
-
src = "/" + src;
|
|
120
|
-
}
|
|
121
|
-
src = `https://travail-emploi.gouv.fr${src}`;
|
|
122
|
-
node.setAttribute("src", src);
|
|
123
|
-
}
|
|
124
|
-
});
|
|
132
|
+
$$(article, "img").forEach(formatImage);
|
|
133
|
+
$$(article, "style").forEach(removeNode);
|
|
134
|
+
$$(article, "button").forEach(removeNode);
|
|
135
|
+
$$(article, ".oembed-source").forEach(removeNode);
|
|
125
136
|
let titleElement = $(article, "h1");
|
|
126
137
|
if (!titleElement) {
|
|
127
138
|
titleElement = $(article, "h2");
|
|
@@ -130,40 +141,29 @@ function parseDom(dom, id, url) {
|
|
|
130
141
|
}
|
|
131
142
|
}
|
|
132
143
|
const title = titleElement.textContent.trim();
|
|
133
|
-
|
|
134
|
-
throw new got_1.ParseError(`No id`);
|
|
135
|
-
}
|
|
136
|
-
const dateRaw = $(dom.window.document, "meta[property*=modified_time]") ||
|
|
137
|
-
$(dom.window.document, "meta[property$=published_time]");
|
|
144
|
+
const dateRaw = $(dom.window.document, "meta[property*=modified_time]") || $(dom.window.document, "meta[property$=published_time]");
|
|
138
145
|
const [year, month, day] = dateRaw.getAttribute("content").split("-");
|
|
139
146
|
let intro = $(article, ".main-article__chapo") || "";
|
|
140
|
-
intro =
|
|
141
|
-
|
|
142
|
-
const description = $(dom.window.document, "meta[name=description]")?.getAttribute("content") ??
|
|
143
|
-
"";
|
|
147
|
+
intro = intro && intro.innerHTML.replace(/\n/g, "").replace(/\s+/g, " ").trim();
|
|
148
|
+
const description = $(dom.window.document, "meta[name=description]")?.getAttribute("content") ?? "";
|
|
144
149
|
const sections = [];
|
|
145
150
|
const sectionTag = getSectionTag(article);
|
|
146
151
|
// First pass is only to get a potential untitled section at the top of the article
|
|
147
152
|
// This section has neither anchor nor title
|
|
148
153
|
let nextArticleElement = $(article, ".main-article__texte > *");
|
|
149
154
|
const untitledSection = {
|
|
150
|
-
anchor: "",
|
|
151
|
-
html: "",
|
|
152
|
-
text: "",
|
|
153
|
-
title: title,
|
|
155
|
+
anchor: "", html: "", text: "", title: title,
|
|
154
156
|
};
|
|
155
|
-
while (nextArticleElement &&
|
|
156
|
-
nextArticleElement.tagName.toLowerCase() !== sectionTag) {
|
|
157
|
+
while (nextArticleElement && nextArticleElement.tagName.toLowerCase() !== sectionTag) {
|
|
157
158
|
if (nextArticleElement.textContent) {
|
|
158
159
|
if (!untitledSection.description) {
|
|
159
160
|
untitledSection.description = "temp description";
|
|
160
161
|
}
|
|
161
|
-
untitledSection.html +=
|
|
162
|
+
untitledSection.html += nextArticleElement.outerHTML
|
|
162
163
|
.replace(/\n+/g, "")
|
|
163
164
|
.replace(/>\s+</g, "><")
|
|
164
|
-
.replace(/\s+/g, " ")
|
|
165
|
-
untitledSection.text +=
|
|
166
|
-
" " + nextArticleElement.textContent.replace(/\s+/g, " ").trim();
|
|
165
|
+
.replace(/\s+/g, " ");
|
|
166
|
+
untitledSection.text += " " + nextArticleElement.textContent.replace(/\s+/g, " ").trim();
|
|
167
167
|
}
|
|
168
168
|
nextArticleElement = nextArticleElement.nextElementSibling;
|
|
169
169
|
}
|
|
@@ -189,7 +189,7 @@ function parseDom(dom, id, url) {
|
|
|
189
189
|
sections.push({
|
|
190
190
|
anchor: el.getAttribute("id") || (0, cdtn_slugify_1.default)(el.textContent),
|
|
191
191
|
description: sectionText.slice(0, 200).trim(),
|
|
192
|
-
html:
|
|
192
|
+
html: html.replace(/\n+/g, "").replace(/>\s+</g, "><").replace(/\s+/g, " "),
|
|
193
193
|
references: getReferences(sectionText),
|
|
194
194
|
text: sectionText,
|
|
195
195
|
title: el.textContent.trim(),
|
|
@@ -200,13 +200,7 @@ function parseDom(dom, id, url) {
|
|
|
200
200
|
throw new got_1.ParseError(`No sections`);
|
|
201
201
|
}
|
|
202
202
|
return {
|
|
203
|
-
date: `${day}/${month}/${year}`,
|
|
204
|
-
description,
|
|
205
|
-
intro,
|
|
206
|
-
pubId: id,
|
|
207
|
-
sections,
|
|
208
|
-
title,
|
|
209
|
-
url,
|
|
203
|
+
date: `${day}/${month}/${year}`, description, intro, pubId: id, sections, title, url,
|
|
210
204
|
};
|
|
211
205
|
}
|
|
212
206
|
exports.parseDom = parseDom;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@socialgouv/fiches-travail-data-types",
|
|
3
|
-
"version": "4.
|
|
3
|
+
"version": "4.439.0",
|
|
4
4
|
"main": "build/index.js",
|
|
5
5
|
"module": "build/index.js",
|
|
6
6
|
"files": [
|
|
@@ -34,7 +34,7 @@
|
|
|
34
34
|
"babel-jest": "^27.4.4",
|
|
35
35
|
"eslint": "^8.28.0",
|
|
36
36
|
"eslint-plugin-jest": "^27.1.6",
|
|
37
|
-
"husky": "^
|
|
37
|
+
"husky": "^8.0.0",
|
|
38
38
|
"jest": "^29.3.1",
|
|
39
39
|
"jsdom": "^17.0.0",
|
|
40
40
|
"prettier": "^2.5.1",
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
export declare const htmlPostParser: (html: string) => string;
|
|
@@ -1,61 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
-
if (k2 === undefined) k2 = k;
|
|
4
|
-
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
-
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
-
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
-
}
|
|
8
|
-
Object.defineProperty(o, k2, desc);
|
|
9
|
-
}) : (function(o, m, k, k2) {
|
|
10
|
-
if (k2 === undefined) k2 = k;
|
|
11
|
-
o[k2] = m[k];
|
|
12
|
-
}));
|
|
13
|
-
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
-
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
-
}) : function(o, v) {
|
|
16
|
-
o["default"] = v;
|
|
17
|
-
});
|
|
18
|
-
var __importStar = (this && this.__importStar) || function (mod) {
|
|
19
|
-
if (mod && mod.__esModule) return mod;
|
|
20
|
-
var result = {};
|
|
21
|
-
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
|
|
22
|
-
__setModuleDefault(result, mod);
|
|
23
|
-
return result;
|
|
24
|
-
};
|
|
25
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
26
|
-
exports.htmlPostParser = void 0;
|
|
27
|
-
const cheerio = __importStar(require("cheerio"));
|
|
28
|
-
const htmlPostParser = (html) => {
|
|
29
|
-
const $ = cheerio.load(html, null, false);
|
|
30
|
-
const arrImgSrc = [];
|
|
31
|
-
let currentIndex = 0;
|
|
32
|
-
$("style").remove();
|
|
33
|
-
// https://travail-emploi.gouv.fr/le-ministere-en-action/coronavirus-covid-19/questions-reponses-par-theme/article/mesures-de-prevention-dans-l-entreprise-contre-la-covid-19
|
|
34
|
-
$("button").remove();
|
|
35
|
-
$(".oembed-source").remove();
|
|
36
|
-
// https://travail-emploi.gouv.fr/emploi-et-insertion/accompagnement-des-mutations-economiques/activite-partielle-chomage-partiel/article/activite-partielle-chomage-partiel
|
|
37
|
-
$("*")
|
|
38
|
-
.contents()
|
|
39
|
-
.each(function () {
|
|
40
|
-
if (this.nodeType === 8) {
|
|
41
|
-
const regex = /src=['"](.*?)['"]/;
|
|
42
|
-
const result = regex.exec(this.nodeValue);
|
|
43
|
-
if (result) {
|
|
44
|
-
const src = result[0].slice(5, -1);
|
|
45
|
-
if (src)
|
|
46
|
-
arrImgSrc.push(src);
|
|
47
|
-
}
|
|
48
|
-
}
|
|
49
|
-
});
|
|
50
|
-
$("picture").replaceWith(() => {
|
|
51
|
-
const src = arrImgSrc[currentIndex];
|
|
52
|
-
if (src) {
|
|
53
|
-
currentIndex++;
|
|
54
|
-
return `<img src="https://travail-emploi.gouv.fr/${src}" style="width:100%;height:auto;"/>`;
|
|
55
|
-
}
|
|
56
|
-
return $.html()?.toString() ?? "";
|
|
57
|
-
});
|
|
58
|
-
$("picture.adapt-img-wrapper").remove();
|
|
59
|
-
return $.html();
|
|
60
|
-
};
|
|
61
|
-
exports.htmlPostParser = htmlPostParser;
|