@socialgouv/fiches-travail-data-types 4.438.0 → 4.440.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -7,7 +7,6 @@ exports.parseDom = void 0;
|
|
|
7
7
|
const cdtn_slugify_1 = __importDefault(require("@socialgouv/cdtn-slugify"));
|
|
8
8
|
const got_1 = require("got");
|
|
9
9
|
const email_1 = require("../email");
|
|
10
|
-
const postProcess_1 = require("./postProcess");
|
|
11
10
|
const referenceExtractor_1 = require("./referenceExtractor");
|
|
12
11
|
const referenceResolver_1 = require("./referenceResolver");
|
|
13
12
|
const $$ = (node, selector) => Array.from(node.querySelectorAll(selector));
|
|
@@ -22,16 +21,8 @@ const formatEmail = (node) => {
|
|
|
22
21
|
node.removeAttribute("data-cfemail");
|
|
23
22
|
node.textContent = value;
|
|
24
23
|
};
|
|
25
|
-
const
|
|
26
|
-
|
|
27
|
-
if (comment.nodeName !== "#comment") {
|
|
28
|
-
// upper sibling node is not a comment, so it's not a case we handle
|
|
29
|
-
return;
|
|
30
|
-
}
|
|
31
|
-
const [, src = ""] = comment.data.match(/src=["']([^'"]*)["']/);
|
|
32
|
-
if (src.length === 0) {
|
|
33
|
-
return;
|
|
34
|
-
}
|
|
24
|
+
const SRC_REGEX = /src=["']([^'"]*)["']/;
|
|
25
|
+
function getCleanSrc(src) {
|
|
35
26
|
let [srcClean] = src.split("?");
|
|
36
27
|
if (!srcClean.match(/^https?:\/\//)) {
|
|
37
28
|
if (srcClean.slice(0, 1) !== "/") {
|
|
@@ -39,14 +30,50 @@ const formatPicture = (node) => {
|
|
|
39
30
|
}
|
|
40
31
|
srcClean = `https://travail-emploi.gouv.fr${srcClean}`;
|
|
41
32
|
}
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
33
|
+
return srcClean;
|
|
34
|
+
}
|
|
35
|
+
/**
 * Rewrites a <picture> element into a plain <img>.
 *
 * Two strategies are tried, in order:
 *  1. If any comment node among the children of the picture's parent embeds a
 *     `src="..."` attribute (matched by SRC_REGEX), the parent's entire
 *     content is replaced by one full-width <img> whose source is that value
 *     passed through getCleanSrc. When several comments match, the last one
 *     wins.
 *  2. Otherwise, if the picture has a direct IMG child, the picture itself is
 *     replaced by that image (the last one when there are several).
 *
 * @param {Element} node - The <picture> element to rewrite.
 * @returns {void}
 */
const formatPicture = (node) => {
  const parent = node.parentElement;
  // Strategy 1 overwrites the parent's innerHTML, which detaches every other
  // <picture> that lived in the same parent. Callers iterate over a list that
  // was collected beforehand, so such detached nodes still reach this
  // function; there is nothing left to rewrite for them.
  if (!parent) {
    return;
  }

  // Keep the src of the LAST matching comment (same precedence as before).
  let src = "";
  for (const childNode of Array.from(parent.childNodes)) {
    if (childNode.nodeName === "#comment" || childNode.nodeType === 8) {
      const match = childNode.data.match(SRC_REGEX);
      if (match) {
        src = match[1];
      }
    }
  }

  if (src.length > 0) {
    parent.innerHTML = `<img src="${getCleanSrc(src)}" style="width:100%;height:auto;" />`;
    return;
  }

  // Fallback: promote the picture's own <img> child, if it has one.
  const images = Array.from(node.childNodes).filter(
    (childNode) => childNode.nodeName === "IMG"
  );
  const image = images[images.length - 1];
  if (image) {
    node.replaceWith(image);
  }
};
|
|
66
|
+
/**
 * Normalises an <img> element in place.
 *
 * - Always drops the inline `onmousedown` handler.
 * - Images without a `src` attribute, and inline `data:image` sources, are
 *   otherwise left untouched.
 * - For every other image, the responsive `srcset` / `sizes` attributes are
 *   removed and a `src` that does not start with http(s):// is rewritten
 *   through getCleanSrc.
 *
 * @param {Element} node - The <img> element to normalise.
 * @returns {void}
 */
const formatImage = (node) => {
  node.removeAttribute("onmousedown");

  const src = node.getAttribute("src");
  // getAttribute returns null when the attribute is absent; there is no
  // source to normalise in that case, so leave the element as it is.
  if (src === null || src.includes("data:image")) {
    return;
  }

  node.removeAttribute("srcset");
  node.removeAttribute("sizes");
  if (!/^https?:\/\//.test(src)) {
    node.setAttribute("src", getCleanSrc(src));
  }
};
|
|
51
78
|
const formatAnchor = (node) => {
|
|
52
79
|
if (node.innerHTML.trim() === "") {
|
|
@@ -80,6 +107,9 @@ const formatAnchor = (node) => {
|
|
|
80
107
|
node.setAttribute("rel", "nofollow, noopener");
|
|
81
108
|
}
|
|
82
109
|
};
|
|
110
|
+
/**
 * Detaches `node` from its parent by calling its own `remove()` method.
 * Shaped as a one-argument callback so it can be handed to `forEach`.
 *
 * @param {ChildNode} node - The node to drop from the document.
 * @returns {void}
 */
const removeNode = (node) => void node.remove();
|
|
83
113
|
const flattenCsBlocs = (node) => {
|
|
84
114
|
node.insertAdjacentHTML("afterend", node.innerHTML);
|
|
85
115
|
node.parentNode.removeChild(node);
|
|
@@ -101,27 +131,17 @@ function parseDom(dom, id, url) {
|
|
|
101
131
|
if (!article) {
|
|
102
132
|
throw new got_1.ParseError("no <main>");
|
|
103
133
|
}
|
|
134
|
+
if (!id) {
|
|
135
|
+
throw new got_1.ParseError(`No id`);
|
|
136
|
+
}
|
|
104
137
|
$$(article, "a").forEach(formatAnchor);
|
|
105
138
|
$$(article, "picture").forEach(formatPicture);
|
|
106
139
|
$$(article, "[data-cfemail]").forEach(formatEmail);
|
|
107
140
|
$$(article, ".cs_blocs").forEach(flattenCsBlocs);
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
});
|
|
113
|
-
imgs
|
|
114
|
-
.filter((node) => node.getAttribute("src").indexOf("data:image") === -1)
|
|
115
|
-
.forEach((node) => {
|
|
116
|
-
let src = node.getAttribute("src");
|
|
117
|
-
if (!src.match(/^https?:\/\//)) {
|
|
118
|
-
if (src.slice(0, 1) !== "/") {
|
|
119
|
-
src = "/" + src;
|
|
120
|
-
}
|
|
121
|
-
src = `https://travail-emploi.gouv.fr${src}`;
|
|
122
|
-
node.setAttribute("src", src);
|
|
123
|
-
}
|
|
124
|
-
});
|
|
141
|
+
$$(article, "img").forEach(formatImage);
|
|
142
|
+
$$(article, "style").forEach(removeNode);
|
|
143
|
+
$$(article, "button").forEach(removeNode);
|
|
144
|
+
$$(article, ".oembed-source").forEach(removeNode);
|
|
125
145
|
let titleElement = $(article, "h1");
|
|
126
146
|
if (!titleElement) {
|
|
127
147
|
titleElement = $(article, "h2");
|
|
@@ -130,40 +150,29 @@ function parseDom(dom, id, url) {
|
|
|
130
150
|
}
|
|
131
151
|
}
|
|
132
152
|
const title = titleElement.textContent.trim();
|
|
133
|
-
|
|
134
|
-
throw new got_1.ParseError(`No id`);
|
|
135
|
-
}
|
|
136
|
-
const dateRaw = $(dom.window.document, "meta[property*=modified_time]") ||
|
|
137
|
-
$(dom.window.document, "meta[property$=published_time]");
|
|
153
|
+
const dateRaw = $(dom.window.document, "meta[property*=modified_time]") || $(dom.window.document, "meta[property$=published_time]");
|
|
138
154
|
const [year, month, day] = dateRaw.getAttribute("content").split("-");
|
|
139
155
|
let intro = $(article, ".main-article__chapo") || "";
|
|
140
|
-
intro =
|
|
141
|
-
|
|
142
|
-
const description = $(dom.window.document, "meta[name=description]")?.getAttribute("content") ??
|
|
143
|
-
"";
|
|
156
|
+
intro = intro && intro.innerHTML.replace(/\n/g, "").replace(/\s+/g, " ").trim();
|
|
157
|
+
const description = $(dom.window.document, "meta[name=description]")?.getAttribute("content") ?? "";
|
|
144
158
|
const sections = [];
|
|
145
159
|
const sectionTag = getSectionTag(article);
|
|
146
160
|
// First pass is only to get a potential untitled section at the top of the article
|
|
147
161
|
// This section has neither anchor nor title
|
|
148
162
|
let nextArticleElement = $(article, ".main-article__texte > *");
|
|
149
163
|
const untitledSection = {
|
|
150
|
-
anchor: "",
|
|
151
|
-
html: "",
|
|
152
|
-
text: "",
|
|
153
|
-
title: title,
|
|
164
|
+
anchor: "", html: "", text: "", title: title,
|
|
154
165
|
};
|
|
155
|
-
while (nextArticleElement &&
|
|
156
|
-
nextArticleElement.tagName.toLowerCase() !== sectionTag) {
|
|
166
|
+
while (nextArticleElement && nextArticleElement.tagName.toLowerCase() !== sectionTag) {
|
|
157
167
|
if (nextArticleElement.textContent) {
|
|
158
168
|
if (!untitledSection.description) {
|
|
159
169
|
untitledSection.description = "temp description";
|
|
160
170
|
}
|
|
161
|
-
untitledSection.html +=
|
|
171
|
+
untitledSection.html += nextArticleElement.outerHTML
|
|
162
172
|
.replace(/\n+/g, "")
|
|
163
173
|
.replace(/>\s+</g, "><")
|
|
164
|
-
.replace(/\s+/g, " ")
|
|
165
|
-
untitledSection.text +=
|
|
166
|
-
" " + nextArticleElement.textContent.replace(/\s+/g, " ").trim();
|
|
174
|
+
.replace(/\s+/g, " ");
|
|
175
|
+
untitledSection.text += " " + nextArticleElement.textContent.replace(/\s+/g, " ").trim();
|
|
167
176
|
}
|
|
168
177
|
nextArticleElement = nextArticleElement.nextElementSibling;
|
|
169
178
|
}
|
|
@@ -189,7 +198,7 @@ function parseDom(dom, id, url) {
|
|
|
189
198
|
sections.push({
|
|
190
199
|
anchor: el.getAttribute("id") || (0, cdtn_slugify_1.default)(el.textContent),
|
|
191
200
|
description: sectionText.slice(0, 200).trim(),
|
|
192
|
-
html:
|
|
201
|
+
html: html.replace(/\n+/g, "").replace(/>\s+</g, "><").replace(/\s+/g, " "),
|
|
193
202
|
references: getReferences(sectionText),
|
|
194
203
|
text: sectionText,
|
|
195
204
|
title: el.textContent.trim(),
|
|
@@ -200,13 +209,7 @@ function parseDom(dom, id, url) {
|
|
|
200
209
|
throw new got_1.ParseError(`No sections`);
|
|
201
210
|
}
|
|
202
211
|
return {
|
|
203
|
-
date: `${day}/${month}/${year}`,
|
|
204
|
-
description,
|
|
205
|
-
intro,
|
|
206
|
-
pubId: id,
|
|
207
|
-
sections,
|
|
208
|
-
title,
|
|
209
|
-
url,
|
|
212
|
+
date: `${day}/${month}/${year}`, description, intro, pubId: id, sections, title, url,
|
|
210
213
|
};
|
|
211
214
|
}
|
|
212
215
|
exports.parseDom = parseDom;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@socialgouv/fiches-travail-data-types",
|
|
3
|
-
"version": "4.
|
|
3
|
+
"version": "4.440.0",
|
|
4
4
|
"main": "build/index.js",
|
|
5
5
|
"module": "build/index.js",
|
|
6
6
|
"files": [
|
|
@@ -34,7 +34,7 @@
|
|
|
34
34
|
"babel-jest": "^27.4.4",
|
|
35
35
|
"eslint": "^8.28.0",
|
|
36
36
|
"eslint-plugin-jest": "^27.1.6",
|
|
37
|
-
"husky": "^
|
|
37
|
+
"husky": "^8.0.0",
|
|
38
38
|
"jest": "^29.3.1",
|
|
39
39
|
"jsdom": "^17.0.0",
|
|
40
40
|
"prettier": "^2.5.1",
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
export declare const htmlPostParser: (html: string) => string;
|
|
@@ -1,61 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
-
if (k2 === undefined) k2 = k;
|
|
4
|
-
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
-
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
-
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
-
}
|
|
8
|
-
Object.defineProperty(o, k2, desc);
|
|
9
|
-
}) : (function(o, m, k, k2) {
|
|
10
|
-
if (k2 === undefined) k2 = k;
|
|
11
|
-
o[k2] = m[k];
|
|
12
|
-
}));
|
|
13
|
-
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
-
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
-
}) : function(o, v) {
|
|
16
|
-
o["default"] = v;
|
|
17
|
-
});
|
|
18
|
-
var __importStar = (this && this.__importStar) || function (mod) {
|
|
19
|
-
if (mod && mod.__esModule) return mod;
|
|
20
|
-
var result = {};
|
|
21
|
-
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
|
|
22
|
-
__setModuleDefault(result, mod);
|
|
23
|
-
return result;
|
|
24
|
-
};
|
|
25
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
26
|
-
exports.htmlPostParser = void 0;
|
|
27
|
-
const cheerio = __importStar(require("cheerio"));
|
|
28
|
-
/**
 * Post-processes a fragment of scraped HTML.
 *
 * - Removes every `style`, `button` and `.oembed-source` element.
 * - Collects, in document order, the `src` value embedded in each comment
 *   node, then replaces the n-th <picture> with a full-width <img> pointing
 *   at the n-th collected source.
 * - A <picture> with no remaining source is kept as is, unless it carries the
 *   `adapt-img-wrapper` class, in which case it is removed.
 *
 * @param {string} html - The HTML fragment to clean.
 * @returns {string} The cleaned HTML fragment.
 */
const htmlPostParser = (html) => {
  const $ = cheerio.load(html, null, false);
  $("style").remove();
  // https://travail-emploi.gouv.fr/le-ministere-en-action/coronavirus-covid-19/questions-reponses-par-theme/article/mesures-de-prevention-dans-l-entreprise-contre-la-covid-19
  $("button").remove();
  $(".oembed-source").remove();

  // https://travail-emploi.gouv.fr/emploi-et-insertion/accompagnement-des-mutations-economiques/activite-partielle-chomage-partiel/article/activite-partielle-chomage-partiel
  const arrImgSrc = [];
  $("*")
    .contents()
    .each(function () {
      // nodeType 8 is a comment node
      if (this.nodeType !== 8) {
        return;
      }
      const result = /src=['"](.*?)['"]/.exec(this.nodeValue);
      if (result && result[1]) {
        arrImgSrc.push(result[1]);
      }
    });

  let currentIndex = 0;
  $("picture").replaceWith((_index, element) => {
    const src = arrImgSrc[currentIndex];
    if (src) {
      currentIndex++;
      // Absolute sources are used verbatim; relative ones are anchored on the
      // site root without doubling the slash.
      const absoluteSrc = /^https?:\/\//.test(src)
        ? src
        : `https://travail-emploi.gouv.fr/${src.replace(/^\/+/, "")}`;
      return `<img src="${absoluteSrc}" style="width:100%;height:auto;"/>`;
    }
    // No source left for this picture: keep the element itself. `$.html()`
    // without an argument would serialise the WHOLE document and splice a
    // copy of it in place of the picture.
    return $.html(element);
  });
  $("picture.adapt-img-wrapper").remove();
  return $.html();
};
|
|
61
|
-
exports.htmlPostParser = htmlPostParser;
|