@socialgouv/fiches-travail-data-types 4.532.0 → 4.533.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/fetch-data/parseDom.js +29 -14
- package/package.json +1 -1
|
@@ -34,9 +34,7 @@ function getCleanSrc(src) {
|
|
|
34
34
|
}
|
|
35
35
|
const formatPicture = (node) => {
|
|
36
36
|
let comment;
|
|
37
|
-
node.parentElement
|
|
38
|
-
.childNodes
|
|
39
|
-
.forEach(function (childNode) {
|
|
37
|
+
node.parentElement.childNodes.forEach(function (childNode) {
|
|
40
38
|
if (childNode.nodeName === "#comment" || childNode.nodeType === 8) {
|
|
41
39
|
if (childNode.data.match(SRC_REGEX)) {
|
|
42
40
|
comment = childNode;
|
|
@@ -52,9 +50,7 @@ const formatPicture = (node) => {
|
|
|
52
50
|
}
|
|
53
51
|
}
|
|
54
52
|
let image;
|
|
55
|
-
node
|
|
56
|
-
.childNodes
|
|
57
|
-
.forEach(function (childNode) {
|
|
53
|
+
node.childNodes.forEach(function (childNode) {
|
|
58
54
|
if (childNode.nodeName === "IMG") {
|
|
59
55
|
image = childNode;
|
|
60
56
|
}
|
|
@@ -150,20 +146,29 @@ function parseDom(dom, id, url) {
|
|
|
150
146
|
}
|
|
151
147
|
}
|
|
152
148
|
const title = titleElement.textContent.trim();
|
|
153
|
-
const dateRaw = $(dom.window.document, "meta[property*=modified_time]") ||
|
|
149
|
+
const dateRaw = $(dom.window.document, "meta[property*=modified_time]") ||
|
|
150
|
+
$(dom.window.document, "meta[property$=published_time]");
|
|
154
151
|
const [year, month, day] = dateRaw.getAttribute("content").split("-");
|
|
155
152
|
let intro = $(article, ".main-article__chapo") || "";
|
|
156
|
-
intro =
|
|
157
|
-
|
|
153
|
+
intro =
|
|
154
|
+
intro && intro.innerHTML.replace(/\n/g, "").replace(/\s+/g, " ").trim();
|
|
155
|
+
// clean script tags and everything inside it
|
|
156
|
+
intro = intro.replace(/<script[^>]*>([\s\S]*?)<\/script>/g, "");
|
|
157
|
+
const description = $(dom.window.document, "meta[name=description]")?.getAttribute("content") ??
|
|
158
|
+
"";
|
|
158
159
|
const sections = [];
|
|
159
160
|
const sectionTag = getSectionTag(article);
|
|
160
161
|
// First pass is only to get a potential untitled section at the top of the article
|
|
161
162
|
// This section has neither anchor nor title
|
|
162
163
|
let nextArticleElement = $(article, ".main-article__texte > *");
|
|
163
164
|
const untitledSection = {
|
|
164
|
-
anchor: "",
|
|
165
|
+
anchor: "",
|
|
166
|
+
html: "",
|
|
167
|
+
text: "",
|
|
168
|
+
title: title,
|
|
165
169
|
};
|
|
166
|
-
while (nextArticleElement &&
|
|
170
|
+
while (nextArticleElement &&
|
|
171
|
+
nextArticleElement.tagName.toLowerCase() !== sectionTag) {
|
|
167
172
|
if (nextArticleElement.textContent) {
|
|
168
173
|
if (!untitledSection.description) {
|
|
169
174
|
untitledSection.description = "temp description";
|
|
@@ -172,7 +177,8 @@ function parseDom(dom, id, url) {
|
|
|
172
177
|
.replace(/\n+/g, "")
|
|
173
178
|
.replace(/>\s+</g, "><")
|
|
174
179
|
.replace(/\s+/g, " ");
|
|
175
|
-
untitledSection.text +=
|
|
180
|
+
untitledSection.text +=
|
|
181
|
+
" " + nextArticleElement.textContent.replace(/\s+/g, " ").trim();
|
|
176
182
|
}
|
|
177
183
|
nextArticleElement = nextArticleElement.nextElementSibling;
|
|
178
184
|
}
|
|
@@ -198,7 +204,10 @@ function parseDom(dom, id, url) {
|
|
|
198
204
|
sections.push({
|
|
199
205
|
anchor: el.getAttribute("id") || (0, cdtn_slugify_1.default)(el.textContent),
|
|
200
206
|
description: sectionText.slice(0, 200).trim(),
|
|
201
|
-
html: html
|
|
207
|
+
html: html
|
|
208
|
+
.replace(/\n+/g, "")
|
|
209
|
+
.replace(/>\s+</g, "><")
|
|
210
|
+
.replace(/\s+/g, " "),
|
|
202
211
|
references: getReferences(sectionText),
|
|
203
212
|
text: sectionText,
|
|
204
213
|
title: el.textContent.trim(),
|
|
@@ -209,7 +218,13 @@ function parseDom(dom, id, url) {
|
|
|
209
218
|
throw new got_1.ParseError(`No sections`);
|
|
210
219
|
}
|
|
211
220
|
return {
|
|
212
|
-
date: `${day}/${month}/${year}`,
|
|
221
|
+
date: `${day}/${month}/${year}`,
|
|
222
|
+
description,
|
|
223
|
+
intro,
|
|
224
|
+
pubId: id,
|
|
225
|
+
sections,
|
|
226
|
+
title,
|
|
227
|
+
url,
|
|
213
228
|
};
|
|
214
229
|
}
|
|
215
230
|
exports.parseDom = parseDom;
|