@socialgouv/fiches-travail-data-types 4.438.0 → 4.440.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,7 +7,6 @@ exports.parseDom = void 0;
7
7
  const cdtn_slugify_1 = __importDefault(require("@socialgouv/cdtn-slugify"));
8
8
  const got_1 = require("got");
9
9
  const email_1 = require("../email");
10
- const postProcess_1 = require("./postProcess");
11
10
  const referenceExtractor_1 = require("./referenceExtractor");
12
11
  const referenceResolver_1 = require("./referenceResolver");
13
12
  const $$ = (node, selector) => Array.from(node.querySelectorAll(selector));
@@ -22,16 +21,8 @@ const formatEmail = (node) => {
22
21
  node.removeAttribute("data-cfemail");
23
22
  node.textContent = value;
24
23
  };
25
- const formatPicture = (node) => {
26
- const comment = node.parentElement.childNodes[0];
27
- if (comment.nodeName !== "#comment") {
28
- //upper sibbling node is not a comment so it's not a case we handle
29
- return;
30
- }
31
- const [, src = ""] = comment.data.match(/src=["']([^'"]*)["']/);
32
- if (src.length === 0) {
33
- return;
34
- }
24
+ const SRC_REGEX = /src=["']([^'"]*)["']/;
25
+ function getCleanSrc(src) {
35
26
  let [srcClean] = src.split("?");
36
27
  if (!srcClean.match(/^https?:\/\//)) {
37
28
  if (srcClean.slice(0, 1) !== "/") {
@@ -39,14 +30,50 @@ const formatPicture = (node) => {
39
30
  }
40
31
  srcClean = `https://travail-emploi.gouv.fr${srcClean}`;
41
32
  }
42
- // we remove the ie comment that have timestamp in the url
43
- comment.remove();
44
- // we add e
45
- const sourceNode = node.ownerDocument.createElement("source");
46
- sourceNode.setAttribute("srcset", srcClean);
47
- sourceNode.setAttribute("media", "(min-width: 300px)");
48
- node.appendChild(sourceNode);
49
- return node;
33
+ return srcClean;
34
+ }
35
+ const formatPicture = (node) => {
36
+ let comment;
37
+ node.parentElement
38
+ .childNodes
39
+ .forEach(function (childNode) {
40
+ if (childNode.nodeName === "#comment" || childNode.nodeType === 8) {
41
+ if (childNode.data.match(SRC_REGEX)) {
42
+ comment = childNode;
43
+ }
44
+ }
45
+ });
46
+ if (comment) {
47
+ const [, src = ""] = comment.data.match(SRC_REGEX);
48
+ if (src.length) {
49
+ const srcClean = getCleanSrc(src);
50
+ node.parentNode.innerHTML = `<img src="${srcClean}" style="width:100%;height:auto;" />`;
51
+ return;
52
+ }
53
+ }
54
+ let image;
55
+ node
56
+ .childNodes
57
+ .forEach(function (childNode) {
58
+ if (childNode.nodeName === "IMG") {
59
+ image = childNode;
60
+ }
61
+ });
62
+ if (image) {
63
+ node.replaceWith(image);
64
+ }
65
+ };
66
+ const formatImage = (node) => {
67
+ node.removeAttribute("onmousedown");
68
+ if (node.getAttribute("src").indexOf("data:image") === -1) {
69
+ node.removeAttribute("srcset");
70
+ node.removeAttribute("sizes");
71
+ let src = node.getAttribute("src");
72
+ if (!src.match(/^https?:\/\//)) {
73
+ const srcClean = getCleanSrc(src);
74
+ node.setAttribute("src", srcClean);
75
+ }
76
+ }
50
77
  };
51
78
  const formatAnchor = (node) => {
52
79
  if (node.innerHTML.trim() === "") {
@@ -80,6 +107,9 @@ const formatAnchor = (node) => {
80
107
  node.setAttribute("rel", "nofollow, noopener");
81
108
  }
82
109
  };
110
+ const removeNode = (node) => {
111
+ node.remove();
112
+ };
83
113
  const flattenCsBlocs = (node) => {
84
114
  node.insertAdjacentHTML("afterend", node.innerHTML);
85
115
  node.parentNode.removeChild(node);
@@ -101,27 +131,17 @@ function parseDom(dom, id, url) {
101
131
  if (!article) {
102
132
  throw new got_1.ParseError("no <main>");
103
133
  }
134
+ if (!id) {
135
+ throw new got_1.ParseError(`No id`);
136
+ }
104
137
  $$(article, "a").forEach(formatAnchor);
105
138
  $$(article, "picture").forEach(formatPicture);
106
139
  $$(article, "[data-cfemail]").forEach(formatEmail);
107
140
  $$(article, ".cs_blocs").forEach(flattenCsBlocs);
108
- const imgs = $$(article, "img");
109
- imgs.forEach((node) => {
110
- // remove adaptImgFix(this) on hero img
111
- node.removeAttribute("onmousedown");
112
- });
113
- imgs
114
- .filter((node) => node.getAttribute("src").indexOf("data:image") === -1)
115
- .forEach((node) => {
116
- let src = node.getAttribute("src");
117
- if (!src.match(/^https?:\/\//)) {
118
- if (src.slice(0, 1) !== "/") {
119
- src = "/" + src;
120
- }
121
- src = `https://travail-emploi.gouv.fr${src}`;
122
- node.setAttribute("src", src);
123
- }
124
- });
141
+ $$(article, "img").forEach(formatImage);
142
+ $$(article, "style").forEach(removeNode);
143
+ $$(article, "button").forEach(removeNode);
144
+ $$(article, ".oembed-source").forEach(removeNode);
125
145
  let titleElement = $(article, "h1");
126
146
  if (!titleElement) {
127
147
  titleElement = $(article, "h2");
@@ -130,40 +150,29 @@ function parseDom(dom, id, url) {
130
150
  }
131
151
  }
132
152
  const title = titleElement.textContent.trim();
133
- if (!id) {
134
- throw new got_1.ParseError(`No id`);
135
- }
136
- const dateRaw = $(dom.window.document, "meta[property*=modified_time]") ||
137
- $(dom.window.document, "meta[property$=published_time]");
153
+ const dateRaw = $(dom.window.document, "meta[property*=modified_time]") || $(dom.window.document, "meta[property$=published_time]");
138
154
  const [year, month, day] = dateRaw.getAttribute("content").split("-");
139
155
  let intro = $(article, ".main-article__chapo") || "";
140
- intro =
141
- intro && intro.innerHTML.replace(/\n/g, "").replace(/\s+/g, " ").trim();
142
- const description = $(dom.window.document, "meta[name=description]")?.getAttribute("content") ??
143
- "";
156
+ intro = intro && intro.innerHTML.replace(/\n/g, "").replace(/\s+/g, " ").trim();
157
+ const description = $(dom.window.document, "meta[name=description]")?.getAttribute("content") ?? "";
144
158
  const sections = [];
145
159
  const sectionTag = getSectionTag(article);
146
160
  // First pass is only to get a potential untitled section at the top of the article
147
161
  // This section has neither anchor nor title
148
162
  let nextArticleElement = $(article, ".main-article__texte > *");
149
163
  const untitledSection = {
150
- anchor: "",
151
- html: "",
152
- text: "",
153
- title: title,
164
+ anchor: "", html: "", text: "", title: title,
154
165
  };
155
- while (nextArticleElement &&
156
- nextArticleElement.tagName.toLowerCase() !== sectionTag) {
166
+ while (nextArticleElement && nextArticleElement.tagName.toLowerCase() !== sectionTag) {
157
167
  if (nextArticleElement.textContent) {
158
168
  if (!untitledSection.description) {
159
169
  untitledSection.description = "temp description";
160
170
  }
161
- untitledSection.html += (0, postProcess_1.htmlPostParser)(nextArticleElement.outerHTML
171
+ untitledSection.html += nextArticleElement.outerHTML
162
172
  .replace(/\n+/g, "")
163
173
  .replace(/>\s+</g, "><")
164
- .replace(/\s+/g, " "));
165
- untitledSection.text +=
166
- " " + nextArticleElement.textContent.replace(/\s+/g, " ").trim();
174
+ .replace(/\s+/g, " ");
175
+ untitledSection.text += " " + nextArticleElement.textContent.replace(/\s+/g, " ").trim();
167
176
  }
168
177
  nextArticleElement = nextArticleElement.nextElementSibling;
169
178
  }
@@ -189,7 +198,7 @@ function parseDom(dom, id, url) {
189
198
  sections.push({
190
199
  anchor: el.getAttribute("id") || (0, cdtn_slugify_1.default)(el.textContent),
191
200
  description: sectionText.slice(0, 200).trim(),
192
- html: (0, postProcess_1.htmlPostParser)(html.replace(/\n+/g, "").replace(/>\s+</g, "><").replace(/\s+/g, " ")),
201
+ html: html.replace(/\n+/g, "").replace(/>\s+</g, "><").replace(/\s+/g, " "),
193
202
  references: getReferences(sectionText),
194
203
  text: sectionText,
195
204
  title: el.textContent.trim(),
@@ -200,13 +209,7 @@ function parseDom(dom, id, url) {
200
209
  throw new got_1.ParseError(`No sections`);
201
210
  }
202
211
  return {
203
- date: `${day}/${month}/${year}`,
204
- description,
205
- intro,
206
- pubId: id,
207
- sections,
208
- title,
209
- url,
212
+ date: `${day}/${month}/${year}`, description, intro, pubId: id, sections, title, url,
210
213
  };
211
214
  }
212
215
  exports.parseDom = parseDom;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@socialgouv/fiches-travail-data-types",
3
- "version": "4.438.0",
3
+ "version": "4.440.0",
4
4
  "main": "build/index.js",
5
5
  "module": "build/index.js",
6
6
  "files": [
@@ -34,7 +34,7 @@
34
34
  "babel-jest": "^27.4.4",
35
35
  "eslint": "^8.28.0",
36
36
  "eslint-plugin-jest": "^27.1.6",
37
- "husky": "^7.0.4",
37
+ "husky": "^8.0.0",
38
38
  "jest": "^29.3.1",
39
39
  "jsdom": "^17.0.0",
40
40
  "prettier": "^2.5.1",
@@ -1 +0,0 @@
1
- export declare const htmlPostParser: (html: string) => string;
@@ -1,61 +0,0 @@
1
- "use strict";
2
- var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
- if (k2 === undefined) k2 = k;
4
- var desc = Object.getOwnPropertyDescriptor(m, k);
5
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
- desc = { enumerable: true, get: function() { return m[k]; } };
7
- }
8
- Object.defineProperty(o, k2, desc);
9
- }) : (function(o, m, k, k2) {
10
- if (k2 === undefined) k2 = k;
11
- o[k2] = m[k];
12
- }));
13
- var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
- Object.defineProperty(o, "default", { enumerable: true, value: v });
15
- }) : function(o, v) {
16
- o["default"] = v;
17
- });
18
- var __importStar = (this && this.__importStar) || function (mod) {
19
- if (mod && mod.__esModule) return mod;
20
- var result = {};
21
- if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
22
- __setModuleDefault(result, mod);
23
- return result;
24
- };
25
- Object.defineProperty(exports, "__esModule", { value: true });
26
- exports.htmlPostParser = void 0;
27
- const cheerio = __importStar(require("cheerio"));
28
- const htmlPostParser = (html) => {
29
- const $ = cheerio.load(html, null, false);
30
- const arrImgSrc = [];
31
- let currentIndex = 0;
32
- $("style").remove();
33
- // https://travail-emploi.gouv.fr/le-ministere-en-action/coronavirus-covid-19/questions-reponses-par-theme/article/mesures-de-prevention-dans-l-entreprise-contre-la-covid-19
34
- $("button").remove();
35
- $(".oembed-source").remove();
36
- // https://travail-emploi.gouv.fr/emploi-et-insertion/accompagnement-des-mutations-economiques/activite-partielle-chomage-partiel/article/activite-partielle-chomage-partiel
37
- $("*")
38
- .contents()
39
- .each(function () {
40
- if (this.nodeType === 8) {
41
- const regex = /src=['"](.*?)['"]/;
42
- const result = regex.exec(this.nodeValue);
43
- if (result) {
44
- const src = result[0].slice(5, -1);
45
- if (src)
46
- arrImgSrc.push(src);
47
- }
48
- }
49
- });
50
- $("picture").replaceWith(() => {
51
- const src = arrImgSrc[currentIndex];
52
- if (src) {
53
- currentIndex++;
54
- return `<img src="https://travail-emploi.gouv.fr/${src}" style="width:100%;height:auto;"/>`;
55
- }
56
- return $.html()?.toString() ?? "";
57
- });
58
- $("picture.adapt-img-wrapper").remove();
59
- return $.html();
60
- };
61
- exports.htmlPostParser = htmlPostParser;