@socialgouv/fiches-travail-data-types 4.438.0 → 4.439.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,7 +7,6 @@ exports.parseDom = void 0;
7
7
  const cdtn_slugify_1 = __importDefault(require("@socialgouv/cdtn-slugify"));
8
8
  const got_1 = require("got");
9
9
  const email_1 = require("../email");
10
- const postProcess_1 = require("./postProcess");
11
10
  const referenceExtractor_1 = require("./referenceExtractor");
12
11
  const referenceResolver_1 = require("./referenceResolver");
13
12
  const $$ = (node, selector) => Array.from(node.querySelectorAll(selector));
@@ -22,31 +21,50 @@ const formatEmail = (node) => {
22
21
  node.removeAttribute("data-cfemail");
23
22
  node.textContent = value;
24
23
  };
24
+ const SRC_REGEX = /src=["']([^'"]*)["']/;
25
+ function getCleanSrc(src) {
26
+ let [srcClean] = src.split("?");
27
+ if (!srcClean.match(/^https?:\/\//)) {
28
+ if (srcClean.slice(0, 1) !== "/") {
29
+ srcClean = "/" + srcClean;
30
+ }
31
+ srcClean = `https://travail-emploi.gouv.fr${srcClean}`;
32
+ }
33
+ return srcClean;
34
+ }
25
35
  const formatPicture = (node) => {
26
- const comment = node.parentElement.childNodes[0];
27
- if (comment.nodeName !== "#comment") {
36
+ let comment;
37
+ node.parentElement
38
+ .childNodes
39
+ .forEach(function (childNode) {
40
+ if (childNode.nodeName === "#comment" || childNode.nodeType === 8) {
41
+ if (childNode.data.match(SRC_REGEX)) {
42
+ comment = childNode;
43
+ }
44
+ }
45
+ });
46
+ if (!comment) {
28
47
  // upper sibling node is not a comment so it's not a case we handle
29
48
  return;
30
49
  }
31
- const [, src = ""] = comment.data.match(/src=["']([^'"]*)["']/);
50
+ const [, src = ""] = comment.data.match(SRC_REGEX);
32
51
  if (src.length === 0) {
33
52
  return;
34
53
  }
35
- let [srcClean] = src.split("?");
36
- if (!srcClean.match(/^https?:\/\//)) {
37
- if (srcClean.slice(0, 1) !== "/") {
38
- srcClean = "/" + srcClean;
54
+ const srcClean = getCleanSrc(src);
55
+ node.parentNode.innerHTML = `<img src="${srcClean}" style="width:100%;height:auto;" />`;
56
+ };
57
+ const formatImage = (node) => {
58
+ node.removeAttribute("onmousedown");
59
+ if (node.getAttribute("src").indexOf("data:image") === -1) {
60
+ let src = node.getAttribute("src");
61
+ if (!src.match(/^https?:\/\//)) {
62
+ const srcClean = getCleanSrc(src);
63
+ node.setAttribute("src", srcClean);
64
+ node.removeAttribute("srcset");
65
+ node.removeAttribute("sizes");
39
66
  }
40
- srcClean = `https://travail-emploi.gouv.fr${srcClean}`;
41
67
  }
42
- // we remove the ie comment that have timestamp in the url
43
- comment.remove();
44
- // we add e
45
- const sourceNode = node.ownerDocument.createElement("source");
46
- sourceNode.setAttribute("srcset", srcClean);
47
- sourceNode.setAttribute("media", "(min-width: 300px)");
48
- node.appendChild(sourceNode);
49
- return node;
50
68
  };
51
69
  const formatAnchor = (node) => {
52
70
  if (node.innerHTML.trim() === "") {
@@ -80,6 +98,9 @@ const formatAnchor = (node) => {
80
98
  node.setAttribute("rel", "nofollow, noopener");
81
99
  }
82
100
  };
101
+ const removeNode = (node) => {
102
+ node.remove();
103
+ };
83
104
  const flattenCsBlocs = (node) => {
84
105
  node.insertAdjacentHTML("afterend", node.innerHTML);
85
106
  node.parentNode.removeChild(node);
@@ -101,27 +122,17 @@ function parseDom(dom, id, url) {
101
122
  if (!article) {
102
123
  throw new got_1.ParseError("no <main>");
103
124
  }
125
+ if (!id) {
126
+ throw new got_1.ParseError(`No id`);
127
+ }
104
128
  $$(article, "a").forEach(formatAnchor);
105
129
  $$(article, "picture").forEach(formatPicture);
106
130
  $$(article, "[data-cfemail]").forEach(formatEmail);
107
131
  $$(article, ".cs_blocs").forEach(flattenCsBlocs);
108
- const imgs = $$(article, "img");
109
- imgs.forEach((node) => {
110
- // remove adaptImgFix(this) on hero img
111
- node.removeAttribute("onmousedown");
112
- });
113
- imgs
114
- .filter((node) => node.getAttribute("src").indexOf("data:image") === -1)
115
- .forEach((node) => {
116
- let src = node.getAttribute("src");
117
- if (!src.match(/^https?:\/\//)) {
118
- if (src.slice(0, 1) !== "/") {
119
- src = "/" + src;
120
- }
121
- src = `https://travail-emploi.gouv.fr${src}`;
122
- node.setAttribute("src", src);
123
- }
124
- });
132
+ $$(article, "img").forEach(formatImage);
133
+ $$(article, "style").forEach(removeNode);
134
+ $$(article, "button").forEach(removeNode);
135
+ $$(article, ".oembed-source").forEach(removeNode);
125
136
  let titleElement = $(article, "h1");
126
137
  if (!titleElement) {
127
138
  titleElement = $(article, "h2");
@@ -130,40 +141,29 @@ function parseDom(dom, id, url) {
130
141
  }
131
142
  }
132
143
  const title = titleElement.textContent.trim();
133
- if (!id) {
134
- throw new got_1.ParseError(`No id`);
135
- }
136
- const dateRaw = $(dom.window.document, "meta[property*=modified_time]") ||
137
- $(dom.window.document, "meta[property$=published_time]");
144
+ const dateRaw = $(dom.window.document, "meta[property*=modified_time]") || $(dom.window.document, "meta[property$=published_time]");
138
145
  const [year, month, day] = dateRaw.getAttribute("content").split("-");
139
146
  let intro = $(article, ".main-article__chapo") || "";
140
- intro =
141
- intro && intro.innerHTML.replace(/\n/g, "").replace(/\s+/g, " ").trim();
142
- const description = $(dom.window.document, "meta[name=description]")?.getAttribute("content") ??
143
- "";
147
+ intro = intro && intro.innerHTML.replace(/\n/g, "").replace(/\s+/g, " ").trim();
148
+ const description = $(dom.window.document, "meta[name=description]")?.getAttribute("content") ?? "";
144
149
  const sections = [];
145
150
  const sectionTag = getSectionTag(article);
146
151
  // First pass is only to get a potential untitled section at the top of the article
147
152
  // This section has neither anchor nor title
148
153
  let nextArticleElement = $(article, ".main-article__texte > *");
149
154
  const untitledSection = {
150
- anchor: "",
151
- html: "",
152
- text: "",
153
- title: title,
155
+ anchor: "", html: "", text: "", title: title,
154
156
  };
155
- while (nextArticleElement &&
156
- nextArticleElement.tagName.toLowerCase() !== sectionTag) {
157
+ while (nextArticleElement && nextArticleElement.tagName.toLowerCase() !== sectionTag) {
157
158
  if (nextArticleElement.textContent) {
158
159
  if (!untitledSection.description) {
159
160
  untitledSection.description = "temp description";
160
161
  }
161
- untitledSection.html += (0, postProcess_1.htmlPostParser)(nextArticleElement.outerHTML
162
+ untitledSection.html += nextArticleElement.outerHTML
162
163
  .replace(/\n+/g, "")
163
164
  .replace(/>\s+</g, "><")
164
- .replace(/\s+/g, " "));
165
- untitledSection.text +=
166
- " " + nextArticleElement.textContent.replace(/\s+/g, " ").trim();
165
+ .replace(/\s+/g, " ");
166
+ untitledSection.text += " " + nextArticleElement.textContent.replace(/\s+/g, " ").trim();
167
167
  }
168
168
  nextArticleElement = nextArticleElement.nextElementSibling;
169
169
  }
@@ -189,7 +189,7 @@ function parseDom(dom, id, url) {
189
189
  sections.push({
190
190
  anchor: el.getAttribute("id") || (0, cdtn_slugify_1.default)(el.textContent),
191
191
  description: sectionText.slice(0, 200).trim(),
192
- html: (0, postProcess_1.htmlPostParser)(html.replace(/\n+/g, "").replace(/>\s+</g, "><").replace(/\s+/g, " ")),
192
+ html: html.replace(/\n+/g, "").replace(/>\s+</g, "><").replace(/\s+/g, " "),
193
193
  references: getReferences(sectionText),
194
194
  text: sectionText,
195
195
  title: el.textContent.trim(),
@@ -200,13 +200,7 @@ function parseDom(dom, id, url) {
200
200
  throw new got_1.ParseError(`No sections`);
201
201
  }
202
202
  return {
203
- date: `${day}/${month}/${year}`,
204
- description,
205
- intro,
206
- pubId: id,
207
- sections,
208
- title,
209
- url,
203
+ date: `${day}/${month}/${year}`, description, intro, pubId: id, sections, title, url,
210
204
  };
211
205
  }
212
206
  exports.parseDom = parseDom;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@socialgouv/fiches-travail-data-types",
3
- "version": "4.438.0",
3
+ "version": "4.439.0",
4
4
  "main": "build/index.js",
5
5
  "module": "build/index.js",
6
6
  "files": [
@@ -34,7 +34,7 @@
34
34
  "babel-jest": "^27.4.4",
35
35
  "eslint": "^8.28.0",
36
36
  "eslint-plugin-jest": "^27.1.6",
37
- "husky": "^7.0.4",
37
+ "husky": "^8.0.0",
38
38
  "jest": "^29.3.1",
39
39
  "jsdom": "^17.0.0",
40
40
  "prettier": "^2.5.1",
@@ -1 +0,0 @@
1
- export declare const htmlPostParser: (html: string) => string;
@@ -1,61 +0,0 @@
1
- "use strict";
2
- var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
- if (k2 === undefined) k2 = k;
4
- var desc = Object.getOwnPropertyDescriptor(m, k);
5
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
- desc = { enumerable: true, get: function() { return m[k]; } };
7
- }
8
- Object.defineProperty(o, k2, desc);
9
- }) : (function(o, m, k, k2) {
10
- if (k2 === undefined) k2 = k;
11
- o[k2] = m[k];
12
- }));
13
- var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
- Object.defineProperty(o, "default", { enumerable: true, value: v });
15
- }) : function(o, v) {
16
- o["default"] = v;
17
- });
18
- var __importStar = (this && this.__importStar) || function (mod) {
19
- if (mod && mod.__esModule) return mod;
20
- var result = {};
21
- if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
22
- __setModuleDefault(result, mod);
23
- return result;
24
- };
25
- Object.defineProperty(exports, "__esModule", { value: true });
26
- exports.htmlPostParser = void 0;
27
- const cheerio = __importStar(require("cheerio"));
28
- const htmlPostParser = (html) => {
29
- const $ = cheerio.load(html, null, false);
30
- const arrImgSrc = [];
31
- let currentIndex = 0;
32
- $("style").remove();
33
- // https://travail-emploi.gouv.fr/le-ministere-en-action/coronavirus-covid-19/questions-reponses-par-theme/article/mesures-de-prevention-dans-l-entreprise-contre-la-covid-19
34
- $("button").remove();
35
- $(".oembed-source").remove();
36
- // https://travail-emploi.gouv.fr/emploi-et-insertion/accompagnement-des-mutations-economiques/activite-partielle-chomage-partiel/article/activite-partielle-chomage-partiel
37
- $("*")
38
- .contents()
39
- .each(function () {
40
- if (this.nodeType === 8) {
41
- const regex = /src=['"](.*?)['"]/;
42
- const result = regex.exec(this.nodeValue);
43
- if (result) {
44
- const src = result[0].slice(5, -1);
45
- if (src)
46
- arrImgSrc.push(src);
47
- }
48
- }
49
- });
50
- $("picture").replaceWith(() => {
51
- const src = arrImgSrc[currentIndex];
52
- if (src) {
53
- currentIndex++;
54
- return `<img src="https://travail-emploi.gouv.fr/${src}" style="width:100%;height:auto;"/>`;
55
- }
56
- return $.html()?.toString() ?? "";
57
- });
58
- $("picture.adapt-img-wrapper").remove();
59
- return $.html();
60
- };
61
- exports.htmlPostParser = htmlPostParser;