@socialgouv/fiches-travail-data-types 4.699.0 → 4.701.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,2 +1,2 @@
1
- export function fetchFeed(url: any): Promise<any[]>;
1
+ export function fetchFeed(): Promise<any>;
2
2
  export function scrap(urls: any): Promise<any[]>;
@@ -5,22 +5,14 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
5
5
  Object.defineProperty(exports, "__esModule", { value: true });
6
6
  exports.scrap = exports.fetchFeed = void 0;
7
7
  const fs_1 = __importDefault(require("fs"));
8
- const got_1 = __importDefault(require("got"));
9
8
  const p_limit_1 = __importDefault(require("p-limit"));
10
9
  const path_1 = __importDefault(require("path"));
11
- const injectToken_1 = require("./injectToken");
12
10
  const scrapUrl_1 = require("./scrapUrl");
13
11
  const FEED_URL = "https://travail-emploi.gouv.fr/?page=oseo_json";
14
12
  const limit = (0, p_limit_1.default)(10);
15
- async function fetchFeed(url) {
16
- const response = await got_1.default.post((0, injectToken_1.injectToken)(url), {
17
- http2: true,
18
- retry: 3,
19
- });
20
- const { fiches: feed } = JSON.parse(response.body);
21
- const localJson = fs_1.default.readFileSync(path_1.default.join(__dirname, "../../local.data.json"), "utf8");
22
- const { fiches: localFeed } = JSON.parse(localJson);
23
- return [...feed, ...localFeed];
13
/**
 * Loads the feed of fiches from the bundled `local.data.json` file.
 *
 * The file is resolved relative to this module (`../../local.data.json`),
 * read synchronously as UTF-8 and parsed as JSON.
 *
 * @returns {Promise<any>} the `fiches` property of the parsed document.
 * @throws if the file cannot be read or does not contain valid JSON.
 */
async function fetchFeed() {
    const localDataPath = path_1.default.join(__dirname, "../../local.data.json");
    const rawJson = fs_1.default.readFileSync(localDataPath, "utf8");
    const parsed = JSON.parse(rawJson);
    return parsed.fiches;
}
25
17
  exports.fetchFeed = fetchFeed;
26
18
  async function scrap(urls) {
@@ -1,14 +1,10 @@
1
1
  export function parseDom(dom: any, id: any, url: any): {
2
- date: string;
2
+ date: never;
3
3
  description: any;
4
4
  intro: any;
5
5
  pubId: any;
6
- sections: {
7
- anchor: string;
8
- html: string;
9
- text: string;
10
- title: any;
11
- }[];
6
+ sections: any;
12
7
  title: any;
13
8
  url: any;
14
9
  };
10
+ export function textClean(text: any, noNbsp?: boolean): any;
@@ -3,12 +3,13 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
3
3
  return (mod && mod.__esModule) ? mod : { "default": mod };
4
4
  };
5
5
  Object.defineProperty(exports, "__esModule", { value: true });
6
- exports.parseDom = void 0;
6
+ exports.parseDom = exports.textClean = void 0;
7
7
  const cdtn_slugify_1 = __importDefault(require("@socialgouv/cdtn-slugify"));
8
8
  const got_1 = require("got");
9
9
  const email_1 = require("../email");
10
10
  const referenceExtractor_1 = require("./referenceExtractor");
11
11
  const referenceResolver_1 = require("./referenceResolver");
12
+ const jsdom_1 = require("jsdom");
12
13
  const $$ = (node, selector) => Array.from(node.querySelectorAll(selector));
13
14
  const $ = (node, selector) => node.querySelector(selector);
14
15
  function unwrapEmail(data = "") {
@@ -110,22 +111,166 @@ const flattenCsBlocs = (node) => {
110
111
  node.insertAdjacentHTML("afterend", node.innerHTML);
111
112
  node.parentNode.removeChild(node);
112
113
  };
113
- const getSectionTag = (article) => {
114
- const h3 = $$(article, ".main-article__texte > h3").length && "h3";
115
- const h4 = $$(article, ".main-article__texte > h4").length && "h4";
116
- const h5 = $$(article, ".main-article__texte > h5").length && "h5";
117
- return h3 || h4 || h5 || "sectionTag";
118
- };
119
114
  const getReferences = (text) => {
120
115
  // first we extract the tokens referencing articles
121
116
  const references = (0, referenceExtractor_1.extractReferences)(text);
122
117
  // then we try to resolve the actual articles ids using legi-data
123
118
  return (0, referenceResolver_1.resolveReferences)(references);
124
119
  };
120
/**
 * Normalizes the whitespace of a text or HTML fragment.
 *
 * Steps, in order:
 *  1. line breaks (and, when `noNbsp` is true, no-break spaces) become a space;
 *  2. a space is inserted after every run of `.`, `!` or `?` that is not
 *     located inside an HTML tag (so attribute values such as `href="a.html"`
 *     are left alone);
 *  3. runs of spaces are collapsed to a single space;
 *  4. the result is trimmed.
 *
 * @param text fragment to clean; `null`/`undefined` yields an empty string.
 * @param noNbsp when true, no-break spaces are replaced too, both as the
 *   `&nbsp;` entity (serialized HTML) and as the decoded U+00A0 character
 *   (what `textContent` returns).
 * @returns the cleaned string.
 */
const textClean = (text, noNbsp = false) => {
    // Tolerate a missing value instead of throwing on `.replace`.
    if (text === null || text === undefined) {
        return "";
    }
    // `&nbsp;` only exists in serialized HTML; DOM text holds the real
    // U+00A0 character, so both forms must be matched.
    const toSpace = noNbsp ? /\n|&nbsp;|\u00a0/g : /\n/g;
    return text
        .replace(toSpace, " ")
        .replace(/([.!?]+)(?![^<]*>)/g, "$1 ")
        .replace(/[ ]{2,}/g, " ")
        .trim();
};
128
+ exports.textClean = textClean;
129
/**
 * Counts the sections whose text is already contained in the highlight.
 *
 * The comparison ignores case and all whitespace. Sections whose text is
 * empty once whitespace is removed are never counted: an empty string is
 * "included" in every string and would otherwise be reported as a duplicate.
 *
 * @param sections array of objects exposing a `text` string.
 * @param highlight object exposing a `text` string, or a falsy value when
 *   there is no highlight.
 * @returns the number of duplicated sections (0 when there is no highlight).
 */
const duplicateContent = (sections, highlight) => {
    if (!highlight) {
        return 0;
    }
    const normalize = (value) => (value || "").replace(/\s+/g, "").toLowerCase();
    // Normalized once, not once per section.
    const highlightText = normalize(highlight.text);
    return sections.filter((section) => {
        const sectionText = normalize(section.text);
        return sectionText !== "" && highlightText.includes(sectionText);
    }).length;
};
138
/**
 * Splits the `.main-content` element of a page into one section per <h2>.
 *
 * For each <h2>, the section gathers the element siblings that follow it up
 * to the next <h2>. When any resulting section ends up with empty HTML, the
 * whole main content is returned as a single section titled "Contenu".
 *
 * @param dom object exposing `window.document` (the parsed page).
 * @returns an array of `{ title, html, text }` sections.
 * @throws {Error} when the document has no `.main-content` element.
 */
function parseHTMLSections(dom) {
    const mainContent = $(dom.window.document, ".main-content");
    if (!mainContent) {
        throw new Error('No <div class="main-content"> found in the HTML content.');
    }
    const rawSections = $$(mainContent, "h2").map((heading) => {
        const title = (0, exports.textClean)(heading.textContent, true) || "";
        // Content normally starts right after the heading. When the heading
        // is the last element of its container, continue from the sibling of
        // its parent, then from the sibling of its grandparent.
        const parent = heading.parentElement;
        let cursor = heading.nextElementSibling;
        if (!cursor) {
            cursor = parent ? parent.nextElementSibling : undefined;
        }
        if (!cursor && parent) {
            const grandParent = parent.parentElement;
            cursor = grandParent ? grandParent.nextElementSibling : undefined;
        }
        const htmlParts = [];
        const textParts = [];
        for (; cursor && cursor.nodeName !== "H2"; cursor = cursor.nextElementSibling) {
            htmlParts.push((0, exports.textClean)(cursor.outerHTML || "", true));
            textParts.push((0, exports.textClean)(cursor.textContent || "", true));
        }
        return {
            title,
            html: htmlParts.join("").trim(),
            text: textParts.join("").trim(),
        };
    });
    // A collected sibling may itself wrap a later <h2> and everything after
    // it. Second pass: cut the HTML at the first nested <h2>, since a heading
    // must not be part of a section body.
    const cleanSections = rawSections.map((section) => ({
        ...section,
        html: removeExtraH2(section.html),
    }));
    const hasEmptySection = cleanSections.some((section) => section.html === "");
    if (hasEmptySection) {
        return [
            {
                title: "Contenu",
                html: mainContent.innerHTML,
                text: mainContent.textContent,
            },
        ];
    }
    return cleanSections;
}
194
/**
 * Truncates an HTML fragment at its first <h2>.
 *
 * The fragment is parsed inside a wrapper <div>. When an <h2> is found, the
 * element siblings following its parent, the element siblings following the
 * <h2> itself, and the <h2> are removed. The remaining markup is returned
 * after whitespace cleaning.
 *
 * @param html HTML fragment to trim.
 * @returns the cleaned inner HTML of the wrapper.
 */
const removeExtraH2 = (html) => {
    const { document } = new jsdom_1.JSDOM(`<div>${html}</div>`).window;
    const wrapper = $(document, "div");
    const heading = $(wrapper, "h2");
    if (heading) {
        // Captured before any removal, same as reading it up front.
        const container = heading.parentElement;
        for (let next = container.nextElementSibling; next; next = container.nextElementSibling) {
            next.remove();
        }
        for (let next = heading.nextElementSibling; next; next = heading.nextElementSibling) {
            next.remove();
        }
        heading.remove();
    }
    return (0, exports.textClean)(wrapper.innerHTML, true);
};
212
/**
 * Extracts the untitled "highlight" that precedes the first <h2> of the
 * `.main-content` element.
 *
 * The walk goes element by element (`nextElementSibling`), like
 * `parseHTMLSections` does. Walking with `nextSibling` also visits text and
 * comment nodes, which have no `outerHTML` but do have `textContent`: comment
 * data and stray text then end up in `text` without any counterpart in
 * `html`.
 *
 * @param dom object exposing `window.document` (the parsed page).
 * @returns `{ title: "", html, text }`, or `undefined` when the main content
 *   starts with an <h2> or has no child element.
 * @throws {Error} when the document has no `.main-content` element.
 */
const parseHighlight = (dom) => {
    const document = dom.window.document;
    const mainContent = $(document, ".main-content");
    if (!mainContent) {
        throw new Error('No <div class="main-content"> found in the HTML content.');
    }
    const highlightHtmlContent = [];
    const highlightTextContent = [];
    let current = mainContent.firstElementChild;
    while (current && current.nodeName !== "H2") {
        highlightHtmlContent.push((0, exports.textClean)(current.outerHTML || "", true));
        highlightTextContent.push((0, exports.textClean)(current.textContent || "", true));
        // Elements only: skip text and comment nodes.
        current = current.nextElementSibling;
    }
    if (highlightHtmlContent.length === 0) {
        return undefined;
    }
    return {
        title: "",
        html: (0, exports.textClean)(highlightHtmlContent.join("").trim(), true),
        text: highlightTextContent.join("").trim(),
    };
};
235
/**
 * Reads the article date from the <span> elements of its first <p>.
 *
 * A span containing "Mis à jour le" provides the update date and a span
 * containing "Publié le" provides the publication date. The update date is
 * preferred when both are present.
 *
 * @param article element to search in.
 * @returns the first `d/m/yyyy`-shaped substring of the relevant span.
 * @throws {Error} when there is no <p>, or when no date is found in it.
 */
const getDate = (article) => {
    const firstParagraph = $(article, "p");
    if (!firstParagraph) {
        throw new Error("Can't find the updated date, first paragraph missing");
    }
    const datePattern = /\d{1,2}\/\d{1,2}\/\d{4}/;
    let published = null;
    let updated = null;
    for (const { textContent } of $$(firstParagraph, "span")) {
        if (textContent.includes("Publié le")) {
            published = textContent.match(datePattern);
        }
        if (textContent.includes("Mis à jour le")) {
            updated = textContent.match(datePattern);
        }
    }
    const match = updated || published;
    if (!match) {
        throw new Error("Can't find the updated date in the first paragraph");
    }
    return match[0];
};
260
/**
 * Turns raw `{ title, html, text }` sections into their published shape.
 *
 * Each section gains an `anchor` (slug of its title), a `description` (the
 * first 200 characters of its text) and the `references` resolved from its
 * text.
 *
 * @param sections array of raw sections.
 * @returns a new array; the input objects are not modified.
 */
const populateSections = (sections) => {
    const populated = [];
    for (const section of sections) {
        const anchor = (0, cdtn_slugify_1.default)(section.title);
        const description = section.text.slice(0, 200);
        const references = getReferences(section.text);
        populated.push({
            anchor,
            description,
            html: section.html,
            references,
            text: section.text,
            title: section.title,
        });
    }
    return populated;
};
125
270
  function parseDom(dom, id, url) {
126
- const article = $(dom.window.document, "main");
271
+ const article = $(dom.window.document, "article");
127
272
  if (!article) {
128
- throw new got_1.ParseError("no <main>");
273
+ throw new got_1.ParseError("no <article>");
129
274
  }
130
275
  if (!id) {
131
276
  throw new got_1.ParseError(`No id`);
@@ -135,8 +280,6 @@ function parseDom(dom, id, url) {
135
280
  $$(article, "[data-cfemail]").forEach(formatEmail);
136
281
  $$(article, ".cs_blocs").forEach(flattenCsBlocs);
137
282
  $$(article, "img").forEach(formatImage);
138
- $$(article, "style").forEach(removeNode);
139
- $$(article, "button").forEach(removeNode);
140
283
  $$(article, ".oembed-source").forEach(removeNode);
141
284
  let titleElement = $(article, "h1");
142
285
  if (!titleElement) {
@@ -145,87 +288,38 @@ function parseDom(dom, id, url) {
145
288
  throw new got_1.ParseError("No <h1> or <h2> element");
146
289
  }
147
290
  }
148
- const title = titleElement.textContent.trim();
149
- const dateRaw = $(dom.window.document, "meta[property*=modified_time]") ||
150
- $(dom.window.document, "meta[property$=published_time]");
151
- const [year, month, day] = dateRaw.getAttribute("content").split("-");
152
- let intro = $(article, ".main-article__chapo") || "";
291
+ const title = (0, exports.textClean)(titleElement.textContent, true);
292
+ const date = getDate(article);
293
+ let intro = $(article, ".fr-text--lead") || "";
153
294
  intro =
154
295
  intro &&
155
- intro.innerHTML
156
- .replace(/\n/g, "")
157
- .replace(/\s+/g, " ")
158
- .trim()
159
- .replace(/<script[^>]*>([\s\S]*?)<\/script>/g, "");
296
+ (0, exports.textClean)(intro.innerHTML, true).replace(/<script[^>]*>([\s\S]*?)<\/script>/g, "");
160
297
  const description = $(dom.window.document, "meta[name=description]")?.getAttribute("content") ??
161
298
  "";
162
- const sections = [];
163
- const sectionTag = getSectionTag(article);
164
- // First pass is only to get a potential untitled section at the top of the article
165
- // This section has neither anchor nor title
166
- let nextArticleElement = $(article, ".main-article__texte > *");
167
- const untitledSection = {
168
- anchor: "",
169
- html: "",
170
- text: "",
171
- title: title,
172
- };
173
- while (nextArticleElement &&
174
- nextArticleElement.tagName.toLowerCase() !== sectionTag) {
175
- if (nextArticleElement.textContent) {
176
- if (!untitledSection.description) {
177
- untitledSection.description = "temp description";
178
- }
179
- untitledSection.html += nextArticleElement.outerHTML
180
- .replace(/\n+/g, "")
181
- .replace(/>\s+</g, "><")
182
- .replace(/\s+/g, " ");
183
- untitledSection.text +=
184
- " " + nextArticleElement.textContent.replace(/\s+/g, " ").trim();
185
- }
186
- nextArticleElement = nextArticleElement.nextElementSibling;
187
- }
188
- if (untitledSection.description) {
189
- untitledSection.text.trim();
190
- untitledSection.description = untitledSection.text.slice(0, 200).trim();
191
- untitledSection.references = getReferences(untitledSection.text);
192
- sections.push(untitledSection);
193
- }
194
- // Gets all the titled content
195
- const articleChildren = $$(article, `.main-article__texte > ${sectionTag}`);
196
- articleChildren.forEach(function (el) {
197
- if (el.tagName.toLowerCase() === sectionTag) {
198
- let nextEl = el.nextElementSibling;
199
- let html = "";
200
- while (nextEl && nextEl.tagName.toLowerCase() !== sectionTag) {
201
- html += nextEl.outerHTML;
202
- nextEl = nextEl.nextElementSibling;
203
- }
204
- const section = dom.window.document.createElement("div");
205
- section.innerHTML = html;
206
- const sectionText = section.textContent.replace(/\s+/g, " ").trim();
207
- sections.push({
208
- anchor: el.getAttribute("id") || (0, cdtn_slugify_1.default)(el.textContent),
209
- description: sectionText.slice(0, 200).trim(),
210
- html: html
211
- .replace(/\n+/g, "")
212
- .replace(/>\s+</g, "><")
213
- .replace(/\s+/g, " "),
214
- references: getReferences(sectionText),
215
- text: sectionText,
216
- title: el.textContent.trim(),
217
- });
218
- }
219
- });
299
+ let sections = parseHTMLSections(dom);
300
+ let highlight = parseHighlight(dom);
301
+ const duplicatedCount = duplicateContent(sections, highlight);
302
+ if (duplicatedCount >= sections.length) {
303
+ sections = [];
304
+ }
305
+ else if (duplicatedCount > 0) {
306
+ highlight = {
307
+ ...highlight,
308
+ html: removeExtraH2(highlight.html),
309
+ };
310
+ }
311
+ if (highlight) {
312
+ sections.unshift(highlight);
313
+ }
220
314
  if (sections.length === 0) {
221
315
  throw new got_1.ParseError(`No sections`);
222
316
  }
223
317
  return {
224
- date: `${day}/${month}/${year}`,
318
+ date,
225
319
  description,
226
320
  intro,
227
321
  pubId: id,
228
- sections,
322
+ sections: populateSections(sections),
229
323
  title,
230
324
  url,
231
325
  };
@@ -1,14 +1,9 @@
1
1
  export function scrapUrl(id: any, url: any): Promise<{
2
- date: string;
2
+ date: never;
3
3
  description: any;
4
4
  intro: any;
5
5
  pubId: any;
6
- sections: {
7
- anchor: string;
8
- html: string;
9
- text: string;
10
- title: any;
11
- }[];
6
+ sections: any;
12
7
  title: any;
13
8
  url: any;
14
9
  }>;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@socialgouv/fiches-travail-data-types",
3
- "version": "4.699.0",
3
+ "version": "4.701.0",
4
4
  "main": "build/index.js",
5
5
  "module": "build/index.js",
6
6
  "files": [
@@ -29,6 +29,7 @@
29
29
  "@babel/preset-env": "^7.16.4",
30
30
  "@swc/core": "^1.3.21",
31
31
  "@swc/jest": "^0.2.23",
32
+ "@types/jsdom": "^21.1.7",
32
33
  "@typescript-eslint/eslint-plugin": "^5.45.0",
33
34
  "@typescript-eslint/parser": "^5.45.0",
34
35
  "babel-jest": "^27.4.4",