npm - @socialgouv/fiches-travail-data-types - Versions diffs - 4.699.0 → 4.701.0 - Mend

@socialgouv/fiches-travail-data-types 4.699.0 → 4.701.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

package/build/fetch-data/index.d.ts +1 -1
package/build/fetch-data/index.js +3 -11
package/build/fetch-data/parseDom.d.ts +3 -7
package/build/fetch-data/parseDom.js +175 -81
package/build/fetch-data/scrapUrl.d.ts +2 -7
package/package.json +2 -1

package/build/fetch-data/index.d.ts CHANGED Viewed

@@ -1,2 +1,2 @@
-export function fetchFeed(url: any): Promise<any[]>;
+export function fetchFeed(): Promise<any>;
 export function scrap(urls: any): Promise<any[]>;

package/build/fetch-data/index.js CHANGED Viewed

@@ -5,22 +5,14 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.scrap = exports.fetchFeed = void 0;
 const fs_1 = __importDefault(require("fs"));
-const got_1 = __importDefault(require("got"));
 const p_limit_1 = __importDefault(require("p-limit"));
 const path_1 = __importDefault(require("path"));
-const injectToken_1 = require("./injectToken");
 const scrapUrl_1 = require("./scrapUrl");
 const FEED_URL = "https://travail-emploi.gouv.fr/?page=oseo_json";
 const limit = (0, p_limit_1.default)(10);
-async function fetchFeed(url) {
-    const response = await got_1.default.post((0, injectToken_1.injectToken)(url), {
-        http2: true,
-        retry: 3,
-    });
-    const { fiches: feed } = JSON.parse(response.body);
-    const localJson = fs_1.default.readFileSync(path_1.default.join(__dirname, "../../local.data.json"), "utf8");
-    const { fiches: localFeed } = JSON.parse(localJson);
-    return [...feed, ...localFeed];
+async function fetchFeed() {
+    const localJsonData = fs_1.default.readFileSync(path_1.default.join(__dirname, "../../local.data.json"), "utf8");
+    return JSON.parse(localJsonData).fiches;
 }
 exports.fetchFeed = fetchFeed;
 async function scrap(urls) {

package/build/fetch-data/parseDom.d.ts CHANGED Viewed

@@ -1,14 +1,10 @@
 export function parseDom(dom: any, id: any, url: any): {
-    date: string;
+    date: never;
     description: any;
     intro: any;
     pubId: any;
-    sections: {
-        anchor: string;
-        html: string;
-        text: string;
-        title: any;
-    }[];
+    sections: any;
     title: any;
     url: any;
 };
+export function textClean(text: any, noNbsp?: boolean): any;

package/build/fetch-data/parseDom.js CHANGED Viewed

@@ -3,12 +3,13 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
     return (mod && mod.__esModule) ? mod : { "default": mod };
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.parseDom = void 0;
+exports.parseDom = exports.textClean = void 0;
 const cdtn_slugify_1 = __importDefault(require("@socialgouv/cdtn-slugify"));
 const got_1 = require("got");
 const email_1 = require("../email");
 const referenceExtractor_1 = require("./referenceExtractor");
 const referenceResolver_1 = require("./referenceResolver");
+const jsdom_1 = require("jsdom");
 const $$ = (node, selector) => Array.from(node.querySelectorAll(selector));
 const $ = (node, selector) => node.querySelector(selector);
 function unwrapEmail(data = "") {
@@ -110,22 +111,166 @@ const flattenCsBlocs = (node) => {
     node.insertAdjacentHTML("afterend", node.innerHTML);
     node.parentNode.removeChild(node);
 };
-const getSectionTag = (article) => {
-    const h3 = $$(article, ".main-article__texte > h3").length && "h3";
-    const h4 = $$(article, ".main-article__texte > h4").length && "h4";
-    const h5 = $$(article, ".main-article__texte > h5").length && "h5";
-    return h3 || h4 || h5 || "sectionTag";
-};
 const getReferences = (text) => {
     // first we extract the tokens referencing articles
     const references = (0, referenceExtractor_1.extractReferences)(text);
     // then we try to resolve the actual articles ids using legi-data
     return (0, referenceResolver_1.resolveReferences)(references);
 };
+const textClean = (text, noNbsp = false) => {
+    const regexStr = "\\n";
+    return text
+        .replace(new RegExp(noNbsp ? `(${regexStr}|&nbsp;)` : `(${regexStr})`, "g"), " ")
+        .replace(/([.!?]+)(?![^<]*>)/g, "$1 ")
+        .replace(/[ ]{2,}/g, " ")
+        .trim();
+};
+exports.textClean = textClean;
+const duplicateContent = (sections, highlight) => {
+    if (highlight) {
+        return sections.filter((section) => highlight.text
+            .replace(/\s+/g, "")
+            .toLowerCase()
+            .includes(section.text.replace(/\s+/g, "").toLowerCase())).length;
+    }
+    return 0;
+};
+function parseHTMLSections(dom) {
+    const document = dom.window.document;
+    const mainContent = $(document, ".main-content");
+    if (!mainContent) {
+        throw new Error('No <div class="main-content"> found in the HTML content.');
+    }
+    const sections = [];
+    const h2Tags = $$(mainContent, "h2");
+    h2Tags.forEach((h2Tag) => {
+        const section = {
+            title: (0, exports.textClean)(h2Tag.textContent, true) || "",
+            html: "",
+            text: "",
+        };
+        let nextSibling = h2Tag.nextElementSibling;
+        if (!nextSibling) {
+            nextSibling = h2Tag.parentElement
+                ? h2Tag.parentElement.nextElementSibling
+                : undefined;
+            if (!nextSibling && h2Tag.parentElement) {
+                nextSibling = h2Tag.parentElement.parentElement
+                    ? h2Tag.parentElement.parentElement.nextElementSibling
+                    : undefined;
+            }
+        }
+        const sectionHtmlContent = [];
+        const sectionTextContent = [];
+        while (nextSibling && nextSibling.nodeName !== "H2") {
+            sectionHtmlContent.push((0, exports.textClean)(nextSibling.outerHTML || "", true));
+            sectionTextContent.push((0, exports.textClean)(nextSibling.textContent || "", true));
+            nextSibling = nextSibling.nextElementSibling;
+        }
+        section.html = sectionHtmlContent.join("").trim();
+        section.text = sectionTextContent.join("").trim();
+        sections.push(section);
+    });
+    const cleanSections = sections.map((section) => ({
+        ...section,
+        // Sometimes a section's html still contains a nested h2 and the content after it.
+        // We check a second time and remove the first h2 found, plus everything following it
+        // (an h2 should not be inside a section)
+        html: removeExtraH2(section.html),
+    }));
+    if (cleanSections.find((section) => section.html === "")) {
+        return [
+            {
+                title: "Contenu",
+                html: mainContent.innerHTML,
+                text: mainContent.textContent,
+            },
+        ];
+    }
+    if (cleanSections) {
+        return cleanSections;
+    }
+}
+const removeExtraH2 = (html) => {
+    const dom = new jsdom_1.JSDOM(`<div>${html}</div>`);
+    const document = dom.window.document;
+    const mainDiv = $(document, "div");
+    const firstH2 = $(mainDiv, "h2");
+    if (firstH2) {
+        let parent = firstH2.parentElement;
+        let h2 = firstH2;
+        while (parent.nextElementSibling) {
+            parent.nextElementSibling.remove();
+        }
+        while (firstH2.nextElementSibling) {
+            firstH2.nextElementSibling.remove();
+        }
+        h2.remove();
+    }
+    return (0, exports.textClean)(mainDiv.innerHTML, true);
+};
+const parseHighlight = (dom) => {
+    const document = dom.window.document;
+    const mainContent = $(document, ".main-content");
+    if (!mainContent) {
+        throw new Error('No <div class="main-content"> found in the HTML content.');
+    }
+    const highlightHtmlContent = [];
+    const highlightTextContent = [];
+    let nextSibling = mainContent.firstElementChild;
+    while (nextSibling && nextSibling.nodeName !== "H2") {
+        highlightHtmlContent.push((0, exports.textClean)(nextSibling.outerHTML || "", true));
+        highlightTextContent.push((0, exports.textClean)(nextSibling.textContent || "", true));
+        nextSibling = nextSibling.nextSibling;
+    }
+    if (highlightHtmlContent.length > 0) {
+        return {
+            title: "",
+            html: (0, exports.textClean)(highlightHtmlContent.join("").trim(), true),
+            text: highlightTextContent.join("").trim(),
+        };
+    }
+    return undefined;
+};
+const getDate = (article) => {
+    const firstParagraph = $(article, "p");
+    let publicationAt = null;
+    let updatedAt = null;
+    if (!firstParagraph) {
+        throw new Error("Can't find the updated date, first paragraph missing");
+    }
+    const spans = $$(firstParagraph, "span");
+    spans.forEach((span) => {
+        const textContent = span.textContent;
+        if (textContent.includes("Publié le")) {
+            publicationAt = textContent.match(/\d{1,2}\/\d{1,2}\/\d{4}/);
+        }
+        if (textContent.includes("Mis à jour le")) {
+            updatedAt = textContent.match(/\d{1,2}\/\d{1,2}\/\d{4}/);
+        }
+    });
+    if (updatedAt) {
+        return updatedAt[0];
+    }
+    if (publicationAt) {
+        return publicationAt[0];
+    }
+    throw new Error("Can't find the updated date in the first paragraph");
+};
+const populateSections = (sections) => {
+    return sections.map((section) => ({
+        anchor: (0, cdtn_slugify_1.default)(section.title),
+        description: section.text.slice(0, 200),
+        html: section.html,
+        references: getReferences(section.text),
+        text: section.text,
+        title: section.title,
+    }));
+};
 function parseDom(dom, id, url) {
-    const article = $(dom.window.document, "main");
+    const article = $(dom.window.document, "article");
     if (!article) {
-        throw new got_1.ParseError("no <main>");
+        throw new got_1.ParseError("no <article>");
     }
     if (!id) {
         throw new got_1.ParseError(`No id`);
@@ -135,8 +280,6 @@ function parseDom(dom, id, url) {
     $$(article, "[data-cfemail]").forEach(formatEmail);
     $$(article, ".cs_blocs").forEach(flattenCsBlocs);
     $$(article, "img").forEach(formatImage);
-    $$(article, "style").forEach(removeNode);
-    $$(article, "button").forEach(removeNode);
     $$(article, ".oembed-source").forEach(removeNode);
     let titleElement = $(article, "h1");
     if (!titleElement) {
@@ -145,87 +288,38 @@ function parseDom(dom, id, url) {
             throw new got_1.ParseError("No <h1> or <h2> element");
         }
     }
-    const title = titleElement.textContent.trim();
-    const dateRaw = $(dom.window.document, "meta[property*=modified_time]") ||
-        $(dom.window.document, "meta[property$=published_time]");
-    const [year, month, day] = dateRaw.getAttribute("content").split("-");
-    let intro = $(article, ".main-article__chapo") || "";
+    const title = (0, exports.textClean)(titleElement.textContent, true);
+    const date = getDate(article);
+    let intro = $(article, ".fr-text--lead") || "";
     intro =
         intro &&
-            intro.innerHTML
-                .replace(/\n/g, "")
-                .replace(/\s+/g, " ")
-                .trim()
-                .replace(/<script[^>]*>([\s\S]*?)<\/script>/g, "");
+            (0, exports.textClean)(intro.innerHTML, true).replace(/<script[^>]*>([\s\S]*?)<\/script>/g, "");
     const description = $(dom.window.document, "meta[name=description]")?.getAttribute("content") ??
         "";
-    const sections = [];
-    const sectionTag = getSectionTag(article);
-    // First pass is only to get a potential untitled section at the top of the article
-    // This section has neither anchor nor title
-    let nextArticleElement = $(article, ".main-article__texte > *");
-    const untitledSection = {
-        anchor: "",
-        html: "",
-        text: "",
-        title: title,
-    };
-    while (nextArticleElement &&
-        nextArticleElement.tagName.toLowerCase() !== sectionTag) {
-        if (nextArticleElement.textContent) {
-            if (!untitledSection.description) {
-                untitledSection.description = "temp description";
-            }
-            untitledSection.html += nextArticleElement.outerHTML
-                .replace(/\n+/g, "")
-                .replace(/>\s+</g, "><")
-                .replace(/\s+/g, " ");
-            untitledSection.text +=
-                " " + nextArticleElement.textContent.replace(/\s+/g, " ").trim();
-        }
-        nextArticleElement = nextArticleElement.nextElementSibling;
-    }
-    if (untitledSection.description) {
-        untitledSection.text.trim();
-        untitledSection.description = untitledSection.text.slice(0, 200).trim();
-        untitledSection.references = getReferences(untitledSection.text);
-        sections.push(untitledSection);
-    }
-    // Gets all the titled content
-    const articleChildren = $$(article, `.main-article__texte > ${sectionTag}`);
-    articleChildren.forEach(function (el) {
-        if (el.tagName.toLowerCase() === sectionTag) {
-            let nextEl = el.nextElementSibling;
-            let html = "";
-            while (nextEl && nextEl.tagName.toLowerCase() !== sectionTag) {
-                html += nextEl.outerHTML;
-                nextEl = nextEl.nextElementSibling;
-            }
-            const section = dom.window.document.createElement("div");
-            section.innerHTML = html;
-            const sectionText = section.textContent.replace(/\s+/g, " ").trim();
-            sections.push({
-                anchor: el.getAttribute("id") || (0, cdtn_slugify_1.default)(el.textContent),
-                description: sectionText.slice(0, 200).trim(),
-                html: html
-                    .replace(/\n+/g, "")
-                    .replace(/>\s+</g, "><")
-                    .replace(/\s+/g, " "),
-                references: getReferences(sectionText),
-                text: sectionText,
-                title: el.textContent.trim(),
-            });
-        }
-    });
+    let sections = parseHTMLSections(dom);
+    let highlight = parseHighlight(dom);
+    const duplicatedCount = duplicateContent(sections, highlight);
+    if (duplicatedCount >= sections.length) {
+        sections = [];
+    }
+    else if (duplicatedCount > 0) {
+        highlight = {
+            ...highlight,
+            html: removeExtraH2(highlight.html),
+        };
+    }
+    if (highlight) {
+        sections.unshift(highlight);
+    }
     if (sections.length === 0) {
         throw new got_1.ParseError(`No sections`);
     }
     return {
-        date: `${day}/${month}/${year}`,
+        date,
         description,
         intro,
         pubId: id,
-        sections,
+        sections: populateSections(sections),
         title,
         url,
     };

package/build/fetch-data/scrapUrl.d.ts CHANGED Viewed

@@ -1,14 +1,9 @@
 export function scrapUrl(id: any, url: any): Promise<{
-    date: string;
+    date: never;
     description: any;
     intro: any;
     pubId: any;
-    sections: {
-        anchor: string;
-        html: string;
-        text: string;
-        title: any;
-    }[];
+    sections: any;
     title: any;
     url: any;
 }>;

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@socialgouv/fiches-travail-data-types",
-  "version": "4.699.0",
+  "version": "4.701.0",
   "main": "build/index.js",
   "module": "build/index.js",
   "files": [
@@ -29,6 +29,7 @@
     "@babel/preset-env": "^7.16.4",
     "@swc/core": "^1.3.21",
     "@swc/jest": "^0.2.23",
+    "@types/jsdom": "^21.1.7",
     "@typescript-eslint/eslint-plugin": "^5.45.0",
     "@typescript-eslint/parser": "^5.45.0",
     "babel-jest": "^27.4.4",