@socialgouv/fiches-travail-data-types 4.700.0 → 4.701.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
export function fetchFeed(
|
|
1
|
+
export function fetchFeed(): Promise<any>;
|
|
2
2
|
export function scrap(urls: any): Promise<any[]>;
|
|
@@ -5,22 +5,14 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
|
5
5
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
6
|
exports.scrap = exports.fetchFeed = void 0;
|
|
7
7
|
const fs_1 = __importDefault(require("fs"));
|
|
8
|
-
const got_1 = __importDefault(require("got"));
|
|
9
8
|
const p_limit_1 = __importDefault(require("p-limit"));
|
|
10
9
|
const path_1 = __importDefault(require("path"));
|
|
11
|
-
const injectToken_1 = require("./injectToken");
|
|
12
10
|
const scrapUrl_1 = require("./scrapUrl");
|
|
13
11
|
const FEED_URL = "https://travail-emploi.gouv.fr/?page=oseo_json";
|
|
14
12
|
const limit = (0, p_limit_1.default)(10);
|
|
15
|
-
async function fetchFeed(
|
|
16
|
-
const
|
|
17
|
-
|
|
18
|
-
retry: 3,
|
|
19
|
-
});
|
|
20
|
-
const { fiches: feed } = JSON.parse(response.body);
|
|
21
|
-
const localJson = fs_1.default.readFileSync(path_1.default.join(__dirname, "../../local.data.json"), "utf8");
|
|
22
|
-
const { fiches: localFeed } = JSON.parse(localJson);
|
|
23
|
-
return [...feed, ...localFeed];
|
|
13
|
+
async function fetchFeed() {
|
|
14
|
+
const localJsonData = fs_1.default.readFileSync(path_1.default.join(__dirname, "../../local.data.json"), "utf8");
|
|
15
|
+
return JSON.parse(localJsonData).fiches;
|
|
24
16
|
}
|
|
25
17
|
exports.fetchFeed = fetchFeed;
|
|
26
18
|
async function scrap(urls) {
|
|
@@ -1,14 +1,10 @@
|
|
|
1
1
|
export function parseDom(dom: any, id: any, url: any): {
|
|
2
|
-
date:
|
|
2
|
+
date: never;
|
|
3
3
|
description: any;
|
|
4
4
|
intro: any;
|
|
5
5
|
pubId: any;
|
|
6
|
-
sections:
|
|
7
|
-
anchor: string;
|
|
8
|
-
html: string;
|
|
9
|
-
text: string;
|
|
10
|
-
title: any;
|
|
11
|
-
}[];
|
|
6
|
+
sections: any;
|
|
12
7
|
title: any;
|
|
13
8
|
url: any;
|
|
14
9
|
};
|
|
10
|
+
export function textClean(text: any, noNbsp?: boolean): any;
|
|
@@ -3,12 +3,13 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
|
3
3
|
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
4
|
};
|
|
5
5
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
-
exports.parseDom = void 0;
|
|
6
|
+
exports.parseDom = exports.textClean = void 0;
|
|
7
7
|
const cdtn_slugify_1 = __importDefault(require("@socialgouv/cdtn-slugify"));
|
|
8
8
|
const got_1 = require("got");
|
|
9
9
|
const email_1 = require("../email");
|
|
10
10
|
const referenceExtractor_1 = require("./referenceExtractor");
|
|
11
11
|
const referenceResolver_1 = require("./referenceResolver");
|
|
12
|
+
const jsdom_1 = require("jsdom");
|
|
12
13
|
const $$ = (node, selector) => Array.from(node.querySelectorAll(selector));
|
|
13
14
|
const $ = (node, selector) => node.querySelector(selector);
|
|
14
15
|
function unwrapEmail(data = "") {
|
|
@@ -110,22 +111,166 @@ const flattenCsBlocs = (node) => {
|
|
|
110
111
|
node.insertAdjacentHTML("afterend", node.innerHTML);
|
|
111
112
|
node.parentNode.removeChild(node);
|
|
112
113
|
};
|
|
113
|
-
const getSectionTag = (article) => {
|
|
114
|
-
const h3 = $$(article, ".main-article__texte > h3").length && "h3";
|
|
115
|
-
const h4 = $$(article, ".main-article__texte > h4").length && "h4";
|
|
116
|
-
const h5 = $$(article, ".main-article__texte > h5").length && "h5";
|
|
117
|
-
return h3 || h4 || h5 || "sectionTag";
|
|
118
|
-
};
|
|
119
114
|
const getReferences = (text) => {
|
|
120
115
|
// first we extract the tokens referencing articles
|
|
121
116
|
const references = (0, referenceExtractor_1.extractReferences)(text);
|
|
122
117
|
// then we try to resolve the actual articles ids using legi-data
|
|
123
118
|
return (0, referenceResolver_1.resolveReferences)(references);
|
|
124
119
|
};
|
|
120
|
+
const textClean = (text, noNbsp = false) => {
|
|
121
|
+
const regexStr = "\\n";
|
|
122
|
+
return text
|
|
123
|
+
.replace(new RegExp(noNbsp ? `(${regexStr}| )` : `(${regexStr})`, "g"), " ")
|
|
124
|
+
.replace(/([.!?]+)(?![^<]*>)/g, "$1 ")
|
|
125
|
+
.replace(/[ ]{2,}/g, " ")
|
|
126
|
+
.trim();
|
|
127
|
+
};
|
|
128
|
+
exports.textClean = textClean;
|
|
129
|
+
const duplicateContent = (sections, highlight) => {
|
|
130
|
+
if (highlight) {
|
|
131
|
+
return sections.filter((section) => highlight.text
|
|
132
|
+
.replace(/\s+/g, "")
|
|
133
|
+
.toLowerCase()
|
|
134
|
+
.includes(section.text.replace(/\s+/g, "").toLowerCase())).length;
|
|
135
|
+
}
|
|
136
|
+
return 0;
|
|
137
|
+
};
|
|
138
|
+
function parseHTMLSections(dom) {
|
|
139
|
+
const document = dom.window.document;
|
|
140
|
+
const mainContent = $(document, ".main-content");
|
|
141
|
+
if (!mainContent) {
|
|
142
|
+
throw new Error('No <div class="main-content"> found in the HTML content.');
|
|
143
|
+
}
|
|
144
|
+
const sections = [];
|
|
145
|
+
const h2Tags = $$(mainContent, "h2");
|
|
146
|
+
h2Tags.forEach((h2Tag) => {
|
|
147
|
+
const section = {
|
|
148
|
+
title: (0, exports.textClean)(h2Tag.textContent, true) || "",
|
|
149
|
+
html: "",
|
|
150
|
+
text: "",
|
|
151
|
+
};
|
|
152
|
+
let nextSibling = h2Tag.nextElementSibling;
|
|
153
|
+
if (!nextSibling) {
|
|
154
|
+
nextSibling = h2Tag.parentElement
|
|
155
|
+
? h2Tag.parentElement.nextElementSibling
|
|
156
|
+
: undefined;
|
|
157
|
+
if (!nextSibling && h2Tag.parentElement) {
|
|
158
|
+
nextSibling = h2Tag.parentElement.parentElement
|
|
159
|
+
? h2Tag.parentElement.parentElement.nextElementSibling
|
|
160
|
+
: undefined;
|
|
161
|
+
}
|
|
162
|
+
}
|
|
163
|
+
const sectionHtmlContent = [];
|
|
164
|
+
const sectionTextContent = [];
|
|
165
|
+
while (nextSibling && nextSibling.nodeName !== "H2") {
|
|
166
|
+
sectionHtmlContent.push((0, exports.textClean)(nextSibling.outerHTML || "", true));
|
|
167
|
+
sectionTextContent.push((0, exports.textClean)(nextSibling.textContent || "", true));
|
|
168
|
+
nextSibling = nextSibling.nextElementSibling;
|
|
169
|
+
}
|
|
170
|
+
section.html = sectionHtmlContent.join("").trim();
|
|
171
|
+
section.text = sectionTextContent.join("").trim();
|
|
172
|
+
sections.push(section);
|
|
173
|
+
});
|
|
174
|
+
const cleanSections = sections.map((section) => ({
|
|
175
|
+
...section,
|
|
176
|
+
// Sometimes, we have all the html in a section
|
|
177
|
+
// We check a second time and delete the HTML from the h2 found
|
|
178
|
+
// (H2 should not be in a section)
|
|
179
|
+
html: removeExtraH2(section.html),
|
|
180
|
+
}));
|
|
181
|
+
if (cleanSections.find((section) => section.html === "")) {
|
|
182
|
+
return [
|
|
183
|
+
{
|
|
184
|
+
title: "Contenu",
|
|
185
|
+
html: mainContent.innerHTML,
|
|
186
|
+
text: mainContent.textContent,
|
|
187
|
+
},
|
|
188
|
+
];
|
|
189
|
+
}
|
|
190
|
+
if (cleanSections) {
|
|
191
|
+
return cleanSections;
|
|
192
|
+
}
|
|
193
|
+
}
|
|
194
|
+
const removeExtraH2 = (html) => {
|
|
195
|
+
const dom = new jsdom_1.JSDOM(`<div>${html}</div>`);
|
|
196
|
+
const document = dom.window.document;
|
|
197
|
+
const mainDiv = $(document, "div");
|
|
198
|
+
const firstH2 = $(mainDiv, "h2");
|
|
199
|
+
if (firstH2) {
|
|
200
|
+
let parent = firstH2.parentElement;
|
|
201
|
+
let h2 = firstH2;
|
|
202
|
+
while (parent.nextElementSibling) {
|
|
203
|
+
parent.nextElementSibling.remove();
|
|
204
|
+
}
|
|
205
|
+
while (firstH2.nextElementSibling) {
|
|
206
|
+
firstH2.nextElementSibling.remove();
|
|
207
|
+
}
|
|
208
|
+
h2.remove();
|
|
209
|
+
}
|
|
210
|
+
return (0, exports.textClean)(mainDiv.innerHTML, true);
|
|
211
|
+
};
|
|
212
|
+
const parseHighlight = (dom) => {
|
|
213
|
+
const document = dom.window.document;
|
|
214
|
+
const mainContent = $(document, ".main-content");
|
|
215
|
+
if (!mainContent) {
|
|
216
|
+
throw new Error('No <div class="main-content"> found in the HTML content.');
|
|
217
|
+
}
|
|
218
|
+
const highlightHtmlContent = [];
|
|
219
|
+
const highlightTextContent = [];
|
|
220
|
+
let nextSibling = mainContent.firstElementChild;
|
|
221
|
+
while (nextSibling && nextSibling.nodeName !== "H2") {
|
|
222
|
+
highlightHtmlContent.push((0, exports.textClean)(nextSibling.outerHTML || "", true));
|
|
223
|
+
highlightTextContent.push((0, exports.textClean)(nextSibling.textContent || "", true));
|
|
224
|
+
nextSibling = nextSibling.nextSibling;
|
|
225
|
+
}
|
|
226
|
+
if (highlightHtmlContent.length > 0) {
|
|
227
|
+
return {
|
|
228
|
+
title: "",
|
|
229
|
+
html: (0, exports.textClean)(highlightHtmlContent.join("").trim(), true),
|
|
230
|
+
text: highlightTextContent.join("").trim(),
|
|
231
|
+
};
|
|
232
|
+
}
|
|
233
|
+
return undefined;
|
|
234
|
+
};
|
|
235
|
+
const getDate = (article) => {
|
|
236
|
+
const firstParagraph = $(article, "p");
|
|
237
|
+
let publicationAt = null;
|
|
238
|
+
let updatedAt = null;
|
|
239
|
+
if (!firstParagraph) {
|
|
240
|
+
throw new Error("Can't find the updated date, first paragraph missing");
|
|
241
|
+
}
|
|
242
|
+
const spans = $$(firstParagraph, "span");
|
|
243
|
+
spans.forEach((span) => {
|
|
244
|
+
const textContent = span.textContent;
|
|
245
|
+
if (textContent.includes("Publié le")) {
|
|
246
|
+
publicationAt = textContent.match(/\d{1,2}\/\d{1,2}\/\d{4}/);
|
|
247
|
+
}
|
|
248
|
+
if (textContent.includes("Mis à jour le")) {
|
|
249
|
+
updatedAt = textContent.match(/\d{1,2}\/\d{1,2}\/\d{4}/);
|
|
250
|
+
}
|
|
251
|
+
});
|
|
252
|
+
if (updatedAt) {
|
|
253
|
+
return updatedAt[0];
|
|
254
|
+
}
|
|
255
|
+
if (publicationAt) {
|
|
256
|
+
return publicationAt[0];
|
|
257
|
+
}
|
|
258
|
+
throw new Error("Can't find the updated date in the first paragraph");
|
|
259
|
+
};
|
|
260
|
+
const populateSections = (sections) => {
|
|
261
|
+
return sections.map((section) => ({
|
|
262
|
+
anchor: (0, cdtn_slugify_1.default)(section.title),
|
|
263
|
+
description: section.text.slice(0, 200),
|
|
264
|
+
html: section.html,
|
|
265
|
+
references: getReferences(section.text),
|
|
266
|
+
text: section.text,
|
|
267
|
+
title: section.title,
|
|
268
|
+
}));
|
|
269
|
+
};
|
|
125
270
|
function parseDom(dom, id, url) {
|
|
126
|
-
const article = $(dom.window.document, "
|
|
271
|
+
const article = $(dom.window.document, "article");
|
|
127
272
|
if (!article) {
|
|
128
|
-
throw new got_1.ParseError("no <
|
|
273
|
+
throw new got_1.ParseError("no <article>");
|
|
129
274
|
}
|
|
130
275
|
if (!id) {
|
|
131
276
|
throw new got_1.ParseError(`No id`);
|
|
@@ -135,8 +280,6 @@ function parseDom(dom, id, url) {
|
|
|
135
280
|
$$(article, "[data-cfemail]").forEach(formatEmail);
|
|
136
281
|
$$(article, ".cs_blocs").forEach(flattenCsBlocs);
|
|
137
282
|
$$(article, "img").forEach(formatImage);
|
|
138
|
-
$$(article, "style").forEach(removeNode);
|
|
139
|
-
$$(article, "button").forEach(removeNode);
|
|
140
283
|
$$(article, ".oembed-source").forEach(removeNode);
|
|
141
284
|
let titleElement = $(article, "h1");
|
|
142
285
|
if (!titleElement) {
|
|
@@ -145,87 +288,38 @@ function parseDom(dom, id, url) {
|
|
|
145
288
|
throw new got_1.ParseError("No <h1> or <h2> element");
|
|
146
289
|
}
|
|
147
290
|
}
|
|
148
|
-
const title = titleElement.textContent
|
|
149
|
-
const
|
|
150
|
-
|
|
151
|
-
const [year, month, day] = dateRaw.getAttribute("content").split("-");
|
|
152
|
-
let intro = $(article, ".main-article__chapo") || "";
|
|
291
|
+
const title = (0, exports.textClean)(titleElement.textContent, true);
|
|
292
|
+
const date = getDate(article);
|
|
293
|
+
let intro = $(article, ".fr-text--lead") || "";
|
|
153
294
|
intro =
|
|
154
295
|
intro &&
|
|
155
|
-
intro.innerHTML
|
|
156
|
-
.replace(/\n/g, "")
|
|
157
|
-
.replace(/\s+/g, " ")
|
|
158
|
-
.trim()
|
|
159
|
-
.replace(/<script[^>]*>([\s\S]*?)<\/script>/g, "");
|
|
296
|
+
(0, exports.textClean)(intro.innerHTML, true).replace(/<script[^>]*>([\s\S]*?)<\/script>/g, "");
|
|
160
297
|
const description = $(dom.window.document, "meta[name=description]")?.getAttribute("content") ??
|
|
161
298
|
"";
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
untitledSection.description = "temp description";
|
|
178
|
-
}
|
|
179
|
-
untitledSection.html += nextArticleElement.outerHTML
|
|
180
|
-
.replace(/\n+/g, "")
|
|
181
|
-
.replace(/>\s+</g, "><")
|
|
182
|
-
.replace(/\s+/g, " ");
|
|
183
|
-
untitledSection.text +=
|
|
184
|
-
" " + nextArticleElement.textContent.replace(/\s+/g, " ").trim();
|
|
185
|
-
}
|
|
186
|
-
nextArticleElement = nextArticleElement.nextElementSibling;
|
|
187
|
-
}
|
|
188
|
-
if (untitledSection.description) {
|
|
189
|
-
untitledSection.text.trim();
|
|
190
|
-
untitledSection.description = untitledSection.text.slice(0, 200).trim();
|
|
191
|
-
untitledSection.references = getReferences(untitledSection.text);
|
|
192
|
-
sections.push(untitledSection);
|
|
193
|
-
}
|
|
194
|
-
// Gets all the titled content
|
|
195
|
-
const articleChildren = $$(article, `.main-article__texte > ${sectionTag}`);
|
|
196
|
-
articleChildren.forEach(function (el) {
|
|
197
|
-
if (el.tagName.toLowerCase() === sectionTag) {
|
|
198
|
-
let nextEl = el.nextElementSibling;
|
|
199
|
-
let html = "";
|
|
200
|
-
while (nextEl && nextEl.tagName.toLowerCase() !== sectionTag) {
|
|
201
|
-
html += nextEl.outerHTML;
|
|
202
|
-
nextEl = nextEl.nextElementSibling;
|
|
203
|
-
}
|
|
204
|
-
const section = dom.window.document.createElement("div");
|
|
205
|
-
section.innerHTML = html;
|
|
206
|
-
const sectionText = section.textContent.replace(/\s+/g, " ").trim();
|
|
207
|
-
sections.push({
|
|
208
|
-
anchor: el.getAttribute("id") || (0, cdtn_slugify_1.default)(el.textContent),
|
|
209
|
-
description: sectionText.slice(0, 200).trim(),
|
|
210
|
-
html: html
|
|
211
|
-
.replace(/\n+/g, "")
|
|
212
|
-
.replace(/>\s+</g, "><")
|
|
213
|
-
.replace(/\s+/g, " "),
|
|
214
|
-
references: getReferences(sectionText),
|
|
215
|
-
text: sectionText,
|
|
216
|
-
title: el.textContent.trim(),
|
|
217
|
-
});
|
|
218
|
-
}
|
|
219
|
-
});
|
|
299
|
+
let sections = parseHTMLSections(dom);
|
|
300
|
+
let highlight = parseHighlight(dom);
|
|
301
|
+
const duplicatedCount = duplicateContent(sections, highlight);
|
|
302
|
+
if (duplicatedCount >= sections.length) {
|
|
303
|
+
sections = [];
|
|
304
|
+
}
|
|
305
|
+
else if (duplicatedCount > 0) {
|
|
306
|
+
highlight = {
|
|
307
|
+
...highlight,
|
|
308
|
+
html: removeExtraH2(highlight.html),
|
|
309
|
+
};
|
|
310
|
+
}
|
|
311
|
+
if (highlight) {
|
|
312
|
+
sections.unshift(highlight);
|
|
313
|
+
}
|
|
220
314
|
if (sections.length === 0) {
|
|
221
315
|
throw new got_1.ParseError(`No sections`);
|
|
222
316
|
}
|
|
223
317
|
return {
|
|
224
|
-
date
|
|
318
|
+
date,
|
|
225
319
|
description,
|
|
226
320
|
intro,
|
|
227
321
|
pubId: id,
|
|
228
|
-
sections,
|
|
322
|
+
sections: populateSections(sections),
|
|
229
323
|
title,
|
|
230
324
|
url,
|
|
231
325
|
};
|
|
@@ -1,14 +1,9 @@
|
|
|
1
1
|
export function scrapUrl(id: any, url: any): Promise<{
|
|
2
|
-
date:
|
|
2
|
+
date: never;
|
|
3
3
|
description: any;
|
|
4
4
|
intro: any;
|
|
5
5
|
pubId: any;
|
|
6
|
-
sections:
|
|
7
|
-
anchor: string;
|
|
8
|
-
html: string;
|
|
9
|
-
text: string;
|
|
10
|
-
title: any;
|
|
11
|
-
}[];
|
|
6
|
+
sections: any;
|
|
12
7
|
title: any;
|
|
13
8
|
url: any;
|
|
14
9
|
}>;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@socialgouv/fiches-travail-data-types",
|
|
3
|
-
"version": "4.700.0",
|
|
3
|
+
"version": "4.701.0",
|
|
4
4
|
"main": "build/index.js",
|
|
5
5
|
"module": "build/index.js",
|
|
6
6
|
"files": [
|
|
@@ -29,6 +29,7 @@
|
|
|
29
29
|
"@babel/preset-env": "^7.16.4",
|
|
30
30
|
"@swc/core": "^1.3.21",
|
|
31
31
|
"@swc/jest": "^0.2.23",
|
|
32
|
+
"@types/jsdom": "^21.1.7",
|
|
32
33
|
"@typescript-eslint/eslint-plugin": "^5.45.0",
|
|
33
34
|
"@typescript-eslint/parser": "^5.45.0",
|
|
34
35
|
"babel-jest": "^27.4.4",
|