npm - @tricoteuses/senat - Versions diffs - 2.10.0 → 2.10.2 - Mend

@tricoteuses/senat 2.10.0 → 2.10.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

package/LICENSE.md +22 -22
package/README.md +116 -116
package/lib/loaders.d.ts +6 -1
package/lib/loaders.js +54 -0
package/lib/model/agenda.js +0 -2
package/lib/model/compte_rendu.d.ts +9 -2
package/lib/model/compte_rendu.js +223 -211
package/lib/model/util.d.ts +1 -0
package/lib/model/util.js +3 -0
package/lib/scripts/retrieve_agenda.js +25 -6
package/lib/scripts/retrieve_comptes_rendus.d.ts +6 -1
package/lib/scripts/retrieve_comptes_rendus.js +230 -77
package/lib/scripts/retrieve_comptes_rendus_seance.d.ts +6 -0
package/lib/scripts/retrieve_comptes_rendus_seance.js +273 -0
package/lib/scripts/retrieve_videos.js +1 -9
package/lib/types/agenda.d.ts +19 -2
package/lib/types/compte_rendu.d.ts +1 -1
package/lib/utils/cr_spliting.d.ts +7 -0
package/lib/utils/cr_spliting.js +125 -0
package/lib/utils/reunion_grouping.d.ts +6 -0
package/lib/utils/reunion_grouping.js +359 -0
package/lib/validators/senat.d.ts +0 -0
package/lib/validators/senat.js +24 -0
package/package.json +98 -98
package/lib/raw_types/kysely-table-types.d.ts +0 -5
package/lib/raw_types/kysely-table-types.js +0 -1

package/lib/model/compte_rendu.js CHANGED Viewed

@@ -1,69 +1,153 @@
-import { JSDOM } from "jsdom";
+import fs from "fs";
 import * as cheerio from "cheerio";
-const norm = (s) => s.replace(/\u00A0/g, " ").replace(/\s+/g, " ").trim();
-const toTexte = (s) => ({ _: s });
-function extractSommaire($) {
-    const root = $("#wysiwyg").length ? $("#wysiwyg") : $("#cri");
-    const sommaire = {
-        presidentSeance: toTexte(""),
-        sommaire1: [],
-    };
-    // (1) presidency line (e.g., "Présidence de Mme …")
-    const pres = root.find("p.tm2").filter((_, el) => /présidence/i.test($(el).text())).first();
-    if (pres.length) {
-        sommaire.presidentSeance = toTexte(norm(pres.text()));
+import path from "path";
+import { computeIntervalsBySlot } from "../utils/cr_spliting";
+import { norm } from "./util";
+const asArray = (x) => x == null ? [] : Array.isArray(x) ? x : [x];
+const toInt = (s) => Number.isFinite(Number(s)) ? Number(s) : Number.POSITIVE_INFINITY;
+export async function parseCompteRenduSlotFromFile(xmlFilePath, wantedSlot, firstSlotOfDay) {
+    try {
+        const raw = fs.readFileSync(xmlFilePath, "utf8");
+        const $ = cheerio.load(raw, { xml: false });
+        const metadonnees = extractMetadonnees($, xmlFilePath);
+        const order = $("body *").toArray();
+        const idx = new Map(order.map((el, i) => [el, i]));
+        const intervalsAll = computeIntervalsBySlot($, idx, firstSlotOfDay);
+        const intervals = intervalsAll.filter(iv => iv.slot === wantedSlot);
+        if (intervals.length === 0) {
+            console.warn(`[CRI] no intervals for ${path.basename(xmlFilePath)} [${wantedSlot}]`);
+            return null;
+        }
+        metadonnees.sommaire = extractSommaireForIntervals($, idx, intervals);
+        const points = [];
+        let ordre = 0;
+        const addPoint = (p) => points.push({ ...p, ordre_absolu_seance: String(++ordre) });
+        // Titles
+        $("cri\\:titreS1 p.titre_S1").each((_, el) => {
+            if (!elementInAnyInterval(el, idx, intervals))
+                return;
+            const t = normalizeTitle(norm($(el).text() || ""));
+            if (t)
+                addPoint({ code_grammaire: "TITRE_TEXTE_DISCUSSION", texte: { _: t }, code_style: "Titre" });
+        });
+        // Interventions
+        $("div.intervenant").each((_, block) => {
+            if (!elementInAnyInterval(block, idx, intervals))
+                return;
+            const $block = $(block);
+            $block.find([
+                "p[class^='titre_S']",
+                "p.mention_titre",
+                "p.intitule_titre",
+                "p.mention_chapitre",
+                "p.intitule_chapitre",
+                "p.mention_article",
+                "p.intitule_article",
+                "p.mention_section",
+                "p.intitule_section",
+            ].join(",")).remove();
+            const firstP = $block.find("p").first();
+            const speakerLabelRaw = firstP.find(".orateur_nom").text() || firstP.find("a.lien_senfic").text() || "";
+            const speakerLabel = dedupeSpeaker(speakerLabelRaw);
+            const { mat, nom: nomCRI, qua: quaCRI } = readIntervenantMeta($block);
+            const qualFromSpans = extractAndRemoveLeadingQualite($, $block);
+            const qualite = norm(decodeHtmlEntities(quaCRI || "")) || qualFromSpans;
+            const canonicalName = dedupeSpeaker(nomCRI || speakerLabel);
+            const role = roleForSpeaker(speakerLabel) || roleForSpeaker(qualite) || roleForSpeaker(quaCRI || "");
+            const speechHtml = sanitizeInterventionHtml($, $block);
+            if (!norm(cheerio.load(speechHtml).text() || ""))
+                return;
+            addPoint({
+                code_grammaire: "PAROLE_GENERIQUE",
+                roledebat: role,
+                orateurs: { orateur: { nom: canonicalName, id: mat || "", qualite } },
+                texte: { _: speechHtml },
+            });
+        });
+        const contenu = {
+            quantiemes: { journee: metadonnees.dateSeance, session: metadonnees.session },
+            point: points,
+        };
+        return {
+            uid: "CRSSN" + xmlFilePath.replace(/^.*?(\d{8}).*$/i, "$1") + `-${wantedSlot}`,
+            seanceRef: "RUSN" + xmlFilePath.replace(/^.*?(\d{8}).*$/i, "$1") + "IDS-" + wantedSlot,
+            sessionRef: metadonnees.session,
+            metadonnees,
+            contenu,
+        };
     }
-    // (2) extra info lines like "Secrétaires :" (tm5)
-    const paras = [];
-    root.find("p.tm5").each((_, el) => {
-        const t = norm($(el).text());
-        if (t)
-            paras.push(toTexte(t));
-    });
-    if (paras.length) {
-        sommaire.para = paras.length === 1 ? paras[0] : paras;
+    catch (e) {
+        console.error(`[CRI] parseSlot error file=${xmlFilePath} slot=${wantedSlot}:`, e);
+        return null;
     }
-    // (3) first-level items (tm3)
-    const items = [];
-    root.find("p.tm3").each((_, el) => {
-        const $p = $(el);
-        const full = norm($p.text());
-        // try to extract the numeric order at the start: "1. ..." or "2 – ..." etc.
-        const numMatch = full.match(/^(\d+)\s*[.\-–—]/);
-        const valeur = numMatch ? numMatch[1] : undefined;
-        // prefer the linked title text; fallback to full text
-        const a = $p.find("a").first();
-        const intitule = norm(a.text() || full.replace(/^(\d+)\s*[.\-–—]\s*/, ""));
-        // id_syceron = href target without '#' ? TODO verify
-        const href = a.attr("href") || "";
-        const idSyceron = href.startsWith("#") ? href.slice(1) : href;
-        const titreStruct = {
-            id_syceron: idSyceron || "",
-            intitule,
-        };
-        const elem = {
-            valeur_pts_odj: valeur,
-            titreStruct,
-            // sommaire2/3 undefined (first level only)
+}
+export function sessionStartYearFromDate(d) {
+    // Session year: runs from 1 Oct of year N to 30 Sept of year N+1
+    const m = d.getMonth();
+    const y = d.getFullYear();
+    return m >= 9 ? y : y - 1;
+}
+export function parseYYYYMMDD(yyyymmdd) {
+    if (!/^\d{8}$/.test(yyyymmdd))
+        return null;
+    const y = Number(yyyymmdd.slice(0, 4));
+    const m = Number(yyyymmdd.slice(4, 6)) - 1;
+    const d = Number(yyyymmdd.slice(6, 8));
+    const dt = new Date(y, m, d);
+    return Number.isFinite(dt.getTime()) ? dt : null;
+}
+export function deriveTitreObjetFromSommaire(sommaire, slot) {
+    const items = extractLevel1Items(sommaire);
+    const meaningful = items.filter(it => !isBoilerplate(it.label));
+    if (meaningful.length === 0) {
+        return {
+            titre: `Séance publique ${slotLabel(slot)}`,
+            objet: "",
         };
-        items.push(elem);
-    });
-    if (items.length) {
-        sommaire.sommaire1 = items;
     }
-    return sommaire;
+    const titre = meaningful[0].label;
+    const objet = meaningful.slice(0, 3).map(it => it.label).join(" ; ");
+    return { titre, objet };
+}
+function slotLabel(slot) {
+    switch (slot) {
+        case "MATIN": return "du matin";
+        case "APRES-MIDI": return "de l’après-midi";
+        case "SOIR": return "du soir";
+        default: return "";
+    }
 }
-function stripTrailingPunct(s) {
-    return s.replace(/\s*([:,.;])\s*$/u, "").trim();
+const BOILERPLATE_PATTERNS = [
+    /proc(?:è|e)s-?verbal/i,
+    /hommages?/i,
+    /désignation des vice-?président/i,
+    /candidatures? aux?/i,
+    /ordre du jour/i,
+    /rappels? au règlement/i,
+    /communications?/i,
+    /dépôts?/i,
+    /proclamation/i,
+    /présidence de/i,
+    /questions? diverses?/i,
+    /ouverture de la séance/i,
+    /clo(?:t|̂)ure de la séance/i,
+];
+const isBoilerplate = (label) => !label?.trim() || BOILERPLATE_PATTERNS.some(rx => rx.test(label));
+function extractLevel1Items(sommaire) {
+    const level1 = asArray(sommaire?.sommaire1);
+    return level1
+        .map(el => ({
+        numero: toInt(el?.valeur_pts_odj),
+        label: String(el?.titreStruct?.intitule ?? "").trim(),
+    }))
+        .filter(it => !!it.label)
+        .sort((a, b) => a.numero - b.numero);
 }
+function stripTrailingPunct(s) { return s.replace(/\s*([:,.;])\s*$/u, "").trim(); }
 function dedupeSpeaker(raw) {
     let s = norm(raw);
     s = stripTrailingPunct(s);
-    const dupPatterns = [
-        /^(.+?)\s*[.]\s*\1$/u,
-        /^(.+?)\s*,\s*\1,?$/u,
-        /^(.+?)\s+\1$/u,
-    ];
+    const dupPatterns = [/^(.+?)\s*[.]\s*\1$/u, /^(.+?)\s*,\s*\1,?$/u, /^(.+?)\s+\1$/u];
     for (const re of dupPatterns) {
         const m = s.match(re);
         if (m) {
@@ -74,12 +158,10 @@ function dedupeSpeaker(raw) {
     return s.replace(/\.\s*$/, "");
 }
 function decodeHtmlEntities(s) {
-    return s
-        .replace(/&#(\d+);/g, (_, d) => String.fromCharCode(parseInt(d, 10)))
+    return s.replace(/&#(\d+);/g, (_, d) => String.fromCharCode(parseInt(d, 10)))
         .replace(/&#x([0-9a-fA-F]+);/g, (_, h) => String.fromCharCode(parseInt(h, 16)));
 }
 function fixApostrophes(s) {
-    // Tighten spacing around French apostrophes and punctuation
     let out = s;
     out = out.replace(/\s*’\s*/g, "’");
     out = out.replace(/\b([dljctmsn])\s*’/gi, (_, m) => m + "’");
@@ -87,43 +169,33 @@ function fixApostrophes(s) {
     out = out.replace(/\s+([,;:.!?])/g, "$1");
     return out;
 }
-function normalizeTitle(text) {
-    return text.replace(/^PR[ÉE]SIDENCE DE\b/i, "Présidence de ");
-}
+function normalizeTitle(text) { return text.replace(/^PR[ÉE]SIDENCE DE\b/i, "Présidence de "); }
 function roleForSpeaker(labelOrQualite) {
-    const s = labelOrQualite.toLowerCase();
-    if (/^(m\.|mme)?\s*(le|la)\s+pr[ée]sident(e)?\b/.test(s) ||
-        /\bpr[ée]sident[e]?\s+de\s+séance\b/.test(s)) {
+    const s = (labelOrQualite || "").toLowerCase();
+    if (/^(m\.|mme)?\s*(le|la)\s+pr[ée]sident(e)?\b/.test(s) || /\bpr[ée]sident[e]?\s+de\s+séance\b/.test(s))
         return "président";
-    }
     return "";
 }
-// ---------------- DOM helpers ----------------
-function parseCriIntervenantComment(html) {
-    // From <!-- cri:intervenant mat="..." nom="..." qua="..." ... -->
+function readIntervenantMeta($block) {
+    const int = $block.find('cri\\:intervenant').first();
+    if (int.length)
+        return { mat: int.attr("mat") || undefined, nom: int.attr("nom") || undefined, qua: int.attr("qua") || undefined };
+    const html = $block.html() || "";
     const m = html.match(/<!--\s*cri:intervenant\b([^>]+)-->/i);
     if (!m)
         return {};
-    const attrs = m[1];
     const out = {};
     const re = /(\w+)="([^"]*)"/g;
     let a;
-    while ((a = re.exec(attrs))) {
+    while ((a = re.exec(m[1])))
         out[a[1]] = decodeHtmlEntities(a[2]);
-    }
     return { mat: out["mat"], nom: out["nom"], qua: out["qua"] };
 }
-/**
- * Extract leading .orateur_qualite chunks from the FIRST <p> only,
- * concatenate them, clean punctuation/apostrophes, and REMOVE those nodes
- * (and .orateur_nom) from the first paragraph so the speech starts cleanly.
- */
 function extractAndRemoveLeadingQualite($, $block) {
     const firstP = $block.find("p").first();
     if (firstP.length === 0)
         return "";
     const parts = [];
-    // Iterate over the first <p>'s children from the start
     let stop = false;
     firstP.contents().each((_, node) => {
         if (stop)
@@ -131,7 +203,6 @@ function extractAndRemoveLeadingQualite($, $block) {
         if (node.type === "tag") {
             const $node = $(node);
             if ($node.hasClass("orateur_nom")) {
-                // speaker label node — remove it
                 $node.remove();
                 return;
             }
@@ -140,66 +211,95 @@ function extractAndRemoveLeadingQualite($, $block) {
                 $node.remove();
                 return;
             }
-            // Non-qualite tag: if it has meaningful text, we reached the speech
             const t = norm($node.text() || "");
-            if (t) {
+            if (t)
                 stop = true;
-            }
-            else {
-                // empty-ish node; remove to avoid stray punctuation
+            else
                 $node.remove();
-            }
         }
         else if (node.type === "text") {
             const t = norm(node.data || "");
-            if (!t) {
-                // whitespace only — drop it
-                ;
-                node.data = "";
-                return;
-            }
-            // boundary punctuation like ":" just after label — drop it
-            if (/^[:.,;–—-]+$/.test(t)) {
-                ;
+            if (!t || /^[:.,;–—-]+$/.test(t)) {
                 node.data = "";
                 return;
             }
-            // any other text means speech starts here
             stop = true;
         }
-        else {
-            // comment or others — ignore
-        }
     });
-    const qual = fixApostrophes(norm(parts.join(" ")));
-    return qual;
+    return fixApostrophes(norm(parts.join(" ")));
 }
 function sanitizeInterventionHtml($, $block) {
-    // Clone to avoid mutating outer tree order
     const $clone = $block.clone();
-    // Remove navigation / anchors / images
     $clone.find('a[name]').remove();
     $clone.find('div[align="right"]').remove();
     $clone.find('a.link').remove();
     $clone.find('img').remove();
-    // Remove technical anchors inside interventions
     $clone.find('a#ameli_amendement_cri_phrase, a#ameli_amendement_cra_contenu, a#ameli_amendement_cra_objet').remove();
-    // Remove any remaining speaker label / quality spans anywhere
-    $clone.find(".orateur_nom").remove();
-    $clone.find(".orateur_qualite").remove();
-    // Strip HTML comments
+    $clone.find(".orateur_nom, .orateur_qualite").remove();
     let html = $clone.html() || "";
     html = html.replace(/<!--[\s\S]*?-->/g, "");
     return html.trim();
 }
-function extractMetadonnees($) {
-    const headerText = norm($("h1.page-title").text() || "");
-    const dateMatch = headerText.match(/\b(\d{1,2}\s+\w+\s+\d{4})\b/i);
-    const bodyText = norm($("#cri").text() || "");
-    const sessionMatch = bodyText.match(/\bsession\s+(\d{4}-\d{4})\b/i);
+function extractSommaireForIntervals($, idx, intervals) {
+    const inIv = (el) => elementInAnyInterval(el, idx, intervals);
+    const root = $("body");
+    const sommaire = { presidentSeance: { _: "" }, sommaire1: [] };
+    // (1) Présidence (tm2) — première ligne dans l’intervalle
+    const pres = root.find("p.tm2").filter((_, el) => inIv(el)).first();
+    if (pres.length)
+        sommaire.presidentSeance = { _: norm(pres.text()) };
+    // (2) Paras tm5 présents dans l’intervalle
+    const paras = [];
+    root.find("p.tm5").each((_, el) => {
+        if (!inIv(el))
+            return;
+        const t = norm($(el).text());
+        if (t)
+            paras.push({ _: t });
+    });
+    if (paras.length)
+        sommaire.para = paras.length === 1 ? paras[0] : paras;
+    // (3) Items de 1er niveau (tm3) présents dans l’intervalle
+    const items = [];
+    root.find("p.tm3").each((_, el) => {
+        if (!inIv(el))
+            return;
+        const $p = $(el);
+        const full = norm($p.text() || "");
+        if (!full)
+            return;
+        const numMatch = full.match(/^(\d+)\s*[.\-–—]\s*/);
+        const valeur = numMatch ? numMatch[1] : undefined;
+        // prefer the title text from the <a> anchor if present
+        const a = $p.find("a").first();
+        const intituleRaw = a.length ? a.text() : full.replace(/^(\d+)\s*[.\-–—]\s*/, "");
+        const intitule = norm(intituleRaw);
+        // id_syceron from href="#Niv1_SOMx"
+        const href = (a.attr("href") || "").trim();
+        const idSyceron = href.startsWith("#") ? href.slice(1) : href;
+        const titreStruct = { id_syceron: idSyceron || "", intitule };
+        items.push({ valeur_pts_odj: valeur, titreStruct });
+    });
+    if (items.length)
+        sommaire.sommaire1 = items;
+    return sommaire;
+}
+function extractMetadonnees($, filePath) {
+    let dateText = norm($("h1, h2, .page-title").first().text() || "");
+    if (!dateText)
+        dateText = norm($("p").first().text() || "");
+    const dateMatch = dateText.match(/\b(\d{1,2}\s+\w+\s+\d{4})\b/i);
+    const allText = norm($("body").text() || "");
+    const sessionMatch = allText.match(/\bsession\s+(\d{4}-\d{4})\b/i);
+    let dateSeance = dateMatch?.[1] || "";
+    if (!dateSeance) {
+        const m = filePath.match(/d(\d{4})(\d{2})(\d{2})\.xml$/i);
+        if (m)
+            dateSeance = `${m[1]}-${m[2]}-${m[3]}`;
+    }
     return {
-        dateSeance: dateMatch?.[1] || "",
-        dateSeanceJour: dateMatch?.[1] || "",
+        dateSeance,
+        dateSeanceJour: dateSeance,
         numSeanceJour: "",
         numSeance: "",
         typeAssemblee: "SN",
@@ -211,103 +311,15 @@ function extractMetadonnees($) {
         diffusion: "",
         version: "1.0",
         environnement: "",
-        heureGeneration: new Date(),
-        sommaire: extractSommaire($)
+        heureGeneration: new Date()
     };
 }
-// ---------------- main transform ----------------
-export async function parseCompteRenduFromFile(htmlFilePath) {
-    try {
-        const { window } = await JSDOM.fromFile(htmlFilePath, { contentType: "text/html" });
-        const $ = cheerio.load(window.document.documentElement.outerHTML);
-        const metadonnees = extractMetadonnees($);
-        const points = [];
-        let ordre = 0;
-        const addPoint = (p) => points.push({ ...p, ordre_absolu_seance: String(++ordre) });
-        // (1) Global section titles (common high-level headings)
-        let lastTitle = "";
-        $("#cri p[class^='titre_S']").each((_, el) => {
-            const t = normalizeTitle(norm($(el).text() || ""));
-            if (t && t !== lastTitle) {
-                addPoint({ code_grammaire: "TITRE_TEXTE_DISCUSSION", texte: { _: t }, code_style: "Titre" });
-                lastTitle = t;
-            }
-        });
-        // (2) Interventions
-        $("#cri div.intervenant").each((_, block) => {
-            const $block = $(block);
-            // (2.a) Extract internal structural titles inside this block (and remove them)
-            const structuralSel = [
-                "p[class^='titre_S']",
-                "p.mention_titre",
-                "p.intitule_titre",
-                "p.mention_chapitre",
-                "p.intitule_chapitre",
-                "p.mention_article",
-                "p.intitule_article",
-                "p.mention_section",
-                "p.intitule_section",
-            ].join(",");
-            $block.find(structuralSel).each((__, el) => {
-                const title = normalizeTitle(norm($(el).text() || ""));
-                if (title && title !== lastTitle) {
-                    addPoint({ code_grammaire: "TITRE_TEXTE_DISCUSSION", texte: { _: title }, code_style: "Titre" });
-                    lastTitle = title;
-                }
-                $(el).remove();
-            });
-            // (2.b) Speaker label & quality
-            const firstP = $block.find("p").first();
-            const speakerLabelRaw = firstP.find(".orateur_nom").text() ||
-                firstP.find("a.lien_senfic").text() ||
-                "";
-            const speakerLabel = dedupeSpeaker(speakerLabelRaw);
-            // Prefer <!--cri:intervenant ...--> for id/name/qualite when available
-            const rawHtml = $block.html() || "";
-            const { mat, nom: nomFromComment, qua: quaFromCommentRaw } = parseCriIntervenantComment(rawHtml);
-            // Extract and remove leading .orateur_qualite chunks from first <p>
-            const qualFromSpans = extractAndRemoveLeadingQualite($, $block);
-            const qualite = norm(decodeHtmlEntities(quaFromCommentRaw || "")) ||
-                qualFromSpans;
-            const canonicalName = dedupeSpeaker(nomFromComment || speakerLabel);
-            const role = roleForSpeaker(speakerLabel) ||
-                roleForSpeaker(qualite) ||
-                roleForSpeaker(quaFromCommentRaw || "");
-            // (2.c) Build cleaned speech HTML
-            let speechHtml = sanitizeInterventionHtml($, $block);
-            // If nothing meaningful remains, skip
-            if (!norm(cheerio.load(speechHtml).text() || ""))
-                return;
-            addPoint({
-                code_grammaire: "PAROLE_GENERIQUE",
-                roledebat: role,
-                orateurs: {
-                    orateur: {
-                        nom: canonicalName,
-                        id: mat || "",
-                        qualite: qualite,
-                    },
-                },
-                texte: { _: speechHtml },
-            });
-        });
-        const contenu = {
-            quantiemes: {
-                journee: metadonnees.dateSeance,
-                session: metadonnees.session,
-            },
-            point: points,
-        };
-        return {
-            uid: htmlFilePath.replace(/^.*?(\d{8}).*$/i, "$1"),
-            seanceRef: htmlFilePath.replace(/^.*?(\d{8}).*$/i, "$1"),
-            sessionRef: metadonnees.session,
-            metadonnees,
-            contenu,
-        };
-    }
-    catch (e) {
-        console.error("Could not parse compte-rendu with error", e);
-        return null;
-    }
+function elementInAnyInterval(el, idx, intervals) {
+    const p = idx.get(el);
+    if (p == null)
+        return false;
+    for (const iv of intervals)
+        if (p >= iv.start && p < iv.end)
+            return true;
+    return false;
 }

package/lib/model/util.d.ts CHANGED Viewed

@@ -6,3 +6,4 @@ export declare function removeSubstring(expr: Expression<string | null | undefin
 export declare function replace(expr: Expression<string | null | undefined>, pattern: Expression<string>, replacement: Expression<string>): import("kysely").RawBuilder<string>;
 export declare function rtrim(expr: Expression<string | null | undefined>): import("kysely").RawBuilder<string>;
 export declare function toDateString(expr: Expression<Date | null | undefined>, format?: Expression<string>): import("kysely").RawBuilder<string>;
+export declare function norm(s?: string | null): string;

package/lib/model/util.js CHANGED Viewed

@@ -21,3 +21,6 @@ export function rtrim(expr) {
 export function toDateString(expr, format = sql.val(STANDARD_DATE_FORMAT)) {
     return sql `to_char(${expr}, ${format})`;
 }
+export function norm(s) {
+    return (s || "").replace(/\u00A0/g, " ").replace(/\s+/g, " ").trim();
+}

package/lib/scripts/retrieve_agenda.js CHANGED Viewed

@@ -8,7 +8,8 @@ import { parseAgendaFromFile } from "../model/agenda";
 import { getSessionsFromStart } from "../types/sessions";
 import { ID_DATE_FORMAT } from "./datautil";
 import { commonOptions } from "./shared/cli_helpers";
-import { ensureAndClearDir } from "./shared/util";
+import { ensureAndClearDir, fetchWithRetry } from "./shared/util";
+import { groupNonSPByTypeOrganeHour, groupSeancePubliqueBySlot } from "../utils/reunion_grouping";
 const optionsDefinitions = [
     ...commonOptions,
     {
@@ -64,7 +65,7 @@ async function downloadAgenda(agendaName, agendaPath) {
     if (!options["silent"]) {
         console.log(`Downloading Agenda ${agendaUrl}…`);
     }
-    const response = await fetch(agendaUrl);
+    const response = await fetchWithRetry(agendaUrl);
     if (!response.ok) {
         if (response.status === 404) {
             console.warn(`Agenda ${agendaUrl} not found`);
@@ -80,15 +81,33 @@ async function downloadAgenda(agendaName, agendaPath) {
     }
     fs.writeFileSync(agendaPath, Buffer.from(agendaContent));
 }
+function writeGroupsAsFiles(dir, groups) {
+    for (const g of groups) {
+        const outPath = path.join(dir, `${g.uid}.json`);
+        fs.writeJSONSync(outPath, g, { spaces: 2 });
+    }
+}
 async function parseAgenda(transformedAgendaSessionDir, agendaFileName, agendaPath) {
-    if (!options["silent"]) {
+    if (!options["silent"])
         console.log(`Parsing Agenda ${agendaPath}…`);
-    }
     const parsedAgendaEvents = await parseAgendaFromFile(agendaPath);
-    if (!parsedAgendaEvents || parsedAgendaEvents.length === 0) {
+    if (!parsedAgendaEvents?.length)
         return;
+    const flatPath = path.join(transformedAgendaSessionDir, `${agendaFileName}.json`);
+    fs.writeJSONSync(flatPath, parsedAgendaEvents, { spaces: 2 });
+    // 1) SP → grouped by (date, slot)
+    const spGrouped = groupSeancePubliqueBySlot(parsedAgendaEvents);
+    if (spGrouped.length > 0) {
+        writeGroupsAsFiles(transformedAgendaSessionDir, spGrouped);
+    }
+    // 2) NON-SP → grouped by (date, organe, hour)
+    const groupedBySuffix = groupNonSPByTypeOrganeHour(parsedAgendaEvents);
+    for (const suffix of ["IDC", "IDM", "IDO", "IDI"]) {
+        const groups = groupedBySuffix[suffix] || [];
+        if (groups.length > 0) {
+            writeGroupsAsFiles(transformedAgendaSessionDir, groups);
+        }
     }
-    fs.writeJSONSync(path.join(transformedAgendaSessionDir, `${agendaFileName}.json`), parsedAgendaEvents, { spaces: 2 });
 }
 async function main() {
     const dataDir = options["dataDir"];

package/lib/scripts/retrieve_comptes_rendus.d.ts CHANGED Viewed

@@ -1 +1,6 @@
-export {};
+/**
+ * Must be run after retrieve_agenda.ts!
+ * - downloads the ZIP of comptes-rendus des débats (CRI) from data.senat.fr
+ * - extracts XML files, distributes them by session/year
+ */
+export declare function retrieveCriXmlDump(dataDir: string, options?: Record<string, any>): Promise<void>;