npm - @tricoteuses/senat - Versions diffs - 2.15.7 → 2.16.0 - Mend

@tricoteuses/senat 2.15.7 → 2.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22)

package/lib/datasets.js +0 -1
package/lib/model/agenda.js +9 -16
package/lib/model/commission.d.ts +9 -1
package/lib/model/commission.js +47 -33
package/lib/model/scrutins.js +4 -56
package/lib/model/seance.js +1 -6
package/lib/model/util.d.ts +3 -0
package/lib/model/util.js +32 -0
package/lib/scripts/retrieve_cr_commission.js +90 -72
package/lib/scripts/retrieve_videos.d.ts +0 -2
package/lib/scripts/retrieve_videos.js +57 -33
package/lib/types/agenda.d.ts +2 -2
package/lib/utils/cr_spliting.js +4 -2
package/lib/utils/reunion_grouping.d.ts +1 -1
package/lib/utils/reunion_grouping.js +13 -42
package/package.json +1 -1
package/lib/model/compte_rendu.d.ts +0 -9
package/lib/model/compte_rendu.js +0 -325
package/lib/raw_types/db.d.ts +0 -11389
package/lib/raw_types/db.js +0 -5
package/lib/scripts/retrieve_comptes_rendus.d.ts +0 -6
package/lib/scripts/retrieve_comptes_rendus.js +0 -274

package/lib/scripts/retrieve_videos.js CHANGED

@@ -7,8 +7,9 @@ import path from "path";
 import { AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, iterLoadSenatAgendas } from "../loaders";
 import { getSessionsFromStart } from "../types/sessions";
 import { commonOptions } from "./shared/cli_helpers";
+import { decodeHtmlEntities } from "../model/util";
 // ===================== Constants =====================
-const MATCH_THRESHOLD = 0.6;
+const MATCH_THRESHOLD = 0.56;
 const MAX_CANDIDATES = 15;
 const MAX_PAGES = 3;
 const STATS = { total: 0, accepted: 0 };
@@ -145,7 +146,9 @@ function extractCandidatesFromSearchHtml(html) {
         const pageUrl = `https://videos.senat.fr/video.${id}_${hash}.html`;
         const ctx = html.slice(Math.max(0, m.index - 240), Math.min(html.length, m.index + 240));
         const t = ctx.match(/title="([^"]+)"/i) || ctx.match(/>([^<]{10,200})</);
-        out.push({ id, hash, pageUrl, title: t?.[1] });
+        const title = t?.[1]?.trim();
+        const isSeancePublique = title?.toLowerCase().includes("séance publique") ?? false;
+        out.push({ id, hash, pageUrl, title, isSeancePublique });
     }
     const seen = new Set();
     return out.filter((c) => {
@@ -157,20 +160,28 @@ function extractCandidatesFromSearchHtml(html) {
     });
 }
 function parseDataNvs(nvs) {
-    const epoch = nvs.match(/<metadata\s+name="date"\s+value="(\d+)"/i)?.[1];
-    const title = nvs.match(/<metadata\s+name="title"\s+value="([^"]+)"/i)?.[1];
-    return { epoch: epoch ? Number(epoch) : undefined, title };
+    const epochStr = nvs.match(/<metadata\s+name="date"\s+value="(\d+)"/i)?.[1];
+    const epoch = epochStr ? Number(epochStr) : undefined;
+    const organesTag = nvs.match(/<metadata\b[^>]*\bname="organes"[^>]*>/i)?.[0];
+    let organeLabel;
+    let organeValue;
+    if (organesTag) {
+        organeLabel = organesTag.match(/\blabel="([^"]+)"/i)?.[1];
+        organeValue = organesTag.match(/\bvalue="([^"]+)"/i)?.[1];
+    }
+    const organeRaw = organeLabel ?? organeValue;
+    const organe = decodeHtmlEntities(organeRaw)?.trim();
+    const firstChapterLabel = decodeHtmlEntities(nvs.match(/<chapter\b[^>]*\blabel="([^"]+)"/i)[1]).trim();
+    return { epoch, organe, firstChapterLabel };
 }
-// nvsText = contenu texte de data.nvs (utf-8)
-// finalText = contenu texte de finalplayer.nvs (utf-8)
-export function buildSenatVodMasterM3u8FromNvs(nvsText, finalText) {
-    // 1) Base Akamai depuis data.nvs (mp4 "serverfiles://senat/YYYY/MM/encoderX_YYYYMMDDHHMMSS_1.mp4")
+function buildSenatVodMasterM3u8FromNvs(nvsText, finalText) {
+    // 1) Base Akamai from data.nvs (mp4 "serverfiles://senat/YYYY/MM/encoderX_YYYYMMDDHHMMSS_1.mp4")
     const baseMatch = nvsText.match(/serverfiles:\/\/senat\/(\d{4})\/(\d{2})\/(encoder\d)_(\d{14})/i);
     if (!baseMatch)
         return null;
     const [, yyyy, mm, encoder, stamp] = baseMatch;
     const base = `https://vodsenat.akamaized.net/senat/${yyyy}/${mm}/${encoder}_${stamp}`;
-    // 2) start/end depuis finalplayer.nvs
+    // 2) start/end from finalplayer.nvs
     let start = null, end = null;
     const playerAttr = finalText.match(/player[^>]*\bstarttime="(\d+)"[^>]*\bendtime="(\d+)"/i);
     if (playerAttr) {
@@ -178,12 +189,11 @@ export function buildSenatVodMasterM3u8FromNvs(nvsText, finalText) {
         end = parseInt(playerAttr[2], 10);
     }
     else {
-        // fallback: prendre le plus petit timecode des <synchro timecode="...">
+        // fallback: take smallest timecode of <synchro timecode="...">
         const tc = Array.from(finalText.matchAll(/timecode="(\d+)"/g)).map((m) => parseInt(m[1], 10));
         if (tc.length)
             start = Math.min(...tc);
     }
-    // 3) si pas d'end, on peut déduire via "duree" (en secondes) de data.nvs
     if (end == null) {
         const durMeta = nvsText.match(/<metadata[^>]*\bname="duree"[^>]*\bvalue="(\d+)"[^>]*>/i);
         if (durMeta && start != null) {
@@ -191,16 +201,14 @@ export function buildSenatVodMasterM3u8FromNvs(nvsText, finalText) {
             end = start + durMs;
         }
     }
-    // 4) Construction de l’URL
-    //    - si on a start & end → utiliser ps/pd (robuste et conforme à ce que sert le Sénat)
-    //    - sinon fallback sans suffixe (souvent valide aussi)
+    // 4) Construct URL
     if (start != null && end != null && end > start) {
         const pd = end - start;
         return `${base}_ps${start}_pd${pd}.smil/master.m3u8`;
     }
     return `${base}.smil/master.m3u8`;
 }
-export function score(agenda, agendaTs, videoTitle, videoEpoch) {
+function score(agenda, agendaTs, videoTitle, videoEpoch, videoOrgane) {
     const titleScore = dice(agenda.titre || "", videoTitle || "");
     let timeScore = 0;
     if (agendaTs && videoEpoch) {
@@ -209,15 +217,11 @@ export function score(agenda, agendaTs, videoTitle, videoEpoch) {
         // delta : 180min
         timeScore = Math.max(0, 1 - deltaMin / 180);
     }
-    let orgBonus = 0;
-    if (agenda.organe && videoTitle) {
-        const o = normalize(agenda.organe);
-        const t = normalize(videoTitle);
-        const first = o.split(" ").filter(Boolean)[0];
-        if (first && t.includes(first))
-            orgBonus = 0.15;
+    const orgScore = videoOrgane && agenda.organe ? dice(agenda.organe, videoOrgane) : 0;
+    if (orgScore === 0 && agenda.organe === "Séance publique") {
+        return 0.5 * titleScore + 0.5 * timeScore;
     }
-    return 0.3 * titleScore + 0.7 * timeScore + orgBonus; // Can be adjusted
+    return 0.4 * titleScore + 0.3 * timeScore + orgScore * 0.3;
 }
 /**
  * Build search strategies for senat's videos
@@ -263,19 +267,16 @@ async function fetchAllSearchPages(args, baseDir, strategyIndex, maxPages = MAX_
 async function processGroupedReunion(agenda, session, dataDir) {
     if (!agenda)
         return;
-    // 1) Garde-fous
+    // 1) GuardRails
     if (!agenda.captationVideo) {
-        if (!options["silent"])
-            console.log(`[skip] ${agenda.uid} captationVideo=false`);
+        // if (!options["silent"]) console.log(`[skip] ${agenda.uid} captationVideo=false`)
         return;
     }
     if (!agenda.date || !agenda.startTime) {
-        if (!options["silent"])
-            console.log(`[skip] ${agenda.uid} date/hour missing`);
+        // if (!options["silent"]) console.log(`[skip] ${agenda.uid} date/hour missing`)
         return;
     }
     STATS.total++;
-    // 2) Dossier de sortie (utilise directement l'UID)
     const reunionUid = agenda.uid;
     const baseDir = path.join(dataDir, VIDEOS_ROOT_FOLDER, String(session), reunionUid);
     await fs.ensureDir(baseDir);
@@ -312,9 +313,29 @@ async function processGroupedReunion(agenda, session, dataDir) {
         if (!buf)
             continue;
         const meta = parseDataNvs(buf.toString("utf-8"));
-        const s = score(agenda, agendaTs, c.title ?? meta.title, meta.epoch);
+        // If organes are different, go to next candidates
+        if (meta.organe && agenda.organe) {
+            const videoOrgNorm = normalize(meta.organe);
+            const agendaOrgNorm = normalize(agenda.organe);
+            if (dice(agendaOrgNorm, videoOrgNorm) < 0.5) {
+                continue;
+            }
+        }
+        let videoTitle = c.title;
+        if (c.isSeancePublique && meta.firstChapterLabel) {
+            videoTitle = meta.firstChapterLabel;
+        }
+        const s = score(agenda, agendaTs, videoTitle, meta.epoch, meta.organe);
         if (!best || s > best.score) {
-            best = { id: c.id, hash: c.hash, pageUrl: c.pageUrl, epoch: meta.epoch, vtitle: c.title ?? meta.title, score: s };
+            best = {
+                id: c.id,
+                hash: c.hash,
+                pageUrl: c.pageUrl,
+                epoch: meta.epoch,
+                vtitle: videoTitle,
+                score: s,
+                vorgane: meta.organe,
+            };
         }
     }
     if (!best) {
@@ -326,7 +347,10 @@ async function processGroupedReunion(agenda, session, dataDir) {
     if (accepted)
         STATS.accepted++;
     if (!options["silent"]) {
-        console.log(`[pick] ${agenda.uid} best id=${best.id} hash=${best.hash} score=${best.score.toFixed(2)} accepted=${accepted} (strategy=${usedStrategy})`);
+        console.log(`[pick] ${agenda.uid} score=${best.score.toFixed(2)}
+      agenda title="${agenda.titre ?? ""}" agenda organe="${agenda.organe ?? ""}"
+      best title="${best.vtitle ?? ""}" best organe="${best.vorgane ?? ""}"
+      accepted=${accepted} (strategy=${usedStrategy})`);
     }
     // ==== 3) Write metadata + NVS of the best candidate (always) ====
     const bestDt = best?.epoch ? epochToParisDateTime(best.epoch) : null;

package/lib/types/agenda.d.ts CHANGED

@@ -1,11 +1,11 @@
 export interface AgendaEvent {
     id: string;
     type: string | null;
-    date: string | null;
+    date: string;
     startTime: string | null;
     endTime: string | null;
     timeOriginal: string | null;
-    titre: string | null;
+    titre: string;
     organe: string | null;
     objet: string | null;
     lieu: string | null;

package/lib/utils/cr_spliting.js CHANGED

@@ -349,12 +349,13 @@ export async function linkCRtoCommissionGroup(opts) {
             uid,
             chambre: "SN",
             date: dateISO,
-            type: organeDetected ?? "Commissions",
+            type: "Commission",
             startTime: hourShortToStartTime(hourShort),
             endTime: null,
             captationVideo: false,
             titre: titreGuess ?? "",
-            objet: "",
+            organe: organeDetected ?? "Commission",
+            objet: titreGuess ?? "",
             events: [],
             compteRenduRefUid: crUid,
         };
@@ -364,6 +365,7 @@ export async function linkCRtoCommissionGroup(opts) {
     else {
         group.compteRenduRefUid = crUid;
         updated = true;
+        console.log(`[AGENDA][COM] Updated group uid=${uid} for CR uid=${crUid}`);
     }
     // Lien CR
     // Enrichir depuis CR si vide

package/lib/utils/reunion_grouping.d.ts CHANGED

@@ -2,7 +2,7 @@ import { AgendaEvent, GroupedReunion, TimeSlot } from "../types/agenda";
 type KnownType = "SP" | "COM" | "MC" | "OD" | "ID";
 export declare function groupNonSPByTypeOrganeHour(events: AgendaEvent[]): Record<"IDC" | "IDM" | "IDO" | "IDI", GroupedReunion[]>;
 export declare function groupSeancePubliqueBySlot(events: AgendaEvent[]): Record<TimeSlot, GroupedReunion[]>;
-export declare function makeTypeGroupUid(dateISO: string, kind: KnownType, hourShort: string | null, organe?: string | null): string;
+export declare function makeTypeGroupUid(dateISO: string, kind: KnownType, agendaEventId: string, organe?: string | null): string;
 export declare function makeGroupUid(date: string, slot: TimeSlot): string;
 export declare function formatYYYYMMDD(dateYYYYMMDD: string): string;
 export declare function makeReunionUid(agenda: AgendaEvent): string;

package/lib/utils/reunion_grouping.js CHANGED

@@ -28,56 +28,27 @@ export function groupNonSPByTypeOrganeHour(events) {
     const nonSP = events.filter((e) => !isSeancePublique(e?.type));
     if (nonSP.length === 0)
         return out;
-    const buckets = new Map();
     for (const e of nonSP) {
         const kind = classifyAgendaType(e?.type);
         if (!kind || kind === "SP")
             continue;
         const { startISO, endISO } = deriveTimesForEvent(e);
-        const hourShort = hourShortFromISO(startISO) ?? hourShortFromOriginal(e.timeOriginal);
-        const org = e.organe ?? "NA_ORG";
-        const key = [e.date, kind, org, hourShort || "NA"].join("|");
-        if (!buckets.has(key))
-            buckets.set(key, []);
-        buckets.get(key).push({ ...e, startTime: startISO ?? e.startTime, endTime: endISO ?? e.endTime });
-    }
-    for (const [key, list] of buckets) {
-        const [date, kindStr, organe, hourShort] = key.split("|");
-        const kind = kindStr;
-        const enriched = list
-            .map((ev) => {
-            const { startISO, endISO } = deriveTimesForEvent(ev);
-            return { ev, startISO: startISO ?? ev.startTime, endISO: endISO ?? ev.endTime };
-        })
-            .sort((a, b) => {
-            const ta = a.startISO ? (parseISO(a.startISO)?.toMillis() ?? Number.MAX_SAFE_INTEGER) : Number.MAX_SAFE_INTEGER;
-            const tb = b.startISO ? (parseISO(b.startISO)?.toMillis() ?? Number.MAX_SAFE_INTEGER) : Number.MAX_SAFE_INTEGER;
-            return ta - tb;
-        });
-        const startTime = enriched.find((x) => !!x.startISO)?.startISO ?? null;
-        const endTime = enriched.reduce((acc, x) => {
-            const de = x.endISO ? parseISO(x.endISO)?.toMillis() : null;
-            const accMs = acc ? parseISO(acc)?.toMillis() : null;
-            if (de != null && (accMs == null || de > accMs))
-                return x.endISO;
-            return acc;
-        }, null);
-        const any = enriched[0]?.ev;
-        const hour = hourShort !== "NA" ? hourShort : (hourShortFromISO(startTime) ?? hourShortFromOriginal(any?.timeOriginal));
-        const uid = makeTypeGroupUid(date, kind, hour ?? "", any?.organe ?? "");
+        const startTime = startISO ?? e.startTime ?? null;
+        const endTime = endISO ?? e.endTime ?? null;
+        const uid = makeTypeGroupUid(e.date, kind, e.id, e.organe ?? null);
         const suffix = (kind === "COM" ? "IDC" : kind === "MC" ? "IDM" : kind === "OD" ? "IDO" : "IDI");
         const group = {
             uid,
             chambre: "SN",
-            date,
-            type: any?.type || "",
-            organe: any?.organe || undefined,
+            date: e.date,
+            type: e.type || "",
+            organe: e.organe || undefined,
             startTime,
             endTime,
-            captationVideo: enriched.some((x) => x.ev.captationVideo === true),
-            titre: compactTitleList(enriched.map((x) => x.ev.titre || "").filter(Boolean), 8),
-            objet: joinObjets(enriched.map((x) => x.ev)),
-            events: enriched.map((x) => x.ev),
+            captationVideo: e.captationVideo === true,
+            titre: e.titre,
+            objet: e.objet || "",
+            events: [e],
         };
         out[suffix].push(group);
     }
@@ -270,12 +241,12 @@ function organeInitials(input, maxLen = 8) {
     const out = letters.join("");
     return out.slice(0, maxLen);
 }
-export function makeTypeGroupUid(dateISO, kind, hourShort, organe) {
+export function makeTypeGroupUid(dateISO, kind, agendaEventId, organe) {
     const ymd = dateISO ? formatYYYYMMDD(dateISO) : "00000000";
     const suffix = typeToSuffixStrict(kind);
-    const hh = hourShort ?? "NA";
     const org = organe ? organeInitials(organe) : "";
-    return `RUSN${ymd}${suffix}${org ? org : ""}-${hh}`;
+    let base = `RUSN${ymd}${suffix}${org ? org : ""}${agendaEventId}`;
+    return base;
 }
 function parseISO(isoLike) {
     if (!isoLike)

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@tricoteuses/senat",
-  "version": "2.15.7",
+  "version": "2.16.0",
   "description": "Handle French Sénat's open data",
   "keywords": [
     "France",

package/lib/model/compte_rendu.d.ts DELETED

@@ -1,9 +0,0 @@
-import { CompteRendu, Sommaire } from "../types/compte_rendu";
-import { TimeSlot } from "../types/agenda";
-export declare function parseCompteRenduSlotFromFile(xmlFilePath: string, wantedSlot: TimeSlot, firstSlotOfDay?: TimeSlot): Promise<CompteRendu | null>;
-export declare function sessionStartYearFromDate(d: Date): number;
-export declare function parseYYYYMMDD(yyyymmdd: string): Date | null;
-export declare function deriveTitreObjetFromSommaire(sommaire: Sommaire | undefined, slot: TimeSlot): {
-    titre: string;
-    objet: string;
-};

package/lib/model/compte_rendu.js DELETED

@@ -1,325 +0,0 @@
-import fs from "fs";
-import * as cheerio from "cheerio";
-import path from "path";
-import { computeIntervalsBySlot } from "../utils/cr_spliting";
-import { norm } from "./util";
-const asArray = (x) => x == null ? [] : Array.isArray(x) ? x : [x];
-const toInt = (s) => Number.isFinite(Number(s)) ? Number(s) : Number.POSITIVE_INFINITY;
-export async function parseCompteRenduSlotFromFile(xmlFilePath, wantedSlot, firstSlotOfDay) {
-    try {
-        const raw = fs.readFileSync(xmlFilePath, "utf8");
-        const $ = cheerio.load(raw, { xml: false });
-        const metadonnees = extractMetadonnees($, xmlFilePath);
-        const order = $("body *").toArray();
-        const idx = new Map(order.map((el, i) => [el, i]));
-        const intervalsAll = computeIntervalsBySlot($, idx, firstSlotOfDay);
-        const intervals = intervalsAll.filter(iv => iv.slot === wantedSlot);
-        if (intervals.length === 0) {
-            console.warn(`[CRI] no intervals for ${path.basename(xmlFilePath)} [${wantedSlot}]`);
-            return null;
-        }
-        metadonnees.sommaire = extractSommaireForIntervals($, idx, intervals);
-        const points = [];
-        let ordre = 0;
-        const addPoint = (p) => points.push({ ...p, ordre_absolu_seance: String(++ordre) });
-        // Titles
-        $("cri\\:titreS1 p.titre_S1").each((_, el) => {
-            if (!elementInAnyInterval(el, idx, intervals))
-                return;
-            const t = normalizeTitle(norm($(el).text() || ""));
-            if (t)
-                addPoint({ code_grammaire: "TITRE_TEXTE_DISCUSSION", texte: { _: t }, code_style: "Titre" });
-        });
-        // Interventions
-        $("div.intervenant").each((_, block) => {
-            if (!elementInAnyInterval(block, idx, intervals))
-                return;
-            const $block = $(block);
-            $block.find([
-                "p[class^='titre_S']",
-                "p.mention_titre",
-                "p.intitule_titre",
-                "p.mention_chapitre",
-                "p.intitule_chapitre",
-                "p.mention_article",
-                "p.intitule_article",
-                "p.mention_section",
-                "p.intitule_section",
-            ].join(",")).remove();
-            const firstP = $block.find("p").first();
-            const speakerLabelRaw = firstP.find(".orateur_nom").text() || firstP.find("a.lien_senfic").text() || "";
-            const speakerLabel = dedupeSpeaker(speakerLabelRaw);
-            const { mat, nom: nomCRI, qua: quaCRI } = readIntervenantMeta($block);
-            const qualFromSpans = extractAndRemoveLeadingQualite($, $block);
-            const qualite = norm(decodeHtmlEntities(quaCRI || "")) || qualFromSpans;
-            const canonicalName = dedupeSpeaker(nomCRI || speakerLabel);
-            const role = roleForSpeaker(speakerLabel) || roleForSpeaker(qualite) || roleForSpeaker(quaCRI || "");
-            const speechHtml = sanitizeInterventionHtml($, $block);
-            if (!norm(cheerio.load(speechHtml).text() || ""))
-                return;
-            addPoint({
-                code_grammaire: "PAROLE_GENERIQUE",
-                roledebat: role,
-                orateurs: { orateur: { nom: canonicalName, id: mat || "", qualite } },
-                texte: { _: speechHtml },
-            });
-        });
-        const contenu = {
-            quantiemes: { journee: metadonnees.dateSeance, session: metadonnees.session },
-            point: points,
-        };
-        return {
-            uid: "CRSSN" + xmlFilePath.replace(/^.*?(\d{8}).*$/i, "$1") + `-${wantedSlot}`,
-            seanceRef: "RUSN" + xmlFilePath.replace(/^.*?(\d{8}).*$/i, "$1") + "IDS-" + wantedSlot,
-            sessionRef: metadonnees.session,
-            metadonnees,
-            contenu,
-        };
-    }
-    catch (e) {
-        console.error(`[CRI] parseSlot error file=${xmlFilePath} slot=${wantedSlot}:`, e);
-        return null;
-    }
-}
-export function sessionStartYearFromDate(d) {
-    // Session (1th oct N → 30 sept N+1)
-    const m = d.getMonth();
-    const y = d.getFullYear();
-    return m >= 9 ? y : y - 1;
-}
-export function parseYYYYMMDD(yyyymmdd) {
-    if (!/^\d{8}$/.test(yyyymmdd))
-        return null;
-    const y = Number(yyyymmdd.slice(0, 4));
-    const m = Number(yyyymmdd.slice(4, 6)) - 1;
-    const d = Number(yyyymmdd.slice(6, 8));
-    const dt = new Date(y, m, d);
-    return Number.isFinite(dt.getTime()) ? dt : null;
-}
-export function deriveTitreObjetFromSommaire(sommaire, slot) {
-    const items = extractLevel1Items(sommaire);
-    const meaningful = items.filter(it => !isBoilerplate(it.label));
-    if (meaningful.length === 0) {
-        return {
-            titre: `Séance publique ${slotLabel(slot)}`,
-            objet: "",
-        };
-    }
-    const titre = meaningful[0].label;
-    const objet = meaningful.slice(0, 3).map(it => it.label).join(" ; ");
-    return { titre, objet };
-}
-function slotLabel(slot) {
-    switch (slot) {
-        case "MATIN": return "du matin";
-        case "APRES-MIDI": return "de l’après-midi";
-        case "SOIR": return "du soir";
-        default: return "";
-    }
-}
-const BOILERPLATE_PATTERNS = [
-    /proc(?:è|e)s-?verbal/i,
-    /hommages?/i,
-    /désignation des vice-?président/i,
-    /candidatures? aux?/i,
-    /ordre du jour/i,
-    /rappels? au règlement/i,
-    /communications?/i,
-    /dépôts?/i,
-    /proclamation/i,
-    /présidence de/i,
-    /questions? diverses?/i,
-    /ouverture de la séance/i,
-    /clo(?:t|̂)ure de la séance/i,
-];
-const isBoilerplate = (label) => !label?.trim() || BOILERPLATE_PATTERNS.some(rx => rx.test(label));
-function extractLevel1Items(sommaire) {
-    const level1 = asArray(sommaire?.sommaire1);
-    return level1
-        .map(el => ({
-        numero: toInt(el?.valeur_pts_odj),
-        label: String(el?.titreStruct?.intitule ?? "").trim(),
-    }))
-        .filter(it => !!it.label)
-        .sort((a, b) => a.numero - b.numero);
-}
-function stripTrailingPunct(s) { return s.replace(/\s*([:,.;])\s*$/u, "").trim(); }
-function dedupeSpeaker(raw) {
-    let s = norm(raw);
-    s = stripTrailingPunct(s);
-    const dupPatterns = [/^(.+?)\s*[.]\s*\1$/u, /^(.+?)\s*,\s*\1,?$/u, /^(.+?)\s+\1$/u];
-    for (const re of dupPatterns) {
-        const m = s.match(re);
-        if (m) {
-            s = m[1];
-            break;
-        }
-    }
-    return s.replace(/\.\s*$/, "");
-}
-function decodeHtmlEntities(s) {
-    return s.replace(/&#(\d+);/g, (_, d) => String.fromCharCode(parseInt(d, 10)))
-        .replace(/&#x([0-9a-fA-F]+);/g, (_, h) => String.fromCharCode(parseInt(h, 16)));
-}
-function fixApostrophes(s) {
-    let out = s;
-    out = out.replace(/\s*’\s*/g, "’");
-    out = out.replace(/\b([dljctmsn])\s*’/gi, (_, m) => m + "’");
-    out = out.replace(/’\s+([A-Za-zÀ-ÖØ-öø-ÿ])/g, "’$1");
-    out = out.replace(/\s+([,;:.!?])/g, "$1");
-    return out;
-}
-function normalizeTitle(text) { return text.replace(/^PR[ÉE]SIDENCE DE\b/i, "Présidence de "); }
-function roleForSpeaker(labelOrQualite) {
-    const s = (labelOrQualite || "").toLowerCase();
-    if (/^(m\.|mme)?\s*(le|la)\s+pr[ée]sident(e)?\b/.test(s) || /\bpr[ée]sident[e]?\s+de\s+séance\b/.test(s))
-        return "président";
-    return "";
-}
-function readIntervenantMeta($block) {
-    const int = $block.find('cri\\:intervenant').first();
-    if (int.length)
-        return { mat: int.attr("mat") || undefined, nom: int.attr("nom") || undefined, qua: int.attr("qua") || undefined };
-    const html = $block.html() || "";
-    const m = html.match(/<!--\s*cri:intervenant\b([^>]+)-->/i);
-    if (!m)
-        return {};
-    const out = {};
-    const re = /(\w+)="([^"]*)"/g;
-    let a;
-    while ((a = re.exec(m[1])))
-        out[a[1]] = decodeHtmlEntities(a[2]);
-    return { mat: out["mat"], nom: out["nom"], qua: out["qua"] };
-}
-function extractAndRemoveLeadingQualite($, $block) {
-    const firstP = $block.find("p").first();
-    if (firstP.length === 0)
-        return "";
-    const parts = [];
-    let stop = false;
-    firstP.contents().each((_, node) => {
-        if (stop)
-            return;
-        if (node.type === "tag") {
-            const $node = $(node);
-            if ($node.hasClass("orateur_nom")) {
-                $node.remove();
-                return;
-            }
-            if ($node.hasClass("orateur_qualite")) {
-                parts.push($node.text() || "");
-                $node.remove();
-                return;
-            }
-            const t = norm($node.text() || "");
-            if (t)
-                stop = true;
-            else
-                $node.remove();
-        }
-        else if (node.type === "text") {
-            const t = norm(node.data || "");
-            if (!t || /^[:.,;–—-]+$/.test(t)) {
-                node.data = "";
-                return;
-            }
-            stop = true;
-        }
-    });
-    return fixApostrophes(norm(parts.join(" ")));
-}
-function sanitizeInterventionHtml($, $block) {
-    const $clone = $block.clone();
-    $clone.find('a[name]').remove();
-    $clone.find('div[align="right"]').remove();
-    $clone.find('a.link').remove();
-    $clone.find('img').remove();
-    $clone.find('a#ameli_amendement_cri_phrase, a#ameli_amendement_cra_contenu, a#ameli_amendement_cra_objet').remove();
-    $clone.find(".orateur_nom, .orateur_qualite").remove();
-    let html = $clone.html() || "";
-    html = html.replace(/<!--[\s\S]*?-->/g, "");
-    return html.trim();
-}
-function extractSommaireForIntervals($, idx, intervals) {
-    const inIv = (el) => elementInAnyInterval(el, idx, intervals);
-    const root = $("body");
-    const sommaire = { presidentSeance: { _: "" }, sommaire1: [] };
-    // (1) Présidence (tm2) — première ligne dans l’intervalle
-    const pres = root.find("p.tm2").filter((_, el) => inIv(el)).first();
-    if (pres.length)
-        sommaire.presidentSeance = { _: norm(pres.text()) };
-    // (2) Paras tm5 présents dans l’intervalle
-    const paras = [];
-    root.find("p.tm5").each((_, el) => {
-        if (!inIv(el))
-            return;
-        const t = norm($(el).text());
-        if (t)
-            paras.push({ _: t });
-    });
-    if (paras.length)
-        sommaire.para = paras.length === 1 ? paras[0] : paras;
-    // (3) Items de 1er niveau (tm3) présents dans l’intervalle
-    const items = [];
-    root.find("p.tm3").each((_, el) => {
-        if (!inIv(el))
-            return;
-        const $p = $(el);
-        const full = norm($p.text() || "");
-        if (!full)
-            return;
-        const numMatch = full.match(/^(\d+)\s*[.\-–—]\s*/);
-        const valeur = numMatch ? numMatch[1] : undefined;
-        // prefere intitule in ancre <a> if present
-        const a = $p.find("a").first();
-        const intituleRaw = a.length ? a.text() : full.replace(/^(\d+)\s*[.\-–—]\s*/, "");
-        const intitule = norm(intituleRaw);
-        // id_syceron from href="#Niv1_SOMx"
-        const href = (a.attr("href") || "").trim();
-        const idSyceron = href.startsWith("#") ? href.slice(1) : href;
-        const titreStruct = { id_syceron: idSyceron || "", intitule };
-        items.push({ valeur_pts_odj: valeur, titreStruct });
-    });
-    if (items.length)
-        sommaire.sommaire1 = items;
-    return sommaire;
-}
-function extractMetadonnees($, filePath) {
-    let dateText = norm($("h1, h2, .page-title").first().text() || "");
-    if (!dateText)
-        dateText = norm($("p").first().text() || "");
-    const dateMatch = dateText.match(/\b(\d{1,2}\s+\w+\s+\d{4})\b/i);
-    const allText = norm($("body").text() || "");
-    const sessionMatch = allText.match(/\bsession\s+(\d{4}-\d{4})\b/i);
-    let dateSeance = dateMatch?.[1] || "";
-    if (!dateSeance) {
-        const m = filePath.match(/d(\d{4})(\d{2})(\d{2})\.xml$/i);
-        if (m)
-            dateSeance = `${m[1]}-${m[2]}-${m[3]}`;
-    }
-    return {
-        dateSeance,
-        dateSeanceJour: dateSeance,
-        numSeanceJour: "",
-        numSeance: "",
-        typeAssemblee: "SN",
-        legislature: "",
-        session: sessionMatch?.[1] || "",
-        nomFichierJo: "",
-        validite: "",
-        etat: "",
-        diffusion: "",
-        version: "1.0",
-        environnement: "",
-        heureGeneration: new Date()
-    };
-}
-function elementInAnyInterval(el, idx, intervals) {
-    const p = idx.get(el);
-    if (p == null)
-        return false;
-    for (const iv of intervals)
-        if (p >= iv.start && p < iv.end)
-            return true;
-    return false;
-}