@tricoteuses/senat 2.15.7 → 2.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,8 +7,9 @@ import path from "path";
7
7
  import { AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, iterLoadSenatAgendas } from "../loaders";
8
8
  import { getSessionsFromStart } from "../types/sessions";
9
9
  import { commonOptions } from "./shared/cli_helpers";
10
+ import { decodeHtmlEntities } from "../model/util";
10
11
  // ===================== Constants =====================
11
- const MATCH_THRESHOLD = 0.6;
12
+ const MATCH_THRESHOLD = 0.56;
12
13
  const MAX_CANDIDATES = 15;
13
14
  const MAX_PAGES = 3;
14
15
  const STATS = { total: 0, accepted: 0 };
@@ -145,7 +146,9 @@ function extractCandidatesFromSearchHtml(html) {
145
146
  const pageUrl = `https://videos.senat.fr/video.${id}_${hash}.html`;
146
147
  const ctx = html.slice(Math.max(0, m.index - 240), Math.min(html.length, m.index + 240));
147
148
  const t = ctx.match(/title="([^"]+)"/i) || ctx.match(/>([^<]{10,200})</);
148
- out.push({ id, hash, pageUrl, title: t?.[1] });
149
+ const title = t?.[1]?.trim();
150
+ const isSeancePublique = title?.toLowerCase().includes("séance publique") ?? false;
151
+ out.push({ id, hash, pageUrl, title, isSeancePublique });
149
152
  }
150
153
  const seen = new Set();
151
154
  return out.filter((c) => {
@@ -157,20 +160,28 @@ function extractCandidatesFromSearchHtml(html) {
157
160
  });
158
161
  }
159
162
  function parseDataNvs(nvs) {
160
- const epoch = nvs.match(/<metadata\s+name="date"\s+value="(\d+)"/i)?.[1];
161
- const title = nvs.match(/<metadata\s+name="title"\s+value="([^"]+)"/i)?.[1];
162
- return { epoch: epoch ? Number(epoch) : undefined, title };
163
+ const epochStr = nvs.match(/<metadata\s+name="date"\s+value="(\d+)"/i)?.[1];
164
+ const epoch = epochStr ? Number(epochStr) : undefined;
165
+ const organesTag = nvs.match(/<metadata\b[^>]*\bname="organes"[^>]*>/i)?.[0];
166
+ let organeLabel;
167
+ let organeValue;
168
+ if (organesTag) {
169
+ organeLabel = organesTag.match(/\blabel="([^"]+)"/i)?.[1];
170
+ organeValue = organesTag.match(/\bvalue="([^"]+)"/i)?.[1];
171
+ }
172
+ const organeRaw = organeLabel ?? organeValue;
173
+ const organe = decodeHtmlEntities(organeRaw)?.trim();
174
+ const firstChapterLabel = decodeHtmlEntities(nvs.match(/<chapter\b[^>]*\blabel="([^"]+)"/i)[1]).trim();
175
+ return { epoch, organe, firstChapterLabel };
163
176
  }
164
- // nvsText = contenu texte de data.nvs (utf-8)
165
- // finalText = contenu texte de finalplayer.nvs (utf-8)
166
- export function buildSenatVodMasterM3u8FromNvs(nvsText, finalText) {
167
- // 1) Base Akamai depuis data.nvs (mp4 "serverfiles://senat/YYYY/MM/encoderX_YYYYMMDDHHMMSS_1.mp4")
177
+ function buildSenatVodMasterM3u8FromNvs(nvsText, finalText) {
178
+ // 1) Base Akamai from data.nvs (mp4 "serverfiles://senat/YYYY/MM/encoderX_YYYYMMDDHHMMSS_1.mp4")
168
179
  const baseMatch = nvsText.match(/serverfiles:\/\/senat\/(\d{4})\/(\d{2})\/(encoder\d)_(\d{14})/i);
169
180
  if (!baseMatch)
170
181
  return null;
171
182
  const [, yyyy, mm, encoder, stamp] = baseMatch;
172
183
  const base = `https://vodsenat.akamaized.net/senat/${yyyy}/${mm}/${encoder}_${stamp}`;
173
- // 2) start/end depuis finalplayer.nvs
184
+ // 2) start/end from finalplayer.nvs
174
185
  let start = null, end = null;
175
186
  const playerAttr = finalText.match(/player[^>]*\bstarttime="(\d+)"[^>]*\bendtime="(\d+)"/i);
176
187
  if (playerAttr) {
@@ -178,12 +189,11 @@ export function buildSenatVodMasterM3u8FromNvs(nvsText, finalText) {
178
189
  end = parseInt(playerAttr[2], 10);
179
190
  }
180
191
  else {
181
- // fallback: prendre le plus petit timecode des <synchro timecode="...">
192
+ // fallback: take smallest timecode of <synchro timecode="...">
182
193
  const tc = Array.from(finalText.matchAll(/timecode="(\d+)"/g)).map((m) => parseInt(m[1], 10));
183
194
  if (tc.length)
184
195
  start = Math.min(...tc);
185
196
  }
186
- // 3) si pas d'end, on peut déduire via "duree" (en secondes) de data.nvs
187
197
  if (end == null) {
188
198
  const durMeta = nvsText.match(/<metadata[^>]*\bname="duree"[^>]*\bvalue="(\d+)"[^>]*>/i);
189
199
  if (durMeta && start != null) {
@@ -191,16 +201,14 @@ export function buildSenatVodMasterM3u8FromNvs(nvsText, finalText) {
191
201
  end = start + durMs;
192
202
  }
193
203
  }
194
- // 4) Construction de l’URL
195
- // - si on a start & end → utiliser ps/pd (robuste et conforme à ce que sert le Sénat)
196
- // - sinon fallback sans suffixe (souvent valide aussi)
204
+ // 4) Construct URL
197
205
  if (start != null && end != null && end > start) {
198
206
  const pd = end - start;
199
207
  return `${base}_ps${start}_pd${pd}.smil/master.m3u8`;
200
208
  }
201
209
  return `${base}.smil/master.m3u8`;
202
210
  }
203
- export function score(agenda, agendaTs, videoTitle, videoEpoch) {
211
+ function score(agenda, agendaTs, videoTitle, videoEpoch, videoOrgane) {
204
212
  const titleScore = dice(agenda.titre || "", videoTitle || "");
205
213
  let timeScore = 0;
206
214
  if (agendaTs && videoEpoch) {
@@ -209,15 +217,11 @@ export function score(agenda, agendaTs, videoTitle, videoEpoch) {
209
217
  // delta : 180min
210
218
  timeScore = Math.max(0, 1 - deltaMin / 180);
211
219
  }
212
- let orgBonus = 0;
213
- if (agenda.organe && videoTitle) {
214
- const o = normalize(agenda.organe);
215
- const t = normalize(videoTitle);
216
- const first = o.split(" ").filter(Boolean)[0];
217
- if (first && t.includes(first))
218
- orgBonus = 0.15;
220
+ const orgScore = videoOrgane && agenda.organe ? dice(agenda.organe, videoOrgane) : 0;
221
+ if (orgScore === 0 && agenda.organe === "Séance publique") {
222
+ return 0.5 * titleScore + 0.5 * timeScore;
219
223
  }
220
- return 0.3 * titleScore + 0.7 * timeScore + orgBonus; // Can be adjusted
224
+ return 0.4 * titleScore + 0.3 * timeScore + orgScore * 0.3;
221
225
  }
222
226
  /**
223
227
  * Build search strategies for senat's videos
@@ -263,19 +267,16 @@ async function fetchAllSearchPages(args, baseDir, strategyIndex, maxPages = MAX_
263
267
  async function processGroupedReunion(agenda, session, dataDir) {
264
268
  if (!agenda)
265
269
  return;
266
- // 1) Garde-fous
270
+ // 1) GuardRails
267
271
  if (!agenda.captationVideo) {
268
- if (!options["silent"])
269
- console.log(`[skip] ${agenda.uid} captationVideo=false`);
272
+ // if (!options["silent"]) console.log(`[skip] ${agenda.uid} captationVideo=false`)
270
273
  return;
271
274
  }
272
275
  if (!agenda.date || !agenda.startTime) {
273
- if (!options["silent"])
274
- console.log(`[skip] ${agenda.uid} date/hour missing`);
276
+ // if (!options["silent"]) console.log(`[skip] ${agenda.uid} date/hour missing`)
275
277
  return;
276
278
  }
277
279
  STATS.total++;
278
- // 2) Dossier de sortie (utilise directement l'UID)
279
280
  const reunionUid = agenda.uid;
280
281
  const baseDir = path.join(dataDir, VIDEOS_ROOT_FOLDER, String(session), reunionUid);
281
282
  await fs.ensureDir(baseDir);
@@ -312,9 +313,29 @@ async function processGroupedReunion(agenda, session, dataDir) {
312
313
  if (!buf)
313
314
  continue;
314
315
  const meta = parseDataNvs(buf.toString("utf-8"));
315
- const s = score(agenda, agendaTs, c.title ?? meta.title, meta.epoch);
316
+ // If organes are different, go to next candidates
317
+ if (meta.organe && agenda.organe) {
318
+ const videoOrgNorm = normalize(meta.organe);
319
+ const agendaOrgNorm = normalize(agenda.organe);
320
+ if (dice(agendaOrgNorm, videoOrgNorm) < 0.5) {
321
+ continue;
322
+ }
323
+ }
324
+ let videoTitle = c.title;
325
+ if (c.isSeancePublique && meta.firstChapterLabel) {
326
+ videoTitle = meta.firstChapterLabel;
327
+ }
328
+ const s = score(agenda, agendaTs, videoTitle, meta.epoch, meta.organe);
316
329
  if (!best || s > best.score) {
317
- best = { id: c.id, hash: c.hash, pageUrl: c.pageUrl, epoch: meta.epoch, vtitle: c.title ?? meta.title, score: s };
330
+ best = {
331
+ id: c.id,
332
+ hash: c.hash,
333
+ pageUrl: c.pageUrl,
334
+ epoch: meta.epoch,
335
+ vtitle: videoTitle,
336
+ score: s,
337
+ vorgane: meta.organe,
338
+ };
318
339
  }
319
340
  }
320
341
  if (!best) {
@@ -326,7 +347,10 @@ async function processGroupedReunion(agenda, session, dataDir) {
326
347
  if (accepted)
327
348
  STATS.accepted++;
328
349
  if (!options["silent"]) {
329
- console.log(`[pick] ${agenda.uid} best id=${best.id} hash=${best.hash} score=${best.score.toFixed(2)} accepted=${accepted} (strategy=${usedStrategy})`);
350
+ console.log(`[pick] ${agenda.uid} score=${best.score.toFixed(2)}
351
+ agenda title="${agenda.titre ?? ""}" agenda organe="${agenda.organe ?? ""}"
352
+ best title="${best.vtitle ?? ""}" best organe="${best.vorgane ?? ""}"
353
+ accepted=${accepted} (strategy=${usedStrategy})`);
330
354
  }
331
355
  // ==== 3) Write metadata + NVS of the best candidate (always) ====
332
356
  const bestDt = best?.epoch ? epochToParisDateTime(best.epoch) : null;
@@ -1,11 +1,11 @@
1
1
  export interface AgendaEvent {
2
2
  id: string;
3
3
  type: string | null;
4
- date: string | null;
4
+ date: string;
5
5
  startTime: string | null;
6
6
  endTime: string | null;
7
7
  timeOriginal: string | null;
8
- titre: string | null;
8
+ titre: string;
9
9
  organe: string | null;
10
10
  objet: string | null;
11
11
  lieu: string | null;
@@ -349,12 +349,13 @@ export async function linkCRtoCommissionGroup(opts) {
349
349
  uid,
350
350
  chambre: "SN",
351
351
  date: dateISO,
352
- type: organeDetected ?? "Commissions",
352
+ type: "Commission",
353
353
  startTime: hourShortToStartTime(hourShort),
354
354
  endTime: null,
355
355
  captationVideo: false,
356
356
  titre: titreGuess ?? "",
357
- objet: "",
357
+ organe: organeDetected ?? "Commission",
358
+ objet: titreGuess ?? "",
358
359
  events: [],
359
360
  compteRenduRefUid: crUid,
360
361
  };
@@ -364,6 +365,7 @@ export async function linkCRtoCommissionGroup(opts) {
364
365
  else {
365
366
  group.compteRenduRefUid = crUid;
366
367
  updated = true;
368
+ console.log(`[AGENDA][COM] Updated group uid=${uid} for CR uid=${crUid}`);
367
369
  }
368
370
  // Lien CR
369
371
  // Enrichir depuis CR si vide
@@ -2,7 +2,7 @@ import { AgendaEvent, GroupedReunion, TimeSlot } from "../types/agenda";
2
2
  type KnownType = "SP" | "COM" | "MC" | "OD" | "ID";
3
3
  export declare function groupNonSPByTypeOrganeHour(events: AgendaEvent[]): Record<"IDC" | "IDM" | "IDO" | "IDI", GroupedReunion[]>;
4
4
  export declare function groupSeancePubliqueBySlot(events: AgendaEvent[]): Record<TimeSlot, GroupedReunion[]>;
5
- export declare function makeTypeGroupUid(dateISO: string, kind: KnownType, hourShort: string | null, organe?: string | null): string;
5
+ export declare function makeTypeGroupUid(dateISO: string, kind: KnownType, agendaEventId: string, organe?: string | null): string;
6
6
  export declare function makeGroupUid(date: string, slot: TimeSlot): string;
7
7
  export declare function formatYYYYMMDD(dateYYYYMMDD: string): string;
8
8
  export declare function makeReunionUid(agenda: AgendaEvent): string;
@@ -28,56 +28,27 @@ export function groupNonSPByTypeOrganeHour(events) {
28
28
  const nonSP = events.filter((e) => !isSeancePublique(e?.type));
29
29
  if (nonSP.length === 0)
30
30
  return out;
31
- const buckets = new Map();
32
31
  for (const e of nonSP) {
33
32
  const kind = classifyAgendaType(e?.type);
34
33
  if (!kind || kind === "SP")
35
34
  continue;
36
35
  const { startISO, endISO } = deriveTimesForEvent(e);
37
- const hourShort = hourShortFromISO(startISO) ?? hourShortFromOriginal(e.timeOriginal);
38
- const org = e.organe ?? "NA_ORG";
39
- const key = [e.date, kind, org, hourShort || "NA"].join("|");
40
- if (!buckets.has(key))
41
- buckets.set(key, []);
42
- buckets.get(key).push({ ...e, startTime: startISO ?? e.startTime, endTime: endISO ?? e.endTime });
43
- }
44
- for (const [key, list] of buckets) {
45
- const [date, kindStr, organe, hourShort] = key.split("|");
46
- const kind = kindStr;
47
- const enriched = list
48
- .map((ev) => {
49
- const { startISO, endISO } = deriveTimesForEvent(ev);
50
- return { ev, startISO: startISO ?? ev.startTime, endISO: endISO ?? ev.endTime };
51
- })
52
- .sort((a, b) => {
53
- const ta = a.startISO ? (parseISO(a.startISO)?.toMillis() ?? Number.MAX_SAFE_INTEGER) : Number.MAX_SAFE_INTEGER;
54
- const tb = b.startISO ? (parseISO(b.startISO)?.toMillis() ?? Number.MAX_SAFE_INTEGER) : Number.MAX_SAFE_INTEGER;
55
- return ta - tb;
56
- });
57
- const startTime = enriched.find((x) => !!x.startISO)?.startISO ?? null;
58
- const endTime = enriched.reduce((acc, x) => {
59
- const de = x.endISO ? parseISO(x.endISO)?.toMillis() : null;
60
- const accMs = acc ? parseISO(acc)?.toMillis() : null;
61
- if (de != null && (accMs == null || de > accMs))
62
- return x.endISO;
63
- return acc;
64
- }, null);
65
- const any = enriched[0]?.ev;
66
- const hour = hourShort !== "NA" ? hourShort : (hourShortFromISO(startTime) ?? hourShortFromOriginal(any?.timeOriginal));
67
- const uid = makeTypeGroupUid(date, kind, hour ?? "", any?.organe ?? "");
36
+ const startTime = startISO ?? e.startTime ?? null;
37
+ const endTime = endISO ?? e.endTime ?? null;
38
+ const uid = makeTypeGroupUid(e.date, kind, e.id, e.organe ?? null);
68
39
  const suffix = (kind === "COM" ? "IDC" : kind === "MC" ? "IDM" : kind === "OD" ? "IDO" : "IDI");
69
40
  const group = {
70
41
  uid,
71
42
  chambre: "SN",
72
- date,
73
- type: any?.type || "",
74
- organe: any?.organe || undefined,
43
+ date: e.date,
44
+ type: e.type || "",
45
+ organe: e.organe || undefined,
75
46
  startTime,
76
47
  endTime,
77
- captationVideo: enriched.some((x) => x.ev.captationVideo === true),
78
- titre: compactTitleList(enriched.map((x) => x.ev.titre || "").filter(Boolean), 8),
79
- objet: joinObjets(enriched.map((x) => x.ev)),
80
- events: enriched.map((x) => x.ev),
48
+ captationVideo: e.captationVideo === true,
49
+ titre: e.titre,
50
+ objet: e.objet || "",
51
+ events: [e],
81
52
  };
82
53
  out[suffix].push(group);
83
54
  }
@@ -270,12 +241,12 @@ function organeInitials(input, maxLen = 8) {
270
241
  const out = letters.join("");
271
242
  return out.slice(0, maxLen);
272
243
  }
273
/**
 * Build the UID of a non-"séance publique" agenda group.
 *
 * The UID is the concatenation of: the literal `RUSN`, the date formatted by
 * `formatYYYYMMDD` (or `00000000` when `dateISO` is empty), the suffix returned
 * by `typeToSuffixStrict(kind)`, the organe initials (omitted when no organe is
 * given) and the agenda event id.
 *
 * @param {string} dateISO - Date of the event; a falsy value yields `00000000`.
 * @param {string} kind - Agenda kind passed to `typeToSuffixStrict`.
 * @param {string} agendaEventId - Id of the agenda event, appended verbatim.
 * @param {string|null} [organe] - Organe name, shortened via `organeInitials`.
 * @returns {string} The group UID.
 */
export function makeTypeGroupUid(dateISO, kind, agendaEventId, organe) {
  const ymd = dateISO ? formatYYYYMMDD(dateISO) : "00000000";
  const suffix = typeToSuffixStrict(kind);
  const org = organe ? organeInitials(organe) : "";
  return `RUSN${ymd}${suffix}${org}${agendaEventId}`;
}
280
251
  function parseISO(isoLike) {
281
252
  if (!isoLike)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tricoteuses/senat",
3
- "version": "2.15.7",
3
+ "version": "2.16.0",
4
4
  "description": "Handle French Sénat's open data",
5
5
  "keywords": [
6
6
  "France",
@@ -1,9 +0,0 @@
1
- import { CompteRendu, Sommaire } from "../types/compte_rendu";
2
- import { TimeSlot } from "../types/agenda";
3
- export declare function parseCompteRenduSlotFromFile(xmlFilePath: string, wantedSlot: TimeSlot, firstSlotOfDay?: TimeSlot): Promise<CompteRendu | null>;
4
- export declare function sessionStartYearFromDate(d: Date): number;
5
- export declare function parseYYYYMMDD(yyyymmdd: string): Date | null;
6
- export declare function deriveTitreObjetFromSommaire(sommaire: Sommaire | undefined, slot: TimeSlot): {
7
- titre: string;
8
- objet: string;
9
- };
@@ -1,325 +0,0 @@
1
- import fs from "fs";
2
- import * as cheerio from "cheerio";
3
- import path from "path";
4
- import { computeIntervalsBySlot } from "../utils/cr_spliting";
5
- import { norm } from "./util";
6
- const asArray = (x) => x == null ? [] : Array.isArray(x) ? x : [x];
7
- const toInt = (s) => Number.isFinite(Number(s)) ? Number(s) : Number.POSITIVE_INFINITY;
8
- export async function parseCompteRenduSlotFromFile(xmlFilePath, wantedSlot, firstSlotOfDay) {
9
- try {
10
- const raw = fs.readFileSync(xmlFilePath, "utf8");
11
- const $ = cheerio.load(raw, { xml: false });
12
- const metadonnees = extractMetadonnees($, xmlFilePath);
13
- const order = $("body *").toArray();
14
- const idx = new Map(order.map((el, i) => [el, i]));
15
- const intervalsAll = computeIntervalsBySlot($, idx, firstSlotOfDay);
16
- const intervals = intervalsAll.filter(iv => iv.slot === wantedSlot);
17
- if (intervals.length === 0) {
18
- console.warn(`[CRI] no intervals for ${path.basename(xmlFilePath)} [${wantedSlot}]`);
19
- return null;
20
- }
21
- metadonnees.sommaire = extractSommaireForIntervals($, idx, intervals);
22
- const points = [];
23
- let ordre = 0;
24
- const addPoint = (p) => points.push({ ...p, ordre_absolu_seance: String(++ordre) });
25
- // Titles
26
- $("cri\\:titreS1 p.titre_S1").each((_, el) => {
27
- if (!elementInAnyInterval(el, idx, intervals))
28
- return;
29
- const t = normalizeTitle(norm($(el).text() || ""));
30
- if (t)
31
- addPoint({ code_grammaire: "TITRE_TEXTE_DISCUSSION", texte: { _: t }, code_style: "Titre" });
32
- });
33
- // Interventions
34
- $("div.intervenant").each((_, block) => {
35
- if (!elementInAnyInterval(block, idx, intervals))
36
- return;
37
- const $block = $(block);
38
- $block.find([
39
- "p[class^='titre_S']",
40
- "p.mention_titre",
41
- "p.intitule_titre",
42
- "p.mention_chapitre",
43
- "p.intitule_chapitre",
44
- "p.mention_article",
45
- "p.intitule_article",
46
- "p.mention_section",
47
- "p.intitule_section",
48
- ].join(",")).remove();
49
- const firstP = $block.find("p").first();
50
- const speakerLabelRaw = firstP.find(".orateur_nom").text() || firstP.find("a.lien_senfic").text() || "";
51
- const speakerLabel = dedupeSpeaker(speakerLabelRaw);
52
- const { mat, nom: nomCRI, qua: quaCRI } = readIntervenantMeta($block);
53
- const qualFromSpans = extractAndRemoveLeadingQualite($, $block);
54
- const qualite = norm(decodeHtmlEntities(quaCRI || "")) || qualFromSpans;
55
- const canonicalName = dedupeSpeaker(nomCRI || speakerLabel);
56
- const role = roleForSpeaker(speakerLabel) || roleForSpeaker(qualite) || roleForSpeaker(quaCRI || "");
57
- const speechHtml = sanitizeInterventionHtml($, $block);
58
- if (!norm(cheerio.load(speechHtml).text() || ""))
59
- return;
60
- addPoint({
61
- code_grammaire: "PAROLE_GENERIQUE",
62
- roledebat: role,
63
- orateurs: { orateur: { nom: canonicalName, id: mat || "", qualite } },
64
- texte: { _: speechHtml },
65
- });
66
- });
67
- const contenu = {
68
- quantiemes: { journee: metadonnees.dateSeance, session: metadonnees.session },
69
- point: points,
70
- };
71
- return {
72
- uid: "CRSSN" + xmlFilePath.replace(/^.*?(\d{8}).*$/i, "$1") + `-${wantedSlot}`,
73
- seanceRef: "RUSN" + xmlFilePath.replace(/^.*?(\d{8}).*$/i, "$1") + "IDS-" + wantedSlot,
74
- sessionRef: metadonnees.session,
75
- metadonnees,
76
- contenu,
77
- };
78
- }
79
- catch (e) {
80
- console.error(`[CRI] parseSlot error file=${xmlFilePath} slot=${wantedSlot}:`, e);
81
- return null;
82
- }
83
- }
84
- export function sessionStartYearFromDate(d) {
85
- // Session (1th oct N → 30 sept N+1)
86
- const m = d.getMonth();
87
- const y = d.getFullYear();
88
- return m >= 9 ? y : y - 1;
89
- }
90
- export function parseYYYYMMDD(yyyymmdd) {
91
- if (!/^\d{8}$/.test(yyyymmdd))
92
- return null;
93
- const y = Number(yyyymmdd.slice(0, 4));
94
- const m = Number(yyyymmdd.slice(4, 6)) - 1;
95
- const d = Number(yyyymmdd.slice(6, 8));
96
- const dt = new Date(y, m, d);
97
- return Number.isFinite(dt.getTime()) ? dt : null;
98
- }
99
- export function deriveTitreObjetFromSommaire(sommaire, slot) {
100
- const items = extractLevel1Items(sommaire);
101
- const meaningful = items.filter(it => !isBoilerplate(it.label));
102
- if (meaningful.length === 0) {
103
- return {
104
- titre: `Séance publique ${slotLabel(slot)}`,
105
- objet: "",
106
- };
107
- }
108
- const titre = meaningful[0].label;
109
- const objet = meaningful.slice(0, 3).map(it => it.label).join(" ; ");
110
- return { titre, objet };
111
- }
112
- function slotLabel(slot) {
113
- switch (slot) {
114
- case "MATIN": return "du matin";
115
- case "APRES-MIDI": return "de l’après-midi";
116
- case "SOIR": return "du soir";
117
- default: return "";
118
- }
119
- }
120
- const BOILERPLATE_PATTERNS = [
121
- /proc(?:è|e)s-?verbal/i,
122
- /hommages?/i,
123
- /désignation des vice-?président/i,
124
- /candidatures? aux?/i,
125
- /ordre du jour/i,
126
- /rappels? au règlement/i,
127
- /communications?/i,
128
- /dépôts?/i,
129
- /proclamation/i,
130
- /présidence de/i,
131
- /questions? diverses?/i,
132
- /ouverture de la séance/i,
133
- /clo(?:t|̂)ure de la séance/i,
134
- ];
135
- const isBoilerplate = (label) => !label?.trim() || BOILERPLATE_PATTERNS.some(rx => rx.test(label));
136
- function extractLevel1Items(sommaire) {
137
- const level1 = asArray(sommaire?.sommaire1);
138
- return level1
139
- .map(el => ({
140
- numero: toInt(el?.valeur_pts_odj),
141
- label: String(el?.titreStruct?.intitule ?? "").trim(),
142
- }))
143
- .filter(it => !!it.label)
144
- .sort((a, b) => a.numero - b.numero);
145
- }
146
- function stripTrailingPunct(s) { return s.replace(/\s*([:,.;])\s*$/u, "").trim(); }
147
- function dedupeSpeaker(raw) {
148
- let s = norm(raw);
149
- s = stripTrailingPunct(s);
150
- const dupPatterns = [/^(.+?)\s*[.]\s*\1$/u, /^(.+?)\s*,\s*\1,?$/u, /^(.+?)\s+\1$/u];
151
- for (const re of dupPatterns) {
152
- const m = s.match(re);
153
- if (m) {
154
- s = m[1];
155
- break;
156
- }
157
- }
158
- return s.replace(/\.\s*$/, "");
159
- }
160
- function decodeHtmlEntities(s) {
161
- return s.replace(/&#(\d+);/g, (_, d) => String.fromCharCode(parseInt(d, 10)))
162
- .replace(/&#x([0-9a-fA-F]+);/g, (_, h) => String.fromCharCode(parseInt(h, 16)));
163
- }
164
- function fixApostrophes(s) {
165
- let out = s;
166
- out = out.replace(/\s*’\s*/g, "’");
167
- out = out.replace(/\b([dljctmsn])\s*’/gi, (_, m) => m + "’");
168
- out = out.replace(/’\s+([A-Za-zÀ-ÖØ-öø-ÿ])/g, "’$1");
169
- out = out.replace(/\s+([,;:.!?])/g, "$1");
170
- return out;
171
- }
172
- function normalizeTitle(text) { return text.replace(/^PR[ÉE]SIDENCE DE\b/i, "Présidence de "); }
173
- function roleForSpeaker(labelOrQualite) {
174
- const s = (labelOrQualite || "").toLowerCase();
175
- if (/^(m\.|mme)?\s*(le|la)\s+pr[ée]sident(e)?\b/.test(s) || /\bpr[ée]sident[e]?\s+de\s+séance\b/.test(s))
176
- return "président";
177
- return "";
178
- }
179
- function readIntervenantMeta($block) {
180
- const int = $block.find('cri\\:intervenant').first();
181
- if (int.length)
182
- return { mat: int.attr("mat") || undefined, nom: int.attr("nom") || undefined, qua: int.attr("qua") || undefined };
183
- const html = $block.html() || "";
184
- const m = html.match(/<!--\s*cri:intervenant\b([^>]+)-->/i);
185
- if (!m)
186
- return {};
187
- const out = {};
188
- const re = /(\w+)="([^"]*)"/g;
189
- let a;
190
- while ((a = re.exec(m[1])))
191
- out[a[1]] = decodeHtmlEntities(a[2]);
192
- return { mat: out["mat"], nom: out["nom"], qua: out["qua"] };
193
- }
194
- function extractAndRemoveLeadingQualite($, $block) {
195
- const firstP = $block.find("p").first();
196
- if (firstP.length === 0)
197
- return "";
198
- const parts = [];
199
- let stop = false;
200
- firstP.contents().each((_, node) => {
201
- if (stop)
202
- return;
203
- if (node.type === "tag") {
204
- const $node = $(node);
205
- if ($node.hasClass("orateur_nom")) {
206
- $node.remove();
207
- return;
208
- }
209
- if ($node.hasClass("orateur_qualite")) {
210
- parts.push($node.text() || "");
211
- $node.remove();
212
- return;
213
- }
214
- const t = norm($node.text() || "");
215
- if (t)
216
- stop = true;
217
- else
218
- $node.remove();
219
- }
220
- else if (node.type === "text") {
221
- const t = norm(node.data || "");
222
- if (!t || /^[:.,;–—-]+$/.test(t)) {
223
- node.data = "";
224
- return;
225
- }
226
- stop = true;
227
- }
228
- });
229
- return fixApostrophes(norm(parts.join(" ")));
230
- }
231
- function sanitizeInterventionHtml($, $block) {
232
- const $clone = $block.clone();
233
- $clone.find('a[name]').remove();
234
- $clone.find('div[align="right"]').remove();
235
- $clone.find('a.link').remove();
236
- $clone.find('img').remove();
237
- $clone.find('a#ameli_amendement_cri_phrase, a#ameli_amendement_cra_contenu, a#ameli_amendement_cra_objet').remove();
238
- $clone.find(".orateur_nom, .orateur_qualite").remove();
239
- let html = $clone.html() || "";
240
- html = html.replace(/<!--[\s\S]*?-->/g, "");
241
- return html.trim();
242
- }
243
- function extractSommaireForIntervals($, idx, intervals) {
244
- const inIv = (el) => elementInAnyInterval(el, idx, intervals);
245
- const root = $("body");
246
- const sommaire = { presidentSeance: { _: "" }, sommaire1: [] };
247
- // (1) Présidence (tm2) — première ligne dans l’intervalle
248
- const pres = root.find("p.tm2").filter((_, el) => inIv(el)).first();
249
- if (pres.length)
250
- sommaire.presidentSeance = { _: norm(pres.text()) };
251
- // (2) Paras tm5 présents dans l’intervalle
252
- const paras = [];
253
- root.find("p.tm5").each((_, el) => {
254
- if (!inIv(el))
255
- return;
256
- const t = norm($(el).text());
257
- if (t)
258
- paras.push({ _: t });
259
- });
260
- if (paras.length)
261
- sommaire.para = paras.length === 1 ? paras[0] : paras;
262
- // (3) Items de 1er niveau (tm3) présents dans l’intervalle
263
- const items = [];
264
- root.find("p.tm3").each((_, el) => {
265
- if (!inIv(el))
266
- return;
267
- const $p = $(el);
268
- const full = norm($p.text() || "");
269
- if (!full)
270
- return;
271
- const numMatch = full.match(/^(\d+)\s*[.\-–—]\s*/);
272
- const valeur = numMatch ? numMatch[1] : undefined;
273
- // prefere intitule in ancre <a> if present
274
- const a = $p.find("a").first();
275
- const intituleRaw = a.length ? a.text() : full.replace(/^(\d+)\s*[.\-–—]\s*/, "");
276
- const intitule = norm(intituleRaw);
277
- // id_syceron from href="#Niv1_SOMx"
278
- const href = (a.attr("href") || "").trim();
279
- const idSyceron = href.startsWith("#") ? href.slice(1) : href;
280
- const titreStruct = { id_syceron: idSyceron || "", intitule };
281
- items.push({ valeur_pts_odj: valeur, titreStruct });
282
- });
283
- if (items.length)
284
- sommaire.sommaire1 = items;
285
- return sommaire;
286
- }
287
- function extractMetadonnees($, filePath) {
288
- let dateText = norm($("h1, h2, .page-title").first().text() || "");
289
- if (!dateText)
290
- dateText = norm($("p").first().text() || "");
291
- const dateMatch = dateText.match(/\b(\d{1,2}\s+\w+\s+\d{4})\b/i);
292
- const allText = norm($("body").text() || "");
293
- const sessionMatch = allText.match(/\bsession\s+(\d{4}-\d{4})\b/i);
294
- let dateSeance = dateMatch?.[1] || "";
295
- if (!dateSeance) {
296
- const m = filePath.match(/d(\d{4})(\d{2})(\d{2})\.xml$/i);
297
- if (m)
298
- dateSeance = `${m[1]}-${m[2]}-${m[3]}`;
299
- }
300
- return {
301
- dateSeance,
302
- dateSeanceJour: dateSeance,
303
- numSeanceJour: "",
304
- numSeance: "",
305
- typeAssemblee: "SN",
306
- legislature: "",
307
- session: sessionMatch?.[1] || "",
308
- nomFichierJo: "",
309
- validite: "",
310
- etat: "",
311
- diffusion: "",
312
- version: "1.0",
313
- environnement: "",
314
- heureGeneration: new Date()
315
- };
316
- }
317
- function elementInAnyInterval(el, idx, intervals) {
318
- const p = idx.get(el);
319
- if (p == null)
320
- return false;
321
- for (const iv of intervals)
322
- if (p >= iv.start && p < iv.end)
323
- return true;
324
- return false;
325
- }