@tricoteuses/senat 2.15.6 → 2.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/datasets.js +0 -1
- package/lib/model/agenda.js +9 -16
- package/lib/model/commission.d.ts +9 -1
- package/lib/model/commission.js +47 -33
- package/lib/model/scrutins.js +4 -3
- package/lib/model/seance.js +1 -6
- package/lib/model/util.d.ts +3 -0
- package/lib/model/util.js +32 -0
- package/lib/scripts/retrieve_cr_commission.js +90 -72
- package/lib/scripts/retrieve_videos.d.ts +0 -2
- package/lib/scripts/retrieve_videos.js +57 -33
- package/lib/types/agenda.d.ts +2 -2
- package/lib/utils/cr_spliting.js +4 -2
- package/lib/utils/reunion_grouping.d.ts +1 -1
- package/lib/utils/reunion_grouping.js +13 -42
- package/package.json +1 -1
- package/lib/model/compte_rendu.d.ts +0 -9
- package/lib/model/compte_rendu.js +0 -325
- package/lib/raw_types/db.d.ts +0 -11389
- package/lib/raw_types/db.js +0 -5
- package/lib/scripts/retrieve_comptes_rendus.d.ts +0 -6
- package/lib/scripts/retrieve_comptes_rendus.js +0 -274
|
@@ -7,8 +7,9 @@ import path from "path";
|
|
|
7
7
|
import { AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, iterLoadSenatAgendas } from "../loaders";
|
|
8
8
|
import { getSessionsFromStart } from "../types/sessions";
|
|
9
9
|
import { commonOptions } from "./shared/cli_helpers";
|
|
10
|
+
import { decodeHtmlEntities } from "../model/util";
|
|
10
11
|
// ===================== Constants =====================
|
|
11
|
-
const MATCH_THRESHOLD = 0.
|
|
12
|
+
const MATCH_THRESHOLD = 0.56;
|
|
12
13
|
const MAX_CANDIDATES = 15;
|
|
13
14
|
const MAX_PAGES = 3;
|
|
14
15
|
const STATS = { total: 0, accepted: 0 };
|
|
@@ -145,7 +146,9 @@ function extractCandidatesFromSearchHtml(html) {
|
|
|
145
146
|
const pageUrl = `https://videos.senat.fr/video.${id}_${hash}.html`;
|
|
146
147
|
const ctx = html.slice(Math.max(0, m.index - 240), Math.min(html.length, m.index + 240));
|
|
147
148
|
const t = ctx.match(/title="([^"]+)"/i) || ctx.match(/>([^<]{10,200})</);
|
|
148
|
-
|
|
149
|
+
const title = t?.[1]?.trim();
|
|
150
|
+
const isSeancePublique = title?.toLowerCase().includes("séance publique") ?? false;
|
|
151
|
+
out.push({ id, hash, pageUrl, title, isSeancePublique });
|
|
149
152
|
}
|
|
150
153
|
const seen = new Set();
|
|
151
154
|
return out.filter((c) => {
|
|
@@ -157,20 +160,28 @@ function extractCandidatesFromSearchHtml(html) {
|
|
|
157
160
|
});
|
|
158
161
|
}
|
|
159
162
|
function parseDataNvs(nvs) {
|
|
160
|
-
const
|
|
161
|
-
const
|
|
162
|
-
|
|
163
|
+
const epochStr = nvs.match(/<metadata\s+name="date"\s+value="(\d+)"/i)?.[1];
|
|
164
|
+
const epoch = epochStr ? Number(epochStr) : undefined;
|
|
165
|
+
const organesTag = nvs.match(/<metadata\b[^>]*\bname="organes"[^>]*>/i)?.[0];
|
|
166
|
+
let organeLabel;
|
|
167
|
+
let organeValue;
|
|
168
|
+
if (organesTag) {
|
|
169
|
+
organeLabel = organesTag.match(/\blabel="([^"]+)"/i)?.[1];
|
|
170
|
+
organeValue = organesTag.match(/\bvalue="([^"]+)"/i)?.[1];
|
|
171
|
+
}
|
|
172
|
+
const organeRaw = organeLabel ?? organeValue;
|
|
173
|
+
const organe = decodeHtmlEntities(organeRaw)?.trim();
|
|
174
|
+
const firstChapterLabel = decodeHtmlEntities(nvs.match(/<chapter\b[^>]*\blabel="([^"]+)"/i)[1]).trim();
|
|
175
|
+
return { epoch, organe, firstChapterLabel };
|
|
163
176
|
}
|
|
164
|
-
|
|
165
|
-
//
|
|
166
|
-
export function buildSenatVodMasterM3u8FromNvs(nvsText, finalText) {
|
|
167
|
-
// 1) Base Akamai depuis data.nvs (mp4 "serverfiles://senat/YYYY/MM/encoderX_YYYYMMDDHHMMSS_1.mp4")
|
|
177
|
+
function buildSenatVodMasterM3u8FromNvs(nvsText, finalText) {
|
|
178
|
+
// 1) Base Akamai from data.nvs (mp4 "serverfiles://senat/YYYY/MM/encoderX_YYYYMMDDHHMMSS_1.mp4")
|
|
168
179
|
const baseMatch = nvsText.match(/serverfiles:\/\/senat\/(\d{4})\/(\d{2})\/(encoder\d)_(\d{14})/i);
|
|
169
180
|
if (!baseMatch)
|
|
170
181
|
return null;
|
|
171
182
|
const [, yyyy, mm, encoder, stamp] = baseMatch;
|
|
172
183
|
const base = `https://vodsenat.akamaized.net/senat/${yyyy}/${mm}/${encoder}_${stamp}`;
|
|
173
|
-
// 2) start/end
|
|
184
|
+
// 2) start/end from finalplayer.nvs
|
|
174
185
|
let start = null, end = null;
|
|
175
186
|
const playerAttr = finalText.match(/player[^>]*\bstarttime="(\d+)"[^>]*\bendtime="(\d+)"/i);
|
|
176
187
|
if (playerAttr) {
|
|
@@ -178,12 +189,11 @@ export function buildSenatVodMasterM3u8FromNvs(nvsText, finalText) {
|
|
|
178
189
|
end = parseInt(playerAttr[2], 10);
|
|
179
190
|
}
|
|
180
191
|
else {
|
|
181
|
-
// fallback:
|
|
192
|
+
// fallback: take smallest timecode of <synchro timecode="...">
|
|
182
193
|
const tc = Array.from(finalText.matchAll(/timecode="(\d+)"/g)).map((m) => parseInt(m[1], 10));
|
|
183
194
|
if (tc.length)
|
|
184
195
|
start = Math.min(...tc);
|
|
185
196
|
}
|
|
186
|
-
// 3) si pas d'end, on peut déduire via "duree" (en secondes) de data.nvs
|
|
187
197
|
if (end == null) {
|
|
188
198
|
const durMeta = nvsText.match(/<metadata[^>]*\bname="duree"[^>]*\bvalue="(\d+)"[^>]*>/i);
|
|
189
199
|
if (durMeta && start != null) {
|
|
@@ -191,16 +201,14 @@ export function buildSenatVodMasterM3u8FromNvs(nvsText, finalText) {
|
|
|
191
201
|
end = start + durMs;
|
|
192
202
|
}
|
|
193
203
|
}
|
|
194
|
-
// 4)
|
|
195
|
-
// - si on a start & end → utiliser ps/pd (robuste et conforme à ce que sert le Sénat)
|
|
196
|
-
// - sinon fallback sans suffixe (souvent valide aussi)
|
|
204
|
+
// 4) Construct URL
|
|
197
205
|
if (start != null && end != null && end > start) {
|
|
198
206
|
const pd = end - start;
|
|
199
207
|
return `${base}_ps${start}_pd${pd}.smil/master.m3u8`;
|
|
200
208
|
}
|
|
201
209
|
return `${base}.smil/master.m3u8`;
|
|
202
210
|
}
|
|
203
|
-
|
|
211
|
+
function score(agenda, agendaTs, videoTitle, videoEpoch, videoOrgane) {
|
|
204
212
|
const titleScore = dice(agenda.titre || "", videoTitle || "");
|
|
205
213
|
let timeScore = 0;
|
|
206
214
|
if (agendaTs && videoEpoch) {
|
|
@@ -209,15 +217,11 @@ export function score(agenda, agendaTs, videoTitle, videoEpoch) {
|
|
|
209
217
|
// delta : 180min
|
|
210
218
|
timeScore = Math.max(0, 1 - deltaMin / 180);
|
|
211
219
|
}
|
|
212
|
-
|
|
213
|
-
if (agenda.organe
|
|
214
|
-
|
|
215
|
-
const t = normalize(videoTitle);
|
|
216
|
-
const first = o.split(" ").filter(Boolean)[0];
|
|
217
|
-
if (first && t.includes(first))
|
|
218
|
-
orgBonus = 0.15;
|
|
220
|
+
const orgScore = videoOrgane && agenda.organe ? dice(agenda.organe, videoOrgane) : 0;
|
|
221
|
+
if (orgScore === 0 && agenda.organe === "Séance publique") {
|
|
222
|
+
return 0.5 * titleScore + 0.5 * timeScore;
|
|
219
223
|
}
|
|
220
|
-
return 0.
|
|
224
|
+
return 0.4 * titleScore + 0.3 * timeScore + orgScore * 0.3;
|
|
221
225
|
}
|
|
222
226
|
/**
|
|
223
227
|
* Build search strategies for senat's videos
|
|
@@ -263,19 +267,16 @@ async function fetchAllSearchPages(args, baseDir, strategyIndex, maxPages = MAX_
|
|
|
263
267
|
async function processGroupedReunion(agenda, session, dataDir) {
|
|
264
268
|
if (!agenda)
|
|
265
269
|
return;
|
|
266
|
-
// 1)
|
|
270
|
+
// 1) GuardRails
|
|
267
271
|
if (!agenda.captationVideo) {
|
|
268
|
-
if (!options["silent"])
|
|
269
|
-
console.log(`[skip] ${agenda.uid} captationVideo=false`);
|
|
272
|
+
// if (!options["silent"]) console.log(`[skip] ${agenda.uid} captationVideo=false`)
|
|
270
273
|
return;
|
|
271
274
|
}
|
|
272
275
|
if (!agenda.date || !agenda.startTime) {
|
|
273
|
-
if (!options["silent"])
|
|
274
|
-
console.log(`[skip] ${agenda.uid} date/hour missing`);
|
|
276
|
+
// if (!options["silent"]) console.log(`[skip] ${agenda.uid} date/hour missing`)
|
|
275
277
|
return;
|
|
276
278
|
}
|
|
277
279
|
STATS.total++;
|
|
278
|
-
// 2) Dossier de sortie (utilise directement l'UID)
|
|
279
280
|
const reunionUid = agenda.uid;
|
|
280
281
|
const baseDir = path.join(dataDir, VIDEOS_ROOT_FOLDER, String(session), reunionUid);
|
|
281
282
|
await fs.ensureDir(baseDir);
|
|
@@ -312,9 +313,29 @@ async function processGroupedReunion(agenda, session, dataDir) {
|
|
|
312
313
|
if (!buf)
|
|
313
314
|
continue;
|
|
314
315
|
const meta = parseDataNvs(buf.toString("utf-8"));
|
|
315
|
-
|
|
316
|
+
// If organes are different, go to next candidates
|
|
317
|
+
if (meta.organe && agenda.organe) {
|
|
318
|
+
const videoOrgNorm = normalize(meta.organe);
|
|
319
|
+
const agendaOrgNorm = normalize(agenda.organe);
|
|
320
|
+
if (dice(agendaOrgNorm, videoOrgNorm) < 0.5) {
|
|
321
|
+
continue;
|
|
322
|
+
}
|
|
323
|
+
}
|
|
324
|
+
let videoTitle = c.title;
|
|
325
|
+
if (c.isSeancePublique && meta.firstChapterLabel) {
|
|
326
|
+
videoTitle = meta.firstChapterLabel;
|
|
327
|
+
}
|
|
328
|
+
const s = score(agenda, agendaTs, videoTitle, meta.epoch, meta.organe);
|
|
316
329
|
if (!best || s > best.score) {
|
|
317
|
-
best = {
|
|
330
|
+
best = {
|
|
331
|
+
id: c.id,
|
|
332
|
+
hash: c.hash,
|
|
333
|
+
pageUrl: c.pageUrl,
|
|
334
|
+
epoch: meta.epoch,
|
|
335
|
+
vtitle: videoTitle,
|
|
336
|
+
score: s,
|
|
337
|
+
vorgane: meta.organe,
|
|
338
|
+
};
|
|
318
339
|
}
|
|
319
340
|
}
|
|
320
341
|
if (!best) {
|
|
@@ -326,7 +347,10 @@ async function processGroupedReunion(agenda, session, dataDir) {
|
|
|
326
347
|
if (accepted)
|
|
327
348
|
STATS.accepted++;
|
|
328
349
|
if (!options["silent"]) {
|
|
329
|
-
console.log(`[pick] ${agenda.uid}
|
|
350
|
+
console.log(`[pick] ${agenda.uid} score=${best.score.toFixed(2)}
|
|
351
|
+
agenda title="${agenda.titre ?? ""}" agenda organe="${agenda.organe ?? ""}"
|
|
352
|
+
best title="${best.vtitle ?? ""}" best organe="${best.vorgane ?? ""}"
|
|
353
|
+
accepted=${accepted} (strategy=${usedStrategy})`);
|
|
330
354
|
}
|
|
331
355
|
// ==== 3) Write metadata + NVS of the best candidate (always) ====
|
|
332
356
|
const bestDt = best?.epoch ? epochToParisDateTime(best.epoch) : null;
|
package/lib/types/agenda.d.ts
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
export interface AgendaEvent {
|
|
2
2
|
id: string;
|
|
3
3
|
type: string | null;
|
|
4
|
-
date: string
|
|
4
|
+
date: string;
|
|
5
5
|
startTime: string | null;
|
|
6
6
|
endTime: string | null;
|
|
7
7
|
timeOriginal: string | null;
|
|
8
|
-
titre: string
|
|
8
|
+
titre: string;
|
|
9
9
|
organe: string | null;
|
|
10
10
|
objet: string | null;
|
|
11
11
|
lieu: string | null;
|
package/lib/utils/cr_spliting.js
CHANGED
|
@@ -349,12 +349,13 @@ export async function linkCRtoCommissionGroup(opts) {
|
|
|
349
349
|
uid,
|
|
350
350
|
chambre: "SN",
|
|
351
351
|
date: dateISO,
|
|
352
|
-
type:
|
|
352
|
+
type: "Commission",
|
|
353
353
|
startTime: hourShortToStartTime(hourShort),
|
|
354
354
|
endTime: null,
|
|
355
355
|
captationVideo: false,
|
|
356
356
|
titre: titreGuess ?? "",
|
|
357
|
-
|
|
357
|
+
organe: organeDetected ?? "Commission",
|
|
358
|
+
objet: titreGuess ?? "",
|
|
358
359
|
events: [],
|
|
359
360
|
compteRenduRefUid: crUid,
|
|
360
361
|
};
|
|
@@ -364,6 +365,7 @@ export async function linkCRtoCommissionGroup(opts) {
|
|
|
364
365
|
else {
|
|
365
366
|
group.compteRenduRefUid = crUid;
|
|
366
367
|
updated = true;
|
|
368
|
+
console.log(`[AGENDA][COM] Updated group uid=${uid} for CR uid=${crUid}`);
|
|
367
369
|
}
|
|
368
370
|
// Lien CR
|
|
369
371
|
// Enrichir depuis CR si vide
|
|
@@ -2,7 +2,7 @@ import { AgendaEvent, GroupedReunion, TimeSlot } from "../types/agenda";
|
|
|
2
2
|
type KnownType = "SP" | "COM" | "MC" | "OD" | "ID";
|
|
3
3
|
export declare function groupNonSPByTypeOrganeHour(events: AgendaEvent[]): Record<"IDC" | "IDM" | "IDO" | "IDI", GroupedReunion[]>;
|
|
4
4
|
export declare function groupSeancePubliqueBySlot(events: AgendaEvent[]): Record<TimeSlot, GroupedReunion[]>;
|
|
5
|
-
export declare function makeTypeGroupUid(dateISO: string, kind: KnownType,
|
|
5
|
+
export declare function makeTypeGroupUid(dateISO: string, kind: KnownType, agendaEventId: string, organe?: string | null): string;
|
|
6
6
|
export declare function makeGroupUid(date: string, slot: TimeSlot): string;
|
|
7
7
|
export declare function formatYYYYMMDD(dateYYYYMMDD: string): string;
|
|
8
8
|
export declare function makeReunionUid(agenda: AgendaEvent): string;
|
|
@@ -28,56 +28,27 @@ export function groupNonSPByTypeOrganeHour(events) {
|
|
|
28
28
|
const nonSP = events.filter((e) => !isSeancePublique(e?.type));
|
|
29
29
|
if (nonSP.length === 0)
|
|
30
30
|
return out;
|
|
31
|
-
const buckets = new Map();
|
|
32
31
|
for (const e of nonSP) {
|
|
33
32
|
const kind = classifyAgendaType(e?.type);
|
|
34
33
|
if (!kind || kind === "SP")
|
|
35
34
|
continue;
|
|
36
35
|
const { startISO, endISO } = deriveTimesForEvent(e);
|
|
37
|
-
const
|
|
38
|
-
const
|
|
39
|
-
const
|
|
40
|
-
if (!buckets.has(key))
|
|
41
|
-
buckets.set(key, []);
|
|
42
|
-
buckets.get(key).push({ ...e, startTime: startISO ?? e.startTime, endTime: endISO ?? e.endTime });
|
|
43
|
-
}
|
|
44
|
-
for (const [key, list] of buckets) {
|
|
45
|
-
const [date, kindStr, organe, hourShort] = key.split("|");
|
|
46
|
-
const kind = kindStr;
|
|
47
|
-
const enriched = list
|
|
48
|
-
.map((ev) => {
|
|
49
|
-
const { startISO, endISO } = deriveTimesForEvent(ev);
|
|
50
|
-
return { ev, startISO: startISO ?? ev.startTime, endISO: endISO ?? ev.endTime };
|
|
51
|
-
})
|
|
52
|
-
.sort((a, b) => {
|
|
53
|
-
const ta = a.startISO ? (parseISO(a.startISO)?.toMillis() ?? Number.MAX_SAFE_INTEGER) : Number.MAX_SAFE_INTEGER;
|
|
54
|
-
const tb = b.startISO ? (parseISO(b.startISO)?.toMillis() ?? Number.MAX_SAFE_INTEGER) : Number.MAX_SAFE_INTEGER;
|
|
55
|
-
return ta - tb;
|
|
56
|
-
});
|
|
57
|
-
const startTime = enriched.find((x) => !!x.startISO)?.startISO ?? null;
|
|
58
|
-
const endTime = enriched.reduce((acc, x) => {
|
|
59
|
-
const de = x.endISO ? parseISO(x.endISO)?.toMillis() : null;
|
|
60
|
-
const accMs = acc ? parseISO(acc)?.toMillis() : null;
|
|
61
|
-
if (de != null && (accMs == null || de > accMs))
|
|
62
|
-
return x.endISO;
|
|
63
|
-
return acc;
|
|
64
|
-
}, null);
|
|
65
|
-
const any = enriched[0]?.ev;
|
|
66
|
-
const hour = hourShort !== "NA" ? hourShort : (hourShortFromISO(startTime) ?? hourShortFromOriginal(any?.timeOriginal));
|
|
67
|
-
const uid = makeTypeGroupUid(date, kind, hour ?? "", any?.organe ?? "");
|
|
36
|
+
const startTime = startISO ?? e.startTime ?? null;
|
|
37
|
+
const endTime = endISO ?? e.endTime ?? null;
|
|
38
|
+
const uid = makeTypeGroupUid(e.date, kind, e.id, e.organe ?? null);
|
|
68
39
|
const suffix = (kind === "COM" ? "IDC" : kind === "MC" ? "IDM" : kind === "OD" ? "IDO" : "IDI");
|
|
69
40
|
const group = {
|
|
70
41
|
uid,
|
|
71
42
|
chambre: "SN",
|
|
72
|
-
date,
|
|
73
|
-
type:
|
|
74
|
-
organe:
|
|
43
|
+
date: e.date,
|
|
44
|
+
type: e.type || "",
|
|
45
|
+
organe: e.organe || undefined,
|
|
75
46
|
startTime,
|
|
76
47
|
endTime,
|
|
77
|
-
captationVideo:
|
|
78
|
-
titre:
|
|
79
|
-
objet:
|
|
80
|
-
events:
|
|
48
|
+
captationVideo: e.captationVideo === true,
|
|
49
|
+
titre: e.titre,
|
|
50
|
+
objet: e.objet || "",
|
|
51
|
+
events: [e],
|
|
81
52
|
};
|
|
82
53
|
out[suffix].push(group);
|
|
83
54
|
}
|
|
@@ -270,12 +241,12 @@ function organeInitials(input, maxLen = 8) {
|
|
|
270
241
|
const out = letters.join("");
|
|
271
242
|
return out.slice(0, maxLen);
|
|
272
243
|
}
|
|
273
|
-
export function makeTypeGroupUid(dateISO, kind,
|
|
244
|
+
export function makeTypeGroupUid(dateISO, kind, agendaEventId, organe) {
|
|
274
245
|
const ymd = dateISO ? formatYYYYMMDD(dateISO) : "00000000";
|
|
275
246
|
const suffix = typeToSuffixStrict(kind);
|
|
276
|
-
const hh = hourShort ?? "NA";
|
|
277
247
|
const org = organe ? organeInitials(organe) : "";
|
|
278
|
-
|
|
248
|
+
let base = `RUSN${ymd}${suffix}${org ? org : ""}${agendaEventId}`;
|
|
249
|
+
return base;
|
|
279
250
|
}
|
|
280
251
|
function parseISO(isoLike) {
|
|
281
252
|
if (!isoLike)
|
package/package.json
CHANGED
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
import { CompteRendu, Sommaire } from "../types/compte_rendu";
|
|
2
|
-
import { TimeSlot } from "../types/agenda";
|
|
3
|
-
export declare function parseCompteRenduSlotFromFile(xmlFilePath: string, wantedSlot: TimeSlot, firstSlotOfDay?: TimeSlot): Promise<CompteRendu | null>;
|
|
4
|
-
export declare function sessionStartYearFromDate(d: Date): number;
|
|
5
|
-
export declare function parseYYYYMMDD(yyyymmdd: string): Date | null;
|
|
6
|
-
export declare function deriveTitreObjetFromSommaire(sommaire: Sommaire | undefined, slot: TimeSlot): {
|
|
7
|
-
titre: string;
|
|
8
|
-
objet: string;
|
|
9
|
-
};
|
|
@@ -1,325 +0,0 @@
|
|
|
1
|
-
import fs from "fs";
|
|
2
|
-
import * as cheerio from "cheerio";
|
|
3
|
-
import path from "path";
|
|
4
|
-
import { computeIntervalsBySlot } from "../utils/cr_spliting";
|
|
5
|
-
import { norm } from "./util";
|
|
6
|
-
const asArray = (x) => x == null ? [] : Array.isArray(x) ? x : [x];
|
|
7
|
-
const toInt = (s) => Number.isFinite(Number(s)) ? Number(s) : Number.POSITIVE_INFINITY;
|
|
8
|
-
export async function parseCompteRenduSlotFromFile(xmlFilePath, wantedSlot, firstSlotOfDay) {
|
|
9
|
-
try {
|
|
10
|
-
const raw = fs.readFileSync(xmlFilePath, "utf8");
|
|
11
|
-
const $ = cheerio.load(raw, { xml: false });
|
|
12
|
-
const metadonnees = extractMetadonnees($, xmlFilePath);
|
|
13
|
-
const order = $("body *").toArray();
|
|
14
|
-
const idx = new Map(order.map((el, i) => [el, i]));
|
|
15
|
-
const intervalsAll = computeIntervalsBySlot($, idx, firstSlotOfDay);
|
|
16
|
-
const intervals = intervalsAll.filter(iv => iv.slot === wantedSlot);
|
|
17
|
-
if (intervals.length === 0) {
|
|
18
|
-
console.warn(`[CRI] no intervals for ${path.basename(xmlFilePath)} [${wantedSlot}]`);
|
|
19
|
-
return null;
|
|
20
|
-
}
|
|
21
|
-
metadonnees.sommaire = extractSommaireForIntervals($, idx, intervals);
|
|
22
|
-
const points = [];
|
|
23
|
-
let ordre = 0;
|
|
24
|
-
const addPoint = (p) => points.push({ ...p, ordre_absolu_seance: String(++ordre) });
|
|
25
|
-
// Titles
|
|
26
|
-
$("cri\\:titreS1 p.titre_S1").each((_, el) => {
|
|
27
|
-
if (!elementInAnyInterval(el, idx, intervals))
|
|
28
|
-
return;
|
|
29
|
-
const t = normalizeTitle(norm($(el).text() || ""));
|
|
30
|
-
if (t)
|
|
31
|
-
addPoint({ code_grammaire: "TITRE_TEXTE_DISCUSSION", texte: { _: t }, code_style: "Titre" });
|
|
32
|
-
});
|
|
33
|
-
// Interventions
|
|
34
|
-
$("div.intervenant").each((_, block) => {
|
|
35
|
-
if (!elementInAnyInterval(block, idx, intervals))
|
|
36
|
-
return;
|
|
37
|
-
const $block = $(block);
|
|
38
|
-
$block.find([
|
|
39
|
-
"p[class^='titre_S']",
|
|
40
|
-
"p.mention_titre",
|
|
41
|
-
"p.intitule_titre",
|
|
42
|
-
"p.mention_chapitre",
|
|
43
|
-
"p.intitule_chapitre",
|
|
44
|
-
"p.mention_article",
|
|
45
|
-
"p.intitule_article",
|
|
46
|
-
"p.mention_section",
|
|
47
|
-
"p.intitule_section",
|
|
48
|
-
].join(",")).remove();
|
|
49
|
-
const firstP = $block.find("p").first();
|
|
50
|
-
const speakerLabelRaw = firstP.find(".orateur_nom").text() || firstP.find("a.lien_senfic").text() || "";
|
|
51
|
-
const speakerLabel = dedupeSpeaker(speakerLabelRaw);
|
|
52
|
-
const { mat, nom: nomCRI, qua: quaCRI } = readIntervenantMeta($block);
|
|
53
|
-
const qualFromSpans = extractAndRemoveLeadingQualite($, $block);
|
|
54
|
-
const qualite = norm(decodeHtmlEntities(quaCRI || "")) || qualFromSpans;
|
|
55
|
-
const canonicalName = dedupeSpeaker(nomCRI || speakerLabel);
|
|
56
|
-
const role = roleForSpeaker(speakerLabel) || roleForSpeaker(qualite) || roleForSpeaker(quaCRI || "");
|
|
57
|
-
const speechHtml = sanitizeInterventionHtml($, $block);
|
|
58
|
-
if (!norm(cheerio.load(speechHtml).text() || ""))
|
|
59
|
-
return;
|
|
60
|
-
addPoint({
|
|
61
|
-
code_grammaire: "PAROLE_GENERIQUE",
|
|
62
|
-
roledebat: role,
|
|
63
|
-
orateurs: { orateur: { nom: canonicalName, id: mat || "", qualite } },
|
|
64
|
-
texte: { _: speechHtml },
|
|
65
|
-
});
|
|
66
|
-
});
|
|
67
|
-
const contenu = {
|
|
68
|
-
quantiemes: { journee: metadonnees.dateSeance, session: metadonnees.session },
|
|
69
|
-
point: points,
|
|
70
|
-
};
|
|
71
|
-
return {
|
|
72
|
-
uid: "CRSSN" + xmlFilePath.replace(/^.*?(\d{8}).*$/i, "$1") + `-${wantedSlot}`,
|
|
73
|
-
seanceRef: "RUSN" + xmlFilePath.replace(/^.*?(\d{8}).*$/i, "$1") + "IDS-" + wantedSlot,
|
|
74
|
-
sessionRef: metadonnees.session,
|
|
75
|
-
metadonnees,
|
|
76
|
-
contenu,
|
|
77
|
-
};
|
|
78
|
-
}
|
|
79
|
-
catch (e) {
|
|
80
|
-
console.error(`[CRI] parseSlot error file=${xmlFilePath} slot=${wantedSlot}:`, e);
|
|
81
|
-
return null;
|
|
82
|
-
}
|
|
83
|
-
}
|
|
84
|
-
export function sessionStartYearFromDate(d) {
|
|
85
|
-
// Session (1th oct N → 30 sept N+1)
|
|
86
|
-
const m = d.getMonth();
|
|
87
|
-
const y = d.getFullYear();
|
|
88
|
-
return m >= 9 ? y : y - 1;
|
|
89
|
-
}
|
|
90
|
-
export function parseYYYYMMDD(yyyymmdd) {
|
|
91
|
-
if (!/^\d{8}$/.test(yyyymmdd))
|
|
92
|
-
return null;
|
|
93
|
-
const y = Number(yyyymmdd.slice(0, 4));
|
|
94
|
-
const m = Number(yyyymmdd.slice(4, 6)) - 1;
|
|
95
|
-
const d = Number(yyyymmdd.slice(6, 8));
|
|
96
|
-
const dt = new Date(y, m, d);
|
|
97
|
-
return Number.isFinite(dt.getTime()) ? dt : null;
|
|
98
|
-
}
|
|
99
|
-
export function deriveTitreObjetFromSommaire(sommaire, slot) {
|
|
100
|
-
const items = extractLevel1Items(sommaire);
|
|
101
|
-
const meaningful = items.filter(it => !isBoilerplate(it.label));
|
|
102
|
-
if (meaningful.length === 0) {
|
|
103
|
-
return {
|
|
104
|
-
titre: `Séance publique ${slotLabel(slot)}`,
|
|
105
|
-
objet: "",
|
|
106
|
-
};
|
|
107
|
-
}
|
|
108
|
-
const titre = meaningful[0].label;
|
|
109
|
-
const objet = meaningful.slice(0, 3).map(it => it.label).join(" ; ");
|
|
110
|
-
return { titre, objet };
|
|
111
|
-
}
|
|
112
|
-
function slotLabel(slot) {
|
|
113
|
-
switch (slot) {
|
|
114
|
-
case "MATIN": return "du matin";
|
|
115
|
-
case "APRES-MIDI": return "de l’après-midi";
|
|
116
|
-
case "SOIR": return "du soir";
|
|
117
|
-
default: return "";
|
|
118
|
-
}
|
|
119
|
-
}
|
|
120
|
-
const BOILERPLATE_PATTERNS = [
|
|
121
|
-
/proc(?:è|e)s-?verbal/i,
|
|
122
|
-
/hommages?/i,
|
|
123
|
-
/désignation des vice-?président/i,
|
|
124
|
-
/candidatures? aux?/i,
|
|
125
|
-
/ordre du jour/i,
|
|
126
|
-
/rappels? au règlement/i,
|
|
127
|
-
/communications?/i,
|
|
128
|
-
/dépôts?/i,
|
|
129
|
-
/proclamation/i,
|
|
130
|
-
/présidence de/i,
|
|
131
|
-
/questions? diverses?/i,
|
|
132
|
-
/ouverture de la séance/i,
|
|
133
|
-
/clo(?:t|̂)ure de la séance/i,
|
|
134
|
-
];
|
|
135
|
-
const isBoilerplate = (label) => !label?.trim() || BOILERPLATE_PATTERNS.some(rx => rx.test(label));
|
|
136
|
-
function extractLevel1Items(sommaire) {
|
|
137
|
-
const level1 = asArray(sommaire?.sommaire1);
|
|
138
|
-
return level1
|
|
139
|
-
.map(el => ({
|
|
140
|
-
numero: toInt(el?.valeur_pts_odj),
|
|
141
|
-
label: String(el?.titreStruct?.intitule ?? "").trim(),
|
|
142
|
-
}))
|
|
143
|
-
.filter(it => !!it.label)
|
|
144
|
-
.sort((a, b) => a.numero - b.numero);
|
|
145
|
-
}
|
|
146
|
-
function stripTrailingPunct(s) { return s.replace(/\s*([:,.;])\s*$/u, "").trim(); }
|
|
147
|
-
function dedupeSpeaker(raw) {
|
|
148
|
-
let s = norm(raw);
|
|
149
|
-
s = stripTrailingPunct(s);
|
|
150
|
-
const dupPatterns = [/^(.+?)\s*[.]\s*\1$/u, /^(.+?)\s*,\s*\1,?$/u, /^(.+?)\s+\1$/u];
|
|
151
|
-
for (const re of dupPatterns) {
|
|
152
|
-
const m = s.match(re);
|
|
153
|
-
if (m) {
|
|
154
|
-
s = m[1];
|
|
155
|
-
break;
|
|
156
|
-
}
|
|
157
|
-
}
|
|
158
|
-
return s.replace(/\.\s*$/, "");
|
|
159
|
-
}
|
|
160
|
-
function decodeHtmlEntities(s) {
|
|
161
|
-
return s.replace(/&#(\d+);/g, (_, d) => String.fromCharCode(parseInt(d, 10)))
|
|
162
|
-
.replace(/&#x([0-9a-fA-F]+);/g, (_, h) => String.fromCharCode(parseInt(h, 16)));
|
|
163
|
-
}
|
|
164
|
-
function fixApostrophes(s) {
|
|
165
|
-
let out = s;
|
|
166
|
-
out = out.replace(/\s*’\s*/g, "’");
|
|
167
|
-
out = out.replace(/\b([dljctmsn])\s*’/gi, (_, m) => m + "’");
|
|
168
|
-
out = out.replace(/’\s+([A-Za-zÀ-ÖØ-öø-ÿ])/g, "’$1");
|
|
169
|
-
out = out.replace(/\s+([,;:.!?])/g, "$1");
|
|
170
|
-
return out;
|
|
171
|
-
}
|
|
172
|
-
function normalizeTitle(text) { return text.replace(/^PR[ÉE]SIDENCE DE\b/i, "Présidence de "); }
|
|
173
|
-
function roleForSpeaker(labelOrQualite) {
|
|
174
|
-
const s = (labelOrQualite || "").toLowerCase();
|
|
175
|
-
if (/^(m\.|mme)?\s*(le|la)\s+pr[ée]sident(e)?\b/.test(s) || /\bpr[ée]sident[e]?\s+de\s+séance\b/.test(s))
|
|
176
|
-
return "président";
|
|
177
|
-
return "";
|
|
178
|
-
}
|
|
179
|
-
function readIntervenantMeta($block) {
|
|
180
|
-
const int = $block.find('cri\\:intervenant').first();
|
|
181
|
-
if (int.length)
|
|
182
|
-
return { mat: int.attr("mat") || undefined, nom: int.attr("nom") || undefined, qua: int.attr("qua") || undefined };
|
|
183
|
-
const html = $block.html() || "";
|
|
184
|
-
const m = html.match(/<!--\s*cri:intervenant\b([^>]+)-->/i);
|
|
185
|
-
if (!m)
|
|
186
|
-
return {};
|
|
187
|
-
const out = {};
|
|
188
|
-
const re = /(\w+)="([^"]*)"/g;
|
|
189
|
-
let a;
|
|
190
|
-
while ((a = re.exec(m[1])))
|
|
191
|
-
out[a[1]] = decodeHtmlEntities(a[2]);
|
|
192
|
-
return { mat: out["mat"], nom: out["nom"], qua: out["qua"] };
|
|
193
|
-
}
|
|
194
|
-
function extractAndRemoveLeadingQualite($, $block) {
|
|
195
|
-
const firstP = $block.find("p").first();
|
|
196
|
-
if (firstP.length === 0)
|
|
197
|
-
return "";
|
|
198
|
-
const parts = [];
|
|
199
|
-
let stop = false;
|
|
200
|
-
firstP.contents().each((_, node) => {
|
|
201
|
-
if (stop)
|
|
202
|
-
return;
|
|
203
|
-
if (node.type === "tag") {
|
|
204
|
-
const $node = $(node);
|
|
205
|
-
if ($node.hasClass("orateur_nom")) {
|
|
206
|
-
$node.remove();
|
|
207
|
-
return;
|
|
208
|
-
}
|
|
209
|
-
if ($node.hasClass("orateur_qualite")) {
|
|
210
|
-
parts.push($node.text() || "");
|
|
211
|
-
$node.remove();
|
|
212
|
-
return;
|
|
213
|
-
}
|
|
214
|
-
const t = norm($node.text() || "");
|
|
215
|
-
if (t)
|
|
216
|
-
stop = true;
|
|
217
|
-
else
|
|
218
|
-
$node.remove();
|
|
219
|
-
}
|
|
220
|
-
else if (node.type === "text") {
|
|
221
|
-
const t = norm(node.data || "");
|
|
222
|
-
if (!t || /^[:.,;–—-]+$/.test(t)) {
|
|
223
|
-
node.data = "";
|
|
224
|
-
return;
|
|
225
|
-
}
|
|
226
|
-
stop = true;
|
|
227
|
-
}
|
|
228
|
-
});
|
|
229
|
-
return fixApostrophes(norm(parts.join(" ")));
|
|
230
|
-
}
|
|
231
|
-
function sanitizeInterventionHtml($, $block) {
|
|
232
|
-
const $clone = $block.clone();
|
|
233
|
-
$clone.find('a[name]').remove();
|
|
234
|
-
$clone.find('div[align="right"]').remove();
|
|
235
|
-
$clone.find('a.link').remove();
|
|
236
|
-
$clone.find('img').remove();
|
|
237
|
-
$clone.find('a#ameli_amendement_cri_phrase, a#ameli_amendement_cra_contenu, a#ameli_amendement_cra_objet').remove();
|
|
238
|
-
$clone.find(".orateur_nom, .orateur_qualite").remove();
|
|
239
|
-
let html = $clone.html() || "";
|
|
240
|
-
html = html.replace(/<!--[\s\S]*?-->/g, "");
|
|
241
|
-
return html.trim();
|
|
242
|
-
}
|
|
243
|
-
function extractSommaireForIntervals($, idx, intervals) {
|
|
244
|
-
const inIv = (el) => elementInAnyInterval(el, idx, intervals);
|
|
245
|
-
const root = $("body");
|
|
246
|
-
const sommaire = { presidentSeance: { _: "" }, sommaire1: [] };
|
|
247
|
-
// (1) Présidence (tm2) — première ligne dans l’intervalle
|
|
248
|
-
const pres = root.find("p.tm2").filter((_, el) => inIv(el)).first();
|
|
249
|
-
if (pres.length)
|
|
250
|
-
sommaire.presidentSeance = { _: norm(pres.text()) };
|
|
251
|
-
// (2) Paras tm5 présents dans l’intervalle
|
|
252
|
-
const paras = [];
|
|
253
|
-
root.find("p.tm5").each((_, el) => {
|
|
254
|
-
if (!inIv(el))
|
|
255
|
-
return;
|
|
256
|
-
const t = norm($(el).text());
|
|
257
|
-
if (t)
|
|
258
|
-
paras.push({ _: t });
|
|
259
|
-
});
|
|
260
|
-
if (paras.length)
|
|
261
|
-
sommaire.para = paras.length === 1 ? paras[0] : paras;
|
|
262
|
-
// (3) Items de 1er niveau (tm3) présents dans l’intervalle
|
|
263
|
-
const items = [];
|
|
264
|
-
root.find("p.tm3").each((_, el) => {
|
|
265
|
-
if (!inIv(el))
|
|
266
|
-
return;
|
|
267
|
-
const $p = $(el);
|
|
268
|
-
const full = norm($p.text() || "");
|
|
269
|
-
if (!full)
|
|
270
|
-
return;
|
|
271
|
-
const numMatch = full.match(/^(\d+)\s*[.\-–—]\s*/);
|
|
272
|
-
const valeur = numMatch ? numMatch[1] : undefined;
|
|
273
|
-
// prefere intitule in ancre <a> if present
|
|
274
|
-
const a = $p.find("a").first();
|
|
275
|
-
const intituleRaw = a.length ? a.text() : full.replace(/^(\d+)\s*[.\-–—]\s*/, "");
|
|
276
|
-
const intitule = norm(intituleRaw);
|
|
277
|
-
// id_syceron from href="#Niv1_SOMx"
|
|
278
|
-
const href = (a.attr("href") || "").trim();
|
|
279
|
-
const idSyceron = href.startsWith("#") ? href.slice(1) : href;
|
|
280
|
-
const titreStruct = { id_syceron: idSyceron || "", intitule };
|
|
281
|
-
items.push({ valeur_pts_odj: valeur, titreStruct });
|
|
282
|
-
});
|
|
283
|
-
if (items.length)
|
|
284
|
-
sommaire.sommaire1 = items;
|
|
285
|
-
return sommaire;
|
|
286
|
-
}
|
|
287
|
-
function extractMetadonnees($, filePath) {
|
|
288
|
-
let dateText = norm($("h1, h2, .page-title").first().text() || "");
|
|
289
|
-
if (!dateText)
|
|
290
|
-
dateText = norm($("p").first().text() || "");
|
|
291
|
-
const dateMatch = dateText.match(/\b(\d{1,2}\s+\w+\s+\d{4})\b/i);
|
|
292
|
-
const allText = norm($("body").text() || "");
|
|
293
|
-
const sessionMatch = allText.match(/\bsession\s+(\d{4}-\d{4})\b/i);
|
|
294
|
-
let dateSeance = dateMatch?.[1] || "";
|
|
295
|
-
if (!dateSeance) {
|
|
296
|
-
const m = filePath.match(/d(\d{4})(\d{2})(\d{2})\.xml$/i);
|
|
297
|
-
if (m)
|
|
298
|
-
dateSeance = `${m[1]}-${m[2]}-${m[3]}`;
|
|
299
|
-
}
|
|
300
|
-
return {
|
|
301
|
-
dateSeance,
|
|
302
|
-
dateSeanceJour: dateSeance,
|
|
303
|
-
numSeanceJour: "",
|
|
304
|
-
numSeance: "",
|
|
305
|
-
typeAssemblee: "SN",
|
|
306
|
-
legislature: "",
|
|
307
|
-
session: sessionMatch?.[1] || "",
|
|
308
|
-
nomFichierJo: "",
|
|
309
|
-
validite: "",
|
|
310
|
-
etat: "",
|
|
311
|
-
diffusion: "",
|
|
312
|
-
version: "1.0",
|
|
313
|
-
environnement: "",
|
|
314
|
-
heureGeneration: new Date()
|
|
315
|
-
};
|
|
316
|
-
}
|
|
317
|
-
function elementInAnyInterval(el, idx, intervals) {
|
|
318
|
-
const p = idx.get(el);
|
|
319
|
-
if (p == null)
|
|
320
|
-
return false;
|
|
321
|
-
for (const iv of intervals)
|
|
322
|
-
if (p >= iv.start && p < iv.end)
|
|
323
|
-
return true;
|
|
324
|
-
return false;
|
|
325
|
-
}
|