@tricoteuses/senat 2.15.6 → 2.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/datasets.js +0 -1
- package/lib/model/agenda.js +9 -16
- package/lib/model/commission.d.ts +9 -1
- package/lib/model/commission.js +47 -33
- package/lib/model/scrutins.js +4 -3
- package/lib/model/seance.js +1 -6
- package/lib/model/util.d.ts +3 -0
- package/lib/model/util.js +32 -0
- package/lib/scripts/retrieve_cr_commission.js +90 -72
- package/lib/scripts/retrieve_videos.d.ts +0 -2
- package/lib/scripts/retrieve_videos.js +57 -33
- package/lib/types/agenda.d.ts +2 -2
- package/lib/utils/cr_spliting.js +4 -2
- package/lib/utils/reunion_grouping.d.ts +1 -1
- package/lib/utils/reunion_grouping.js +13 -42
- package/package.json +1 -1
- package/lib/model/compte_rendu.d.ts +0 -9
- package/lib/model/compte_rendu.js +0 -325
- package/lib/raw_types/db.d.ts +0 -11389
- package/lib/raw_types/db.js +0 -5
- package/lib/scripts/retrieve_comptes_rendus.d.ts +0 -6
- package/lib/scripts/retrieve_comptes_rendus.js +0 -274
package/lib/datasets.js
CHANGED
|
@@ -220,7 +220,6 @@ export const datasets = {
|
|
|
220
220
|
],
|
|
221
221
|
votsen: [
|
|
222
222
|
{ name: "idx_scrnum", columns: ["scrnum"] },
|
|
223
|
-
{ name: "idx_sesann", columns: ["sesann"] },
|
|
224
223
|
{ name: "idx_titsencod", columns: ["titsencod"] },
|
|
225
224
|
{ name: "idx_stavotidt", columns: ["stavotidt"] },
|
|
226
225
|
{ name: "idx_posvotcod", columns: ["posvotcod"] },
|
package/lib/model/agenda.js
CHANGED
|
@@ -7,9 +7,7 @@ function eventIsSeance(eventElement) {
|
|
|
7
7
|
return eventElement.classList.contains("evt-seance");
|
|
8
8
|
}
|
|
9
9
|
function getEventType(eventClasses) {
|
|
10
|
-
const typeClass = [...eventClasses]
|
|
11
|
-
.find(className => className.startsWith("evt-"))
|
|
12
|
-
|| null;
|
|
10
|
+
const typeClass = [...eventClasses].find((className) => className.startsWith("evt-")) || null;
|
|
13
11
|
switch (typeClass) {
|
|
14
12
|
case "evt-seance":
|
|
15
13
|
return "Séance publique";
|
|
@@ -25,8 +23,7 @@ function getEventType(eventClasses) {
|
|
|
25
23
|
return null;
|
|
26
24
|
}
|
|
27
25
|
function getUrlDossierSenat(lienElements) {
|
|
28
|
-
const urlElement = [...lienElements]
|
|
29
|
-
.find(lienElement => lienElement.textContent?.includes("dossier législatif"));
|
|
26
|
+
const urlElement = [...lienElements].find((lienElement) => lienElement.textContent?.includes("dossier législatif"));
|
|
30
27
|
return urlElement ? urlElement.getAttribute("href") : null;
|
|
31
28
|
}
|
|
32
29
|
function getQuantieme(eventElement, seancesElements) {
|
|
@@ -60,17 +57,16 @@ function normalizeTime(timeStr) {
|
|
|
60
57
|
?.replace(/^(?:l')?après-midi/i, "16h00") // We chose "après-midi" to mean 16h00
|
|
61
58
|
?.replace(/^(?:le )?soir/i, "20h00") // We chose "soir" to mean 20h00
|
|
62
59
|
?.replace(/^(?:la )?nuit/i, "22h00") // We chose "nuit" to mean 22h00
|
|
63
|
-
?.replace(/^à\s/
|
|
64
|
-
?.replace(/heures/
|
|
60
|
+
?.replace(/^à\s/gi, "")
|
|
61
|
+
?.replace(/heures/gi, "h00")
|
|
65
62
|
?.replace(/\set.*/i, "")
|
|
66
63
|
?.replace(/,.*/, "")
|
|
67
64
|
?.replace(/\s\(hors hémicycle\)/i, "")
|
|
68
|
-
?.replace(/\s*h\s*/
|
|
65
|
+
?.replace(/\s*h\s*/gi, "h");
|
|
69
66
|
}
|
|
70
67
|
function getStartAndEndTimes(timeStr) {
|
|
71
68
|
const normalizedTime = normalizeTime(timeStr);
|
|
72
|
-
const timeMatches = normalizedTime
|
|
73
|
-
?.match(/^de (?<startTime>\d{2}h\d{2}) à (?<endTime>\d{2}h\d{2})$/i);
|
|
69
|
+
const timeMatches = normalizedTime?.match(/^de (?<startTime>\d{2}h\d{2}) à (?<endTime>\d{2}h\d{2})$/i);
|
|
74
70
|
if (timeMatches?.groups) {
|
|
75
71
|
const { startTime, endTime } = timeMatches.groups;
|
|
76
72
|
return {
|
|
@@ -86,7 +82,7 @@ function getStartAndEndTimes(timeStr) {
|
|
|
86
82
|
function transformAgenda(document, fileName) {
|
|
87
83
|
const agendaEvents = [];
|
|
88
84
|
const eventElements = document.querySelectorAll(".evt");
|
|
89
|
-
const seanceElements = Array.from(eventElements).filter(eventElement => eventIsSeance(eventElement));
|
|
85
|
+
const seanceElements = Array.from(eventElements).filter((eventElement) => eventIsSeance(eventElement));
|
|
90
86
|
for (const eventElement of eventElements) {
|
|
91
87
|
const id = eventElement.previousElementSibling?.getAttribute("name") || null;
|
|
92
88
|
if (!id) {
|
|
@@ -96,12 +92,9 @@ function transformAgenda(document, fileName) {
|
|
|
96
92
|
const date = DateTime.fromFormat(fileName, ID_DATE_FORMAT).toFormat(STANDARD_DATE_FORMAT);
|
|
97
93
|
const timeOriginal = eventElement.querySelector(".time")?.textContent || null;
|
|
98
94
|
const { startTime, endTime } = getStartAndEndTimes(timeOriginal);
|
|
99
|
-
const titre = eventElement.querySelector(".titre")?.textContent?.trim() ||
|
|
95
|
+
const titre = eventElement.querySelector(".titre")?.textContent?.trim() || "";
|
|
100
96
|
const organe = eventElement.querySelector(".organe")?.textContent?.trim() || null;
|
|
101
|
-
const objet = eventElement.querySelector(".objet")?.textContent
|
|
102
|
-
?.trim()
|
|
103
|
-
?.replace(/^- /, "")
|
|
104
|
-
|| null;
|
|
97
|
+
const objet = eventElement.querySelector(".objet")?.textContent?.trim()?.replace(/^- /, "") || null;
|
|
105
98
|
const lieu = eventElement.querySelector(".lieu")?.textContent || null;
|
|
106
99
|
const videoElement = eventElement.querySelector(".video");
|
|
107
100
|
const urlDossierSenat = getUrlDossierSenat(eventElement.querySelectorAll(".lien a"));
|
package/lib/model/commission.d.ts
CHANGED
|
@@ -2,8 +2,16 @@ import * as cheerio from "cheerio";
|
|
|
2
2
|
import { CompteRendu } from "../types/compte_rendu";
|
|
3
3
|
import { GroupedReunion } from "../types/agenda";
|
|
4
4
|
export declare function getRemainingTextAfterSpeakerHeader($: cheerio.CheerioAPI, $p: cheerio.Cheerio<any>): string;
|
|
5
|
-
export
|
|
5
|
+
export type DaySection = {
|
|
6
|
+
title: string;
|
|
7
|
+
$start: cheerio.Cheerio<any>;
|
|
8
|
+
};
|
|
9
|
+
export declare function cleanTitle(t: string): string;
|
|
10
|
+
export declare function extractDayH3Sections($: cheerio.CheerioAPI, dateISO: string): DaySection[];
|
|
11
|
+
export declare function parseCommissionCRSectionFromDom($: cheerio.CheerioAPI, htmlFilePath: string, opts: {
|
|
6
12
|
dateISO: string;
|
|
7
13
|
hourShort: string | null;
|
|
8
14
|
organe?: string | null;
|
|
15
|
+
section: DaySection;
|
|
16
|
+
matched?: GroupedReunion;
|
|
9
17
|
}): CompteRendu | null;
|
package/lib/model/commission.js
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import * as cheerio from "cheerio";
|
|
2
2
|
import path from "path";
|
|
3
|
-
import fs from "fs";
|
|
4
3
|
import { norm, toCRDate } from "./util";
|
|
5
4
|
import { makeTypeGroupUid } from "../utils/reunion_grouping";
|
|
6
5
|
import { frDateToISO, hourShortToStartTime } from "../utils/cr_spliting";
|
|
@@ -162,47 +161,68 @@ function buildPointsFromParagraphs($, paras) {
|
|
|
162
161
|
flush();
|
|
163
162
|
return points;
|
|
164
163
|
}
|
|
165
|
-
export function parseCommissionCRFromFile(htmlFilePath, best, fallback) {
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
164
|
+
export function cleanTitle(t) {
|
|
165
|
+
return (t || "").replace(/\s+/g, " ").trim();
|
|
166
|
+
}
|
|
167
|
+
export function extractDayH3Sections($, dateISO) {
|
|
168
|
+
const sections = [];
|
|
169
|
+
const $dayRoot = findDayRoot($, dateISO);
|
|
170
|
+
if ($dayRoot.length === 0)
|
|
171
|
+
return sections;
|
|
172
|
+
let $cursor = $dayRoot.next();
|
|
173
|
+
while ($cursor.length && !$cursor.is("h2")) {
|
|
174
|
+
if ($cursor.is("h3")) {
|
|
175
|
+
const title = cleanTitle($cursor.text());
|
|
176
|
+
if (title)
|
|
177
|
+
sections.push({ title, $start: $cursor });
|
|
178
|
+
$cursor = $cursor.next();
|
|
179
|
+
continue;
|
|
180
|
+
}
|
|
181
|
+
const $h3 = $cursor.find("h3").first();
|
|
182
|
+
if ($h3.length) {
|
|
183
|
+
const title = cleanTitle($h3.text());
|
|
184
|
+
if (title)
|
|
185
|
+
sections.push({ title, $start: $h3 });
|
|
186
|
+
$cursor = $cursor.next();
|
|
187
|
+
continue;
|
|
170
188
|
}
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
189
|
+
$cursor = $cursor.next();
|
|
190
|
+
}
|
|
191
|
+
return sections;
|
|
192
|
+
}
|
|
193
|
+
export function parseCommissionCRSectionFromDom($, htmlFilePath, opts) {
|
|
194
|
+
try {
|
|
195
|
+
const { dateISO, hourShort, organe, section, matched } = opts;
|
|
196
|
+
const seanceRef = matched?.uid ?? makeTypeGroupUid(dateISO, "COM", matched?.events[0].id ?? hourShort ?? "", organe ?? undefined);
|
|
177
197
|
const uid = seanceRef.replace(/^RU/, "CRC");
|
|
178
|
-
const dateSeance = toCRDate(dateISO, startTime);
|
|
198
|
+
const dateSeance = toCRDate(dateISO, matched?.startTime ?? hourShortToStartTime(hourShort));
|
|
179
199
|
const $dayRoot = findDayRoot($, dateISO);
|
|
180
200
|
if ($dayRoot.length === 0) {
|
|
181
201
|
console.warn(`[COM-CR][parse] day root not found for ${dateISO} in ${path.basename(htmlFilePath)}`);
|
|
182
202
|
return null;
|
|
183
203
|
}
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
dayParas.push($cursor);
|
|
204
|
+
const paras = [];
|
|
205
|
+
let $cursor = section.$start;
|
|
206
|
+
// Jump title if we do not want to add it to paragraphes
|
|
207
|
+
$cursor = $cursor.next();
|
|
208
|
+
while ($cursor.length && !$cursor.is("h2") && !$cursor.is("h3")) {
|
|
190
209
|
if ($cursor.is(PARA_h3_SEL)) {
|
|
191
|
-
|
|
210
|
+
paras.push($cursor);
|
|
192
211
|
}
|
|
193
212
|
else {
|
|
194
213
|
const $ps = $cursor.find(PARA_h3_SEL);
|
|
195
214
|
if ($ps.length)
|
|
196
215
|
$ps.each((_, p) => {
|
|
197
|
-
|
|
216
|
+
paras.push($(p));
|
|
198
217
|
});
|
|
199
218
|
}
|
|
200
219
|
$cursor = $cursor.next();
|
|
201
220
|
}
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
221
|
+
const points = buildPointsFromParagraphs($, paras);
|
|
222
|
+
if (points.length < 4) {
|
|
223
|
+
console.warn(`[COM-CR][parse] Insufficient points found for section="${section.title}" date=${dateISO} in ${path.basename(htmlFilePath)}`);
|
|
224
|
+
return null;
|
|
225
|
+
}
|
|
206
226
|
const session = dateISO.slice(5, 7) >= "10" ? `${dateISO.slice(0, 4)}` : `${Number(dateISO.slice(0, 4)) - 1}`;
|
|
207
227
|
const contenu = {
|
|
208
228
|
quantiemes: { journee: dateISO, session },
|
|
@@ -224,16 +244,10 @@ export function parseCommissionCRFromFile(htmlFilePath, best, fallback) {
|
|
|
224
244
|
environnement: "prod",
|
|
225
245
|
heureGeneration: new Date(),
|
|
226
246
|
};
|
|
227
|
-
return {
|
|
228
|
-
uid,
|
|
229
|
-
seanceRef,
|
|
230
|
-
sessionRef: session,
|
|
231
|
-
metadonnees,
|
|
232
|
-
contenu,
|
|
233
|
-
};
|
|
247
|
+
return { uid, seanceRef, sessionRef: session, metadonnees, contenu };
|
|
234
248
|
}
|
|
235
249
|
catch (e) {
|
|
236
|
-
console.error(`[COM-CR][parse] error file=${path.basename(htmlFilePath)}:`, e);
|
|
250
|
+
console.error(`[COM-CR][parse] error section file=${path.basename(htmlFilePath)}:`, e);
|
|
237
251
|
return null;
|
|
238
252
|
}
|
|
239
253
|
}
|
package/lib/model/scrutins.js
CHANGED
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
import { jsonArrayFrom } from "kysely/helpers/postgres";
|
|
2
2
|
import { dbSenat } from "../databases";
|
|
3
3
|
import { rtrim, toDateString } from "./util";
|
|
4
|
-
function votes(scrutinNum, scrutinSession, scrutinDate) {
|
|
4
|
+
function votes(scrutinNum, scrutinDate) {
|
|
5
5
|
return jsonArrayFrom(dbSenat
|
|
6
6
|
.selectFrom("dosleg.votsen")
|
|
7
7
|
.leftJoin("dosleg.titsen", "dosleg.titsen.titsencod", "dosleg.votsen.titsencod")
|
|
8
8
|
.leftJoin("dosleg.stavot", "dosleg.stavot.stavotidt", "dosleg.votsen.stavotidt")
|
|
9
9
|
.leftJoin("dosleg.posvot", "dosleg.posvot.posvotcod", "dosleg.votsen.posvotcod")
|
|
10
|
+
.leftJoin("sens.sen", "dosleg.votsen.senmat", "sens.sen.senmat")
|
|
10
11
|
.leftJoin("sens.memgrppol", (join) => join
|
|
11
12
|
.onRef("sens.memgrppol.senmat", "=", "dosleg.votsen.senmat")
|
|
12
13
|
.onRef("sens.memgrppol.memgrppoldatdeb", "<=", scrutinDate)
|
|
@@ -15,7 +16,7 @@ function votes(scrutinNum, scrutinSession, scrutinDate) {
|
|
|
15
16
|
eb("sens.memgrppol.memgrppoldatfin", "is", null)
|
|
16
17
|
])))
|
|
17
18
|
.where("dosleg.votsen.scrnum", "=", scrutinNum)
|
|
18
|
-
.where("
|
|
19
|
+
.where("sens.sen.etasencod", "=", "ACTIF")
|
|
19
20
|
.select([
|
|
20
21
|
"dosleg.votsen.senmat as matricule_votant",
|
|
21
22
|
"dosleg.votsen.senmatdel as matricule_delegant",
|
|
@@ -64,7 +65,7 @@ const findAllScrutinsQuery = dbSenat
|
|
|
64
65
|
"scr.scrconsea as nombre_contre_seance",
|
|
65
66
|
"scr.scrpou as nombre_pour",
|
|
66
67
|
"scr.scrpousea as nombre_pour_seance",
|
|
67
|
-
votes(ref("scr.scrnum"), ref("scr.
|
|
68
|
+
votes(ref("scr.scrnum"), ref("scr.scrdat")).as("votes"),
|
|
68
69
|
misesAuPoint(ref("scr.scrnum")).as("mises_au_point"),
|
|
69
70
|
])
|
|
70
71
|
.$narrowType();
|
package/lib/model/seance.js
CHANGED
|
@@ -2,7 +2,7 @@ import fs from "fs";
|
|
|
2
2
|
import * as cheerio from "cheerio";
|
|
3
3
|
import path from "path";
|
|
4
4
|
import { computeIntervalsBySlot } from "../utils/cr_spliting";
|
|
5
|
-
import { norm, toCRDate } from "./util";
|
|
5
|
+
import { decodeHtmlEntities, norm, toCRDate } from "./util";
|
|
6
6
|
const asArray = (x) => (x == null ? [] : Array.isArray(x) ? x : [x]);
|
|
7
7
|
const toInt = (s) => (Number.isFinite(Number(s)) ? Number(s) : Number.POSITIVE_INFINITY);
|
|
8
8
|
export async function parseCompteRenduSlotFromFile(xmlFilePath, wantedSlot, firstSlotOfDay) {
|
|
@@ -166,11 +166,6 @@ function dedupeSpeaker(raw) {
|
|
|
166
166
|
}
|
|
167
167
|
return s.replace(/\.\s*$/, "");
|
|
168
168
|
}
|
|
169
|
-
function decodeHtmlEntities(s) {
|
|
170
|
-
return s
|
|
171
|
-
.replace(/&#(\d+);/g, (_, d) => String.fromCharCode(parseInt(d, 10)))
|
|
172
|
-
.replace(/&#x([0-9a-fA-F]+);/g, (_, h) => String.fromCharCode(parseInt(h, 16)));
|
|
173
|
-
}
|
|
174
169
|
function fixApostrophes(s) {
|
|
175
170
|
let out = s;
|
|
176
171
|
out = out.replace(/\s*’\s*/g, "’");
|
package/lib/model/util.d.ts
CHANGED
|
@@ -8,3 +8,6 @@ export declare function rtrim(expr: Expression<string | null | undefined>): impo
|
|
|
8
8
|
export declare function toDateString(expr: Expression<Date | null | undefined>, format?: Expression<string>): import("kysely").RawBuilder<string>;
|
|
9
9
|
export declare function norm(s?: string | null): string;
|
|
10
10
|
export declare function toCRDate(dateISO: string, startTime?: string | null): string;
|
|
11
|
+
export declare function normalizeTitle(t: string): string;
|
|
12
|
+
export declare function jaccardTokenSim(a: string, b: string): number;
|
|
13
|
+
export declare function decodeHtmlEntities(s?: string | null): string;
|
package/lib/model/util.js
CHANGED
|
@@ -42,3 +42,35 @@ export function toCRDate(dateISO, startTime) {
|
|
|
42
42
|
}
|
|
43
43
|
return `${yyyymmdd}${hh}${mm}${ss}${SSS}`;
|
|
44
44
|
}
|
|
45
|
+
export function normalizeTitle(t) {
|
|
46
|
+
return (t || "")
|
|
47
|
+
.toLowerCase()
|
|
48
|
+
.normalize("NFD")
|
|
49
|
+
.replace(/\p{Diacritic}/gu, "")
|
|
50
|
+
.replace(/[^a-z0-9\s]/g, " ")
|
|
51
|
+
.replace(/\s+/g, " ")
|
|
52
|
+
.trim();
|
|
53
|
+
}
|
|
54
|
+
export function jaccardTokenSim(a, b) {
|
|
55
|
+
const A = new Set(normalizeTitle(a).split(" ").filter(Boolean));
|
|
56
|
+
const B = new Set(normalizeTitle(b).split(" ").filter(Boolean));
|
|
57
|
+
if (A.size === 0 || B.size === 0)
|
|
58
|
+
return 0;
|
|
59
|
+
let inter = 0;
|
|
60
|
+
for (const x of A)
|
|
61
|
+
if (B.has(x))
|
|
62
|
+
inter++;
|
|
63
|
+
return inter / (A.size + B.size - inter);
|
|
64
|
+
}
|
|
65
|
+
export function decodeHtmlEntities(s) {
|
|
66
|
+
if (!s)
|
|
67
|
+
return "";
|
|
68
|
+
return s
|
|
69
|
+
.replace(/&#x([0-9a-fA-F]+);/g, (_, h) => String.fromCodePoint(parseInt(h, 16)))
|
|
70
|
+
.replace(/&#(\d+);/g, (_, d) => String.fromCodePoint(parseInt(d, 10)))
|
|
71
|
+
.replace(/&amp;/g, "&")
|
|
72
|
+
.replace(/&lt;/g, "<")
|
|
73
|
+
.replace(/&gt;/g, ">")
|
|
74
|
+
.replace(/&quot;/g, '"')
|
|
75
|
+
.replace(/&apos;/g, "'");
|
|
76
|
+
}
|
package/lib/scripts/retrieve_cr_commission.js
CHANGED
|
@@ -4,12 +4,13 @@ import path from "path";
|
|
|
4
4
|
import * as cheerio from "cheerio";
|
|
5
5
|
import { COMMISSION_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER } from "../loaders";
|
|
6
6
|
import { loadAgendaForDate, parseCommissionMetadataFromHtml, linkCRtoCommissionGroup } from "../utils/cr_spliting";
|
|
7
|
-
import {
|
|
7
|
+
import { cleanTitle, extractDayH3Sections, parseCommissionCRSectionFromDom } from "../model/commission";
|
|
8
8
|
import commandLineArgs from "command-line-args";
|
|
9
9
|
import { commonOptions } from "./shared/cli_helpers";
|
|
10
10
|
import { sessionStartYearFromDate } from "../model/seance";
|
|
11
11
|
import { getSessionsFromStart } from "../types/sessions";
|
|
12
12
|
import { ensureAndClearDir, fetchWithRetry } from "./shared/util";
|
|
13
|
+
import { jaccardTokenSim } from "../model/util";
|
|
13
14
|
class CommissionCRDownloadError extends Error {
|
|
14
15
|
constructor(message, url) {
|
|
15
16
|
super(`An error occurred while retrieving Commission CR ${url}: ${message}`);
|
|
@@ -27,35 +28,35 @@ const optionsDefinitions = [
|
|
|
27
28
|
];
|
|
28
29
|
const options = commandLineArgs(optionsDefinitions);
|
|
29
30
|
const COMMISSION_HUBS = {
|
|
30
|
-
"affaires
|
|
31
|
+
"Commission des affaires étrangères": [
|
|
31
32
|
"https://www.senat.fr/compte-rendu-commissions/affaires-etrangeres.html",
|
|
32
33
|
"https://www.senat.fr/compte-rendu-commissions/affaires-etrangeres_archives.html",
|
|
33
34
|
],
|
|
34
|
-
"affaires
|
|
35
|
+
"Commission des affaires économiques": [
|
|
35
36
|
"https://www.senat.fr/compte-rendu-commissions/economie.html",
|
|
36
37
|
"https://www.senat.fr/compte-rendu-commissions/economie_archives.html",
|
|
37
38
|
],
|
|
38
|
-
"amenagement
|
|
39
|
+
"Commission de l'amenagement du territoire et du développement durable": [
|
|
39
40
|
"https://www.senat.fr/compte-rendu-commissions/developpement-durable.html",
|
|
40
41
|
"https://www.senat.fr/compte-rendu-commissions/developpement-durable_archives.html",
|
|
41
42
|
],
|
|
42
|
-
culture: [
|
|
43
|
+
"Commission de la culture": [
|
|
43
44
|
"https://www.senat.fr/compte-rendu-commissions/culture.html",
|
|
44
45
|
"https://www.senat.fr/compte-rendu-commissions/culture_archives.html",
|
|
45
46
|
],
|
|
46
|
-
finances: [
|
|
47
|
+
"Commission des finances": [
|
|
47
48
|
"https://www.senat.fr/compte-rendu-commissions/finances.html",
|
|
48
49
|
"https://www.senat.fr/compte-rendu-commissions/finances_archives.html",
|
|
49
50
|
],
|
|
50
|
-
lois: [
|
|
51
|
+
"Commission des lois": [
|
|
51
52
|
"https://www.senat.fr/compte-rendu-commissions/lois.html",
|
|
52
53
|
"https://www.senat.fr/compte-rendu-commissions/lois_archives.html",
|
|
53
54
|
],
|
|
54
|
-
"affaires
|
|
55
|
+
"Commission des affaires sociales": [
|
|
55
56
|
"https://www.senat.fr/compte-rendu-commissions/affaires-sociales.html",
|
|
56
57
|
"https://www.senat.fr/compte-rendu-commissions/affaires-sociales_archives.html",
|
|
57
58
|
],
|
|
58
|
-
"affaires
|
|
59
|
+
"Commission des affaires européennes": [
|
|
59
60
|
"https://www.senat.fr/compte-rendu-commissions/affaires-europeennes.html",
|
|
60
61
|
"https://www.senat.fr/compte-rendu-commissions/affaires-europeennes_archives.html",
|
|
61
62
|
],
|
|
@@ -170,6 +171,15 @@ function timeProximityScore(h, openHHMM, maxDeltaMin) {
|
|
|
170
171
|
return 0;
|
|
171
172
|
return 1 - d / maxDeltaMin; // 0..1 (1 = même heure)
|
|
172
173
|
}
|
|
174
|
+
function titleSimilarity(reunion, sectionTitle) {
|
|
175
|
+
const t = reunion.titre ?? "";
|
|
176
|
+
const o = reunion.objet ?? "";
|
|
177
|
+
if (!sectionTitle.trim())
|
|
178
|
+
return 0;
|
|
179
|
+
const sTit = jaccardTokenSim(t, sectionTitle);
|
|
180
|
+
const sObj = jaccardTokenSim(o, sectionTitle);
|
|
181
|
+
return Math.max(sTit, sObj);
|
|
182
|
+
}
|
|
173
183
|
async function retrieveCommissionCRs(options = {}) {
|
|
174
184
|
const dataDir = options["dataDir"];
|
|
175
185
|
const fromSession = Number(options["fromSession"]);
|
|
@@ -245,86 +255,94 @@ async function retrieveCommissionCRs(options = {}) {
|
|
|
245
255
|
for (const f of htmlFiles) {
|
|
246
256
|
const htmlPath = path.join(commissionDir, f);
|
|
247
257
|
let meta;
|
|
258
|
+
let raw = "";
|
|
248
259
|
try {
|
|
249
|
-
|
|
260
|
+
raw = await fs.readFile(htmlPath, "utf8");
|
|
250
261
|
meta = parseCommissionMetadataFromHtml(raw, f);
|
|
251
262
|
}
|
|
252
263
|
catch (e) {
|
|
253
264
|
console.warn(`[COM-CR][PRE][${session}] Cannot read/parse ${f}:`, e);
|
|
254
265
|
continue;
|
|
255
266
|
}
|
|
256
|
-
|
|
257
|
-
|
|
267
|
+
if (!meta?.days?.length)
|
|
268
|
+
continue;
|
|
269
|
+
const $ = cheerio.load(raw, { xmlMode: false });
|
|
270
|
+
for (const day of meta.days) {
|
|
258
271
|
const yyyymmdd = day.date.replace(/-/g, "");
|
|
259
272
|
const dt = new Date(Number(day.date.slice(0, 4)), Number(day.date.slice(5, 7)) - 1, Number(day.date.slice(8, 10)));
|
|
260
273
|
const daySession = sessionStartYearFromDate(dt);
|
|
261
274
|
let hits = await loadAgendaForDate(dataDir, yyyymmdd, daySession);
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
275
|
+
console.log(`[COM-CR][TRANSFORM] ${f} → ${hits.length} agenda events on ${day.date} :`);
|
|
276
|
+
const sections = extractDayH3Sections($, day.date);
|
|
277
|
+
if (sections.length === 0) {
|
|
278
|
+
console.warn(`[COM-CR][TRANSFORM] no sections found for ${f} on ${day.date}, skipping.`);
|
|
279
|
+
continue;
|
|
280
|
+
}
|
|
266
281
|
const MAX_TIME_DELTA_MIN = 120;
|
|
267
|
-
const ORGANE_GATE = 0.55;
|
|
268
|
-
const
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
282
|
+
const ORGANE_GATE = 0.55;
|
|
283
|
+
const TITLE_GATE = 0.2;
|
|
284
|
+
const W_ORG = 0.5;
|
|
285
|
+
const W_TIM = 0.2;
|
|
286
|
+
const W_TIT = 0.3;
|
|
287
|
+
for (let sIdx = 0; sIdx < sections.length; sIdx++) {
|
|
288
|
+
const sec = sections[sIdx];
|
|
289
|
+
let best = null;
|
|
290
|
+
let reason = "fallback-none";
|
|
291
|
+
if (hits.length) {
|
|
292
|
+
const scored = hits
|
|
293
|
+
.map((h) => {
|
|
294
|
+
const sOrg = organeSimilarity(h, commissionKey); // 0..1
|
|
295
|
+
const sTim = timeProximityScore(h, day.openTime ?? null, MAX_TIME_DELTA_MIN); // 0..1
|
|
296
|
+
const sTit = titleSimilarity(h, sec.title); // 0..1
|
|
297
|
+
const total = W_ORG * sOrg + W_TIM * sTim + W_TIT * sTit;
|
|
298
|
+
return { h, sOrg, sTim, sTit, total };
|
|
299
|
+
})
|
|
300
|
+
.filter((x) => x.sOrg >= ORGANE_GATE && x.sTit >= TITLE_GATE)
|
|
301
|
+
.sort((a, b) => b.total - a.total);
|
|
302
|
+
if (scored[0]) {
|
|
303
|
+
best = scored[0].h;
|
|
304
|
+
reason =
|
|
305
|
+
scored[0].sTit >= Math.max(scored[0].sOrg, scored[0].sTim)
|
|
306
|
+
? "title"
|
|
307
|
+
: scored[0].sOrg >= scored[0].sTim
|
|
308
|
+
? "organe"
|
|
309
|
+
: "time";
|
|
285
310
|
}
|
|
286
311
|
}
|
|
312
|
+
const hourShort = toHourShort(day.openTime) ?? "NA";
|
|
313
|
+
const cr = parseCommissionCRSectionFromDom($, htmlPath, {
|
|
314
|
+
dateISO: day.date,
|
|
315
|
+
hourShort,
|
|
316
|
+
organe: commissionKey,
|
|
317
|
+
section: sec,
|
|
318
|
+
matched: best ?? undefined,
|
|
319
|
+
});
|
|
320
|
+
if (!cr) {
|
|
321
|
+
console.warn(`[COM-CR][TRANSFORM] parse failed for section#${sIdx} ${path.basename(htmlPath)} → ${best ? best.uid : "NO-GROUP"} (${commissionKey})`);
|
|
322
|
+
continue;
|
|
323
|
+
}
|
|
324
|
+
const fileUid = cr.uid;
|
|
325
|
+
const transformedSessionDir = path.join(transformedRoot, String(daySession));
|
|
326
|
+
fs.ensureDirSync(transformedSessionDir);
|
|
327
|
+
const outPath = path.join(transformedSessionDir, `${fileUid}.json`);
|
|
328
|
+
await fs.writeJSON(outPath, cr, { spaces: 2 });
|
|
329
|
+
const titreGuess = cleanTitle(sections[sIdx].title) || "Commission du " + day.date;
|
|
330
|
+
const up = await linkCRtoCommissionGroup({
|
|
331
|
+
dataDir,
|
|
332
|
+
dateISO: day.date,
|
|
333
|
+
organeDetected: commissionKey,
|
|
334
|
+
hourShort,
|
|
335
|
+
crUid: fileUid,
|
|
336
|
+
titreGuess,
|
|
337
|
+
groupUid: best ? best.uid : undefined,
|
|
338
|
+
});
|
|
339
|
+
totalFiles++;
|
|
340
|
+
if (up.created || up.updated)
|
|
341
|
+
linkedFiles++;
|
|
287
342
|
else {
|
|
288
|
-
|
|
289
|
-
reason = "fallback-none";
|
|
343
|
+
console.warn(`[COM-CR][AGENDA][WARN] CR ${fileUid} (section#${sIdx}) not linked (reason=${reason})`);
|
|
290
344
|
}
|
|
291
345
|
}
|
|
292
|
-
// Parse CR
|
|
293
|
-
const hourShort = toHourShort(day.openTime) ?? "NA";
|
|
294
|
-
const cr = parseCommissionCRFromFile(htmlPath, best ?? undefined, {
|
|
295
|
-
dateISO: day.date,
|
|
296
|
-
hourShort,
|
|
297
|
-
organe: commissionKey,
|
|
298
|
-
});
|
|
299
|
-
if (!cr) {
|
|
300
|
-
console.warn(`[COM-CR][TRANSFORM] parse failed for ${f} → ${best ? best.uid : "NO-GROUP"} (${commissionKey})`);
|
|
301
|
-
continue;
|
|
302
|
-
}
|
|
303
|
-
const fileUid = cr.uid;
|
|
304
|
-
const transformedSessionDir = path.join(transformedRoot, String(daySession));
|
|
305
|
-
fs.ensureDirSync(transformedSessionDir);
|
|
306
|
-
const outPath = path.join(transformedSessionDir, `${fileUid}.json`);
|
|
307
|
-
await fs.writeJSON(outPath, cr, { spaces: 2 });
|
|
308
|
-
const npts = Array.isArray(cr.contenu.point) ? cr.contenu.point.length : cr.contenu.point ? 1 : 0;
|
|
309
|
-
if (!options["silent"]) {
|
|
310
|
-
console.log(`[COM-CR][TRANSFORM] saved ${path.basename(outPath)} (points=${npts}) [${commissionKey}]`);
|
|
311
|
-
}
|
|
312
|
-
const titreGuess = organeLabel || "Commission";
|
|
313
|
-
const up = await linkCRtoCommissionGroup({
|
|
314
|
-
dataDir,
|
|
315
|
-
dateISO: day.date,
|
|
316
|
-
organeDetected: best?.organe ?? null,
|
|
317
|
-
hourShort,
|
|
318
|
-
crUid: fileUid,
|
|
319
|
-
titreGuess,
|
|
320
|
-
groupUid: best ? best.uid : undefined,
|
|
321
|
-
});
|
|
322
|
-
totalFiles++;
|
|
323
|
-
if (up.created || up.updated)
|
|
324
|
-
linkedFiles++;
|
|
325
|
-
else {
|
|
326
|
-
console.warn(`[COM-CR][AGENDA][WARN] CR ${fileUid} not linked to any agenda group (reason=${reason}, delta=${deltaMin ?? "NA"}m)`);
|
|
327
|
-
}
|
|
328
346
|
}
|
|
329
347
|
}
|
|
330
348
|
if (!options["silent"]) {
|
package/lib/scripts/retrieve_videos.d.ts
CHANGED
|
@@ -1,6 +1,4 @@
|
|
|
1
1
|
import { GroupedReunion } from "../types/agenda";
|
|
2
|
-
export declare function buildSenatVodMasterM3u8FromNvs(nvsText: string, finalText: string): string | null;
|
|
3
|
-
export declare function score(agenda: GroupedReunion, agendaTs: number | null, videoTitle?: string, videoEpoch?: number): number;
|
|
4
2
|
/**
|
|
5
3
|
* Build search strategies for senat's videos
|
|
6
4
|
*/
|