@tricoteuses/senat 2.9.10 → 2.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.md +22 -22
- package/README.md +116 -116
- package/lib/loaders.d.ts +6 -1
- package/lib/loaders.js +54 -0
- package/lib/model/agenda.js +0 -2
- package/lib/model/compte_rendu.d.ts +9 -2
- package/lib/model/compte_rendu.js +223 -211
- package/lib/model/util.d.ts +1 -0
- package/lib/model/util.js +3 -0
- package/lib/scripts/retrieve_agenda.js +25 -6
- package/lib/scripts/retrieve_comptes_rendus.d.ts +6 -1
- package/lib/scripts/retrieve_comptes_rendus.js +230 -77
- package/lib/scripts/retrieve_open_data.js +3 -1
- package/lib/scripts/retrieve_videos.js +1 -9
- package/lib/types/agenda.d.ts +19 -2
- package/lib/types/compte_rendu.d.ts +1 -1
- package/lib/utils/cr_spliting.d.ts +7 -0
- package/lib/utils/cr_spliting.js +125 -0
- package/lib/utils/reunion_grouping.d.ts +6 -0
- package/lib/utils/reunion_grouping.js +359 -0
- package/lib/validators/senat.d.ts +0 -0
- package/lib/validators/senat.js +24 -0
- package/package.json +98 -98
- package/lib/raw_types/kysely-table-types.d.ts +0 -5
- package/lib/raw_types/kysely-table-types.js +0 -1
|
@@ -1,119 +1,272 @@
|
|
|
1
|
+
/**
|
|
2
|
+
 * Must be run after retrieve_agenda.ts (it reads and updates the grouped agenda files). This script:
|
|
3
|
+
* - downloads the ZIP of comptes-rendus des débats (CRI) from data.senat.fr
|
|
4
|
+
* - extracts XML files, distributes them by session/year
|
|
5
|
+
*/
|
|
1
6
|
import assert from "assert";
|
|
2
7
|
import commandLineArgs from "command-line-args";
|
|
3
8
|
import fs from "fs-extra";
|
|
4
9
|
import path from "path";
|
|
5
|
-
import
|
|
6
|
-
import
|
|
10
|
+
import StreamZip from "node-stream-zip";
|
|
11
|
+
import * as cheerio from "cheerio";
|
|
12
|
+
import { AGENDA_FOLDER, COMPTES_RENDUS_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER, } from "../loaders";
|
|
7
13
|
import { commonOptions } from "./shared/cli_helpers";
|
|
8
|
-
import {
|
|
9
|
-
import {
|
|
14
|
+
import { deriveTitreObjetFromSommaire, parseCompteRenduSlotFromFile, parseYYYYMMDD, sessionStartYearFromDate } from "../model/compte_rendu";
|
|
15
|
+
import { makeGroupUid } from "../utils/reunion_grouping";
|
|
16
|
+
import { getSessionsFromStart } from "../types/sessions";
|
|
17
|
+
import { ensureAndClearDir, fetchWithRetry } from "./shared/util";
|
|
18
|
+
import { computeIntervalsBySlot } from "../utils/cr_spliting";
|
|
10
19
|
const optionsDefinitions = [
|
|
11
20
|
...commonOptions,
|
|
12
21
|
{
|
|
13
22
|
help: "parse and convert comptes-rendus des débats into JSON",
|
|
14
23
|
name: "parseDebats",
|
|
15
24
|
type: Boolean,
|
|
16
|
-
}
|
|
25
|
+
}
|
|
17
26
|
];
|
|
18
27
|
const options = commandLineArgs(optionsDefinitions);
|
|
19
|
-
const
|
|
28
|
+
const CRI_ZIP_URL = "https://data.senat.fr/data/debats/cri.zip";
|
|
29
|
+
const SLOT_ORDER = ["MATIN", "APRES-MIDI", "SOIR"];
|
|
20
30
|
class CompteRenduError extends Error {
|
|
21
|
-
constructor(message,
|
|
22
|
-
super(`An error occurred while retrieving
|
|
31
|
+
constructor(message, url) {
|
|
32
|
+
super(`An error occurred while retrieving ${url}: ${message}`);
|
|
23
33
|
}
|
|
24
34
|
}
|
|
25
|
-
|
|
26
|
-
for (
|
|
27
|
-
|
|
28
|
-
return
|
|
29
|
-
}
|
|
30
|
-
catch (e) {
|
|
31
|
-
if (attempt === retries)
|
|
32
|
-
break;
|
|
33
|
-
console.warn(`Fetch attempt ${attempt + 1} for ${url} failed. Retrying in ${backoffMs}ms...`);
|
|
34
|
-
await new Promise((resolve) => setTimeout(resolve, backoffMs));
|
|
35
|
-
backoffMs *= 2;
|
|
36
|
-
}
|
|
37
|
-
}
|
|
38
|
-
console.log(`Failed to fetch ${url} after ${retries} attempts.`);
|
|
35
|
+
/**
 * Returns the earliest slot of the day present in `slots`.
 *
 * Slots are ranked by their position in the module-level SLOT_ORDER list.
 *
 * @param slots Slot names found for a given day.
 * @returns The first SLOT_ORDER entry contained in `slots`, or null when none is.
 */
function pickFirstSlotOfDay(slots) {
    const earliest = SLOT_ORDER.find((candidate) => slots.includes(candidate));
    return earliest ?? null;
}
|
|
41
|
-
|
|
42
|
-
const
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
const transformedComptesRendusDir = path.join(comptesRendusRootDir, DATA_TRANSFORMED_FOLDER);
|
|
47
|
-
if (options["parseDebats"]) {
|
|
48
|
-
fs.ensureDirSync(transformedComptesRendusDir);
|
|
41
|
+
function loadAgendaSPSlotsForDate(dataDir, yyyymmdd, session) {
|
|
42
|
+
const dirPath = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, session.toString());
|
|
43
|
+
if (!fs.existsSync(dirPath)) {
|
|
44
|
+
console.warn(`[AGENDA] Directory not found for session ${session} → ${dirPath}`);
|
|
45
|
+
return null;
|
|
49
46
|
}
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
const
|
|
54
|
-
|
|
55
|
-
|
|
47
|
+
const pattern = new RegExp(`^RUSN${yyyymmdd}IDS-(MATIN|APRES-MIDI|SOIR)\\.json$`);
|
|
48
|
+
const ALLOWED_SLOTS = new Set(["MATIN", "APRES-MIDI", "SOIR"]);
|
|
49
|
+
try {
|
|
50
|
+
const files = fs.readdirSync(dirPath);
|
|
51
|
+
const matched = files.filter((f) => pattern.test(f));
|
|
52
|
+
if (matched.length === 0) {
|
|
53
|
+
return null;
|
|
56
54
|
}
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
await downloadCompteRendu(debatMonoUrl, compteRenduPath);
|
|
64
|
-
if (options["parseDebats"]) {
|
|
65
|
-
await parseAndWriteJSON(transformedComptesRendusSessionDir, compteRenduPath, debat);
|
|
66
|
-
}
|
|
67
|
-
}
|
|
68
|
-
catch (error) {
|
|
69
|
-
console.error(error);
|
|
70
|
-
}
|
|
55
|
+
const found = new Set();
|
|
56
|
+
for (const name of matched) {
|
|
57
|
+
const m = name.match(pattern);
|
|
58
|
+
const raw = (m?.[1] ?? "");
|
|
59
|
+
if (ALLOWED_SLOTS.has(raw))
|
|
60
|
+
found.add(raw);
|
|
71
61
|
}
|
|
62
|
+
const slots = Array.from(found);
|
|
63
|
+
if (slots.length === 0) {
|
|
64
|
+
return null;
|
|
65
|
+
}
|
|
66
|
+
return { filePath: dirPath, slots };
|
|
72
67
|
}
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
const compteRenduUrl = `${SENAT_COMPTE_RENDU_URL_ROOT}/${debatUrl}`;
|
|
76
|
-
if (!options["silent"]) {
|
|
77
|
-
console.log(`Downloading Compte-Rendu ${compteRenduUrl}…`);
|
|
68
|
+
catch {
|
|
69
|
+
return null;
|
|
78
70
|
}
|
|
79
|
-
|
|
71
|
+
}
|
|
72
|
+
async function downloadCriZip(zipPath) {
|
|
73
|
+
if (!options["silent"])
|
|
74
|
+
console.log(`Downloading CRI zip ${CRI_ZIP_URL}…`);
|
|
75
|
+
const response = await fetchWithRetry(CRI_ZIP_URL);
|
|
80
76
|
if (!response.ok) {
|
|
81
77
|
if (response.status === 404) {
|
|
82
|
-
console.warn(`
|
|
78
|
+
console.warn(`CRI zip ${CRI_ZIP_URL} not found`);
|
|
79
|
+
return;
|
|
83
80
|
}
|
|
84
|
-
|
|
85
|
-
throw new CompteRenduError(String(response.status), compteRenduUrl);
|
|
86
|
-
}
|
|
87
|
-
return;
|
|
81
|
+
throw new CompteRenduError(String(response.status), CRI_ZIP_URL);
|
|
88
82
|
}
|
|
89
|
-
const
|
|
90
|
-
|
|
91
|
-
return;
|
|
92
|
-
fs.writeFileSync(compteRenduPath, Buffer.from(compteRenduContent));
|
|
93
|
-
}
|
|
94
|
-
async function parseAndWriteJSON(transformedComptesRendusSessionDir, compteRenduPath, debat) {
|
|
83
|
+
const buf = Buffer.from(await response.arrayBuffer());
|
|
84
|
+
await fs.writeFile(zipPath, buf);
|
|
95
85
|
if (!options["silent"]) {
|
|
96
|
-
|
|
86
|
+
const mb = (buf.length / (1024 * 1024)).toFixed(1);
|
|
87
|
+
console.log(`[CRI] Downloaded ${mb} MB → ${zipPath}`);
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
/**
 * Extracts every `dYYYYMMDD.xml` entry of the CRI archive into
 * `<originalRoot>/<session>/`, where the session is derived from the date
 * embedded in the file name.
 *
 * Entries that are not XML, whose base name does not match `dYYYYMMDD.xml`,
 * or whose date cannot be parsed are ignored. Only the base name of an entry
 * is used for the output path, so directories inside the archive are flattened.
 *
 * @param zipPath Path of the downloaded CRI zip archive.
 * @param originalRoot Root folder receiving one sub-folder per session.
 * @returns Number of XML files extracted.
 * @throws Whatever the zip reader or the file system raises; the archive
 *         handle is closed before the error is propagated.
 */
async function extractAndDistributeXmlBySession(zipPath, originalRoot) {
    const zip = new StreamZip.async({ file: zipPath });
    let count = 0;
    try {
        const entries = await zip.entries();
        for (const entryName of Object.keys(entries)) {
            if (!entryName.toLowerCase().endsWith(".xml"))
                continue;
            // ex: d20231005.xml
            const base = path.basename(entryName);
            const m = base.match(/^d(\d{8})\.xml$/i);
            if (!m)
                continue;
            const yyyymmdd = m[1];
            const dt = parseYYYYMMDD(yyyymmdd);
            if (!dt)
                continue;
            const session = sessionStartYearFromDate(dt);
            const destDir = path.join(originalRoot, String(session));
            await fs.ensureDir(destDir);
            const outPath = path.join(destDir, base);
            await zip.extract(entryName, outPath);
            count++;
        }
    }
    catch (error) {
        // Release the file handle without letting a close failure mask the original error.
        await zip.close().catch(() => undefined);
        throw error;
    }
    await zip.close();
    return count;
}
|
|
116
|
+
/**
 * Downloads the global CRI archive, distributes its XML files into one folder
 * per session and, when `options.parseDebats` is set, converts each XML file
 * into one JSON file per detected slot and links it into the grouped agenda.
 *
 * Steps:
 *  1. Reset `<dataDir>/<COMPTES_RENDUS_FOLDER>` and create its sub-folders.
 *  2. Download `cri.zip` into `dataDir` and extract the XML files by session.
 *  3. (parseDebats only) For each session returned by `getSessionsFromStart`,
 *     parse every `dYYYYMMDD.xml`, write `CRSSN<date>-<slot>.json` and call
 *     `linkCriSlotIntoAgendaGrouped`.
 *
 * @param dataDir Root data directory.
 * @param options Reads `parseDebats` (boolean) and `fromSession`. Note that this
 *        parameter shadows the module-level CLI `options` inside this function.
 * @returns Nothing; the results are written to disk.
 */
export async function retrieveCriXmlDump(dataDir, options = {}) {
    const root = path.join(dataDir, COMPTES_RENDUS_FOLDER);
    ensureAndClearDir(root);
    const originalRoot = path.join(root, DATA_ORIGINAL_FOLDER);
    fs.ensureDirSync(originalRoot);
    const transformedRoot = path.join(root, DATA_TRANSFORMED_FOLDER);
    if (options["parseDebats"])
        fs.ensureDirSync(transformedRoot);
    const sessions = getSessionsFromStart(options["fromSession"]);
    // 1) Download the global ZIP, then distribute its XML files by session
    const zipPath = path.join(dataDir, "cri.zip");
    console.log("[CRI] Downloading global CRI zip…");
    await downloadCriZip(zipPath);
    // downloadCriZip returns without writing anything on a 404: do not try to
    // open an archive that is not there.
    if (!(await fs.pathExists(zipPath))) {
        console.warn(`[CRI] No archive found at ${zipPath} → nothing to extract.`);
        return;
    }
    console.log("[CRI] Extracting + distributing XMLs by session…");
    // Remove XML files left in the session folders before re-extracting.
    for (const session of sessions) {
        const dir = path.join(originalRoot, String(session));
        if (await fs.pathExists(dir)) {
            for (const f of await fs.readdir(dir))
                if (/\.xml$/i.test(f))
                    await fs.remove(path.join(dir, f));
        }
    }
    const n = await extractAndDistributeXmlBySession(zipPath, originalRoot);
    if (n === 0) {
        console.warn("[CRI] No XML extracted. Archive empty or layout changed?");
    }
    else {
        console.log(`[CRI] Distributed ${n} XML file(s) into session folders.`);
    }
    if (!options["parseDebats"]) {
        console.log("[CRI] parseDebats not requested → done.");
        return;
    }
    for (const session of sessions) {
        const originalSessionDir = path.join(originalRoot, String(session));
        if (!(await fs.pathExists(originalSessionDir))) {
            continue;
        }
        // Lexicographic sort of dYYYYMMDD.xml names is also chronological.
        const xmlFiles = (await fs.readdir(originalSessionDir))
            .filter((f) => /^d\d{8}\.xml$/i.test(f))
            .sort();
        const transformedSessionDir = path.join(transformedRoot, String(session));
        // parseDebats is necessarily set here (early return above).
        await fs.ensureDir(transformedSessionDir);
        for (const f of xmlFiles) {
            // "d20231005.xml" → "20231005"
            const yyyymmdd = f.slice(1, 9);
            const xmlPath = path.join(originalSessionDir, f);
            // 1) Deduce slot(s) from the agenda if it exists
            const agendaInfo = loadAgendaSPSlotsForDate(dataDir, yyyymmdd, session);
            const firstSlotOfDay = pickFirstSlotOfDay(agendaInfo?.slots ?? []);
            // 2) Detect slots from CRI content
            let slotsInCri = [];
            try {
                const raw = await fs.readFile(xmlPath, "utf8");
                const $ = cheerio.load(raw, { xml: false });
                // Position of every element under <body>, in document order.
                const order = $("body *").toArray();
                const idx = new Map(order.map((el, i) => [el, i]));
                const intervals = computeIntervalsBySlot($, idx, firstSlotOfDay ?? undefined);
                const uniq = new Set();
                for (const iv of intervals)
                    if (iv.slot && iv.slot !== "UNKNOWN")
                        uniq.add(iv.slot);
                slotsInCri = Array.from(uniq);
            }
            catch (e) {
                console.warn(`[CRI] [${session}] Cannot read/parse ${f}:`, e);
                continue;
            }
            // No slot detected: fall back to the agenda's first slot, else "MATIN".
            if (slotsInCri.length === 0) {
                slotsInCri = [firstSlotOfDay ?? "MATIN"];
            }
            // 3) Parse & write each slot
            for (const slot of slotsInCri) {
                const outName = `CRSSN${yyyymmdd}-${slot}.json`;
                const cr = await parseCompteRenduSlotFromFile(xmlPath, slot, firstSlotOfDay ?? slot);
                if (!cr) {
                    console.warn(`[CRI] [${session}] Empty or no points for ${yyyymmdd} (${slot}) → skip`);
                    continue;
                }
                const outDir = transformedSessionDir;
                await fs.ensureDir(outDir);
                const outPath = path.join(outDir, outName);
                await fs.writeJSON(outPath, cr, { spaces: 2 });
                // Linking is best-effort: a failure must not abort the conversion.
                try {
                    await linkCriSlotIntoAgendaGrouped(dataDir, yyyymmdd, slot, cr.uid, cr, session);
                }
                catch (e) {
                    console.warn(`[AGENDA] [${session}] Could not link CR into grouped for ${yyyymmdd} ${slot}:`, e);
                }
            }
        }
    }
}
|
|
104
209
|
async function main() {
|
|
105
210
|
const dataDir = options["dataDir"];
|
|
106
211
|
assert(dataDir, "Missing argument: data directory");
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
await retrieveComptesRendus(dataDir, sessions);
|
|
212
|
+
console.time("CRI processing time");
|
|
213
|
+
await retrieveCriXmlDump(dataDir, options);
|
|
110
214
|
if (!options["silent"]) {
|
|
111
|
-
console.timeEnd("
|
|
215
|
+
console.timeEnd("CRI processing time");
|
|
112
216
|
}
|
|
113
217
|
}
|
|
114
218
|
main()
|
|
115
219
|
.then(() => process.exit(0))
|
|
116
220
|
.catch((error) => {
|
|
117
|
-
console.
|
|
221
|
+
console.error(error);
|
|
118
222
|
process.exit(1);
|
|
119
223
|
});
|
|
224
|
+
/**
 * Records the uid of a compte-rendu on the grouped agenda entry of the same
 * date and slot, creating that entry (and the file) when it does not exist.
 *
 * The grouped file is `<dataDir>/<AGENDA_FOLDER>/<DATA_TRANSFORMED_FOLDER>/<session>/RUSN<yyyymmdd>IDS-<slot>.json`
 * and is expected to hold a JSON array; anything else is replaced by a new array.
 *
 * @param dataDir Root data directory.
 * @param yyyymmdd Sitting date as "YYYYMMDD".
 * @param slot Slot of the sitting.
 * @param crUid Uid of the compte-rendu to reference.
 * @param cr Parsed compte-rendu; its `metadonnees.sommaire` is used to derive titre/objet.
 * @param session Session identifier used as folder name.
 */
async function linkCriSlotIntoAgendaGrouped(dataDir, yyyymmdd, slot, crUid, cr, session) {
    const groupedDir = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, session.toString());
    fs.ensureDirSync(groupedDir);
    const groupedPath = path.join(groupedDir, `RUSN${yyyymmdd}IDS-${slot}.json`);
    // Load the existing groups, falling back to an empty list when the file
    // is absent, unreadable or not an array.
    const readExistingGroups = () => {
        if (!fs.existsSync(groupedPath)) {
            return [];
        }
        try {
            const parsed = JSON.parse(fs.readFileSync(groupedPath, "utf8"));
            return Array.isArray(parsed) ? parsed : [];
        }
        catch (e) {
            console.warn(`[AGENDA] unreadable grouped JSON → ${groupedPath} (${e}) → recreating`);
            return [];
        }
    };
    const groups = readExistingGroups();
    // Groups already registered for this slot; only the first one gets linked.
    const candidates = groups.filter((group) => group?.slot === slot);
    if (candidates.length > 1) {
        console.warn(`[AGENDA] multiple groups for ${yyyymmdd} ${slot} in ${groupedPath} → linking the first`);
    }
    const target = candidates[0] ?? null;
    const dateISO = [yyyymmdd.slice(0, 4), yyyymmdd.slice(4, 6), yyyymmdd.slice(6, 8)].join("-");
    const { titre: dTitre, objet: dObjet } = deriveTitreObjetFromSommaire(cr?.metadonnees?.sommaire, slot);
    if (target) {
        target.compteRenduRefUid = crUid;
    }
    else {
        // No agenda group for this slot: create a minimal one from the compte-rendu.
        groups.push({
            uid: makeGroupUid(dateISO, slot),
            chambre: "SN",
            date: dateISO,
            slot,
            type: "Séance publique",
            startTime: null,
            endTime: null,
            captationVideo: false,
            titre: dTitre,
            objet: dObjet || "",
            reunions: [],
            compteRenduRefUid: crUid,
        });
    }
    await fs.writeJSON(groupedPath, groups, { spaces: 2 });
    console.log(`[AGENDA] Linked CR ${crUid} → ${path.basename(groupedPath)} [${slot}]`);
}
|
|
@@ -79,6 +79,8 @@ async function copyToSenat(dataset, dataDir, options) {
|
|
|
79
79
|
const schemaDumpFile = path.join(dataDir, `${dataset.database}_schema_dump.sql`);
|
|
80
80
|
// Write the header and then stream the rest of the SQL file
|
|
81
81
|
const schemaSqlWriter = fs.createWriteStream(schemaDumpFile, { encoding: "utf8" });
|
|
82
|
+
// Add CREATE SCHEMA statement at the top
|
|
83
|
+
schemaSqlWriter.write(`CREATE SCHEMA IF NOT EXISTS ${dataset.database};\n`);
|
|
82
84
|
const lineReader = readline.createInterface({
|
|
83
85
|
input: fs.createReadStream(sqlFilePath, { encoding: "utf8" }),
|
|
84
86
|
crlfDelay: Infinity,
|
|
@@ -110,9 +112,9 @@ async function copyToSenat(dataset, dataDir, options) {
|
|
|
110
112
|
schemaSqlWriter.on("finish", () => {
|
|
111
113
|
try {
|
|
112
114
|
execSync(`${options["sudo"] ? `sudo -u ${options["sudo"]} ` : ""}psql --quiet -d senat -f ${schemaDumpFile}`, {
|
|
113
|
-
cwd: dataDir,
|
|
114
115
|
env: process.env,
|
|
115
116
|
encoding: "utf-8",
|
|
117
|
+
stdio: ["ignore", "ignore", "pipe"],
|
|
116
118
|
});
|
|
117
119
|
}
|
|
118
120
|
finally {
|
|
@@ -7,6 +7,7 @@ import path from "path";
|
|
|
7
7
|
import { AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, iterLoadSenatAgendas, } from "../loaders";
|
|
8
8
|
import { getSessionsFromStart } from "../types/sessions";
|
|
9
9
|
import { commonOptions } from "./shared/cli_helpers";
|
|
10
|
+
import { formatYYYYMMDD, makeReunionUid } from "../utils/reunion_grouping";
|
|
10
11
|
// ===================== Constants =====================
|
|
11
12
|
const MATCH_THRESHOLD = 0.60;
|
|
12
13
|
const MAX_CANDIDATES = 15;
|
|
@@ -136,15 +137,6 @@ function toFRDate(dateYYYYMMDD) {
|
|
|
136
137
|
const [y, m, d] = dateYYYYMMDD.split("-");
|
|
137
138
|
return `${d}/${m}/${y}`; // DD/MM/YYYY
|
|
138
139
|
}
|
|
139
|
-
function formatYYYYMMDD(dateYYYYMMDD) {
|
|
140
|
-
const [y, m, d] = dateYYYYMMDD.split("-");
|
|
141
|
-
return `${y}${m}${d}`;
|
|
142
|
-
}
|
|
143
|
-
function makeReunionUid(agenda) {
|
|
144
|
-
// agenda.date is expected as "YYYY-MM-DD"
|
|
145
|
-
const ymd = agenda.date ? formatYYYYMMDD(agenda.date) : "00000000";
|
|
146
|
-
return `${ymd}-${agenda.id}`;
|
|
147
|
-
}
|
|
148
140
|
function extractCandidatesFromSearchHtml(html) {
|
|
149
141
|
const out = [];
|
|
150
142
|
const re = /href="\/?video\.(\d+)_([a-z0-9]+)\.[^"]+"/gi;
|
package/lib/types/agenda.d.ts
CHANGED
|
@@ -12,6 +12,23 @@ export interface AgendaEvent {
|
|
|
12
12
|
captationVideo: boolean;
|
|
13
13
|
urlDossierSenat: string | null;
|
|
14
14
|
quantieme: string | null;
|
|
15
|
-
|
|
16
|
-
|
|
15
|
+
}
|
|
16
|
+
export type TimeSlot = "MATIN" | "APRES-MIDI" | "SOIR" | "UNKNOWN";
|
|
17
|
+
export interface GroupedReunion {
|
|
18
|
+
uid: string;
|
|
19
|
+
chambre: "SN";
|
|
20
|
+
date: string;
|
|
21
|
+
slot?: TimeSlot;
|
|
22
|
+
startTime: string | null;
|
|
23
|
+
endTime: string | null;
|
|
24
|
+
captationVideo: boolean;
|
|
25
|
+
titre: string;
|
|
26
|
+
type: string;
|
|
27
|
+
organe?: string;
|
|
28
|
+
objet?: string;
|
|
29
|
+
lieu?: string;
|
|
30
|
+
reunions: AgendaEvent[];
|
|
31
|
+
compteRenduRefUid?: string;
|
|
32
|
+
urlVideo?: string;
|
|
33
|
+
timecodeDebutVideo?: number;
|
|
17
34
|
}
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
/**
 * Splits the document into consecutive position intervals and assigns a slot
 * to each of them.
 *
 * Cut points are position 0, every `<a name="su…">` anchor known to `idx`, and
 * the total number of elements under `<body>`. The first interval gets the
 * initial slot (`firstSlotOfDay`, else the slot of the opening time found in
 * the text, else "MATIN"); each later interval gets the slot of its anchor's
 * time when the anchor name carries one, otherwise the previous interval's slot.
 *
 * @param $ Cheerio handle on the loaded document.
 * @param idx Map from element to its position (callers are expected to build it
 *        from the `$("body *")` order).
 * @param firstSlotOfDay Optional slot forced for the first interval.
 * @returns Array of `{ slot, start, end }` with `start < end`.
 */
export function computeIntervalsBySlot($, idx, firstSlotOfDay) {
    const elementCount = $("body *").toArray().length;
    // Collect one cut per "SU…" anchor, in document order of the selection.
    const anchorCuts = [];
    $('a[name]').each((_, anchor) => {
        const anchorName = (anchor.attribs?.["name"] || "").trim();
        if (!/^su/i.test(anchorName)) {
            return;
        }
        const position = idx.get(anchor);
        if (position == null) {
            return;
        }
        // "SU1620" -> "16:20" (undefined when the name carries no time)
        anchorCuts.push({ pos: position, hhmm: hhmmFromSuName(anchorName) });
    });
    const cuts = [{ pos: 0, hhmm: undefined }, ...anchorCuts];
    cuts.sort((left, right) => left.pos - right.pos);
    // Sentinel closing the last interval.
    cuts.push({ pos: elementCount, hhmm: undefined });
    let startingSlot = firstSlotOfDay;
    if (!startingSlot) {
        const openingTime = extractOpeningHHMM($);
        startingSlot = openingTime ? slotOfHHMM(openingTime) : undefined;
    }
    if (!startingSlot) {
        startingSlot = "MATIN";
    }
    const intervals = [];
    let previousSlot = startingSlot;
    cuts.slice(0, -1).forEach((cut, i) => {
        const start = cut.pos;
        const end = cuts[i + 1].pos;
        // Empty span (two cuts at the same position): nothing to emit.
        if (end <= start) {
            return;
        }
        let slot;
        if (i === 0) {
            slot = startingSlot;
        }
        else if (cut.hhmm) {
            slot = slotOfHHMM(cut.hhmm);
        }
        else {
            slot = previousSlot;
        }
        intervals.push({ slot, start, end });
        previousSlot = slot;
    });
    return intervals;
}
|
|
39
|
+
/**
 * Extracts the time encoded in an anchor name of the form "SUhhmm".
 *
 * @param name Anchor name, e.g. "SU1620" (case-insensitive).
 * @returns The time as "hh:mm" (e.g. "16:20"), or undefined when the name is
 *          not exactly "SU" followed by four digits.
 */
function hhmmFromSuName(name) {
    const parsed = name.match(/^SU(\d{2})(\d{2})$/i);
    return parsed ? `${parsed[1]}:${parsed[2]}` : undefined;
}
|
|
45
|
+
/**
 * Maps a time of day to a slot.
 *
 * Accepts both the "HH:MM" form produced by hhmmFromSuName and the compact
 * "HHMM" form produced by parseFrenchClockToHHMM. Splitting a compact value
 * on ":" used to yield NaN, which fell through every comparison and was
 * reported as "SOIR".
 *
 * @param hhmm Time as "HH:MM" or "HHMM"; a bare hour ("15") is read as HH:00.
 * @returns "MATIN" before 12:00, "APRES-MIDI" before 18:30, "SOIR" otherwise.
 *          A missing or unparsable time yields "MATIN".
 */
function slotOfHHMM(hhmm) {
    if (!hhmm)
        return "MATIN";
    const text = String(hhmm).trim();
    let hours;
    let minutes;
    const compact = text.match(/^(\d{1,2})(\d{2})$/);
    if (compact) {
        // "1500" / "930" → hours and the trailing two digits as minutes
        hours = Number(compact[1]);
        minutes = Number(compact[2]);
    }
    else {
        const [h, m = "0"] = text.split(":");
        hours = Number(h);
        minutes = Number(m);
    }
    const v = hours + minutes / 60;
    if (Number.isNaN(v))
        return "MATIN";
    if (v < 12)
        return "MATIN";
    if (v < 18.5)
        return "APRES-MIDI";
    return "SOIR";
}
|
|
56
|
+
/**
 * Looks for a parenthetical such as "(La séance est ouverte à quinze heures.)"
 * and converts the time it states.
 *
 * Only the first `.info_entre_parentheses` element mentioning
 * "séance est ouverte" is considered.
 *
 * @param $ Cheerio handle on the loaded document.
 * @returns The value produced by parseFrenchClockToHHMM for the stated time
 *          (compact "HHMM", e.g. "1500"), or undefined when no opening time
 *          is found or it cannot be parsed.
 */
function extractOpeningHHMM($) {
    const openingText = $("span.info_entre_parentheses, .info_entre_parentheses")
        .toArray()
        .map((el) => ($(el).text() || "").replace(/\s+/g, " ").trim())
        .find((candidate) => /\bs[eé]ance est ouverte\b/i.test(candidate));
    if (!openingText) {
        return undefined;
    }
    // Keep what follows "ouverte à" up to the closing parenthesis.
    const stated = openingText.match(/\(.*?ouverte\s+à\s+([^)]+?)\)/i)?.[1];
    return stated ? parseFrenchClockToHHMM(stated) : undefined;
}
|
|
71
|
+
/**
 * Converts a French clock expression such as "quinze heures trente",
 * "15 heures 30" or "dix-sept heures moins le quart" into compact "HHMM".
 *
 * The input is lower-cased and stripped of diacritics first. A numeric form
 * ("15 heures 30") takes precedence; hours are clamped to 0–24 and minutes to
 * 0–59. Otherwise the hour is read from the words preceding "heure(s)" and the
 * minutes from "et demie", "et quart", "moins le quart" (previous hour, 45) or
 * a spelled-out multiple of five following "heure(s)".
 *
 * @param input Text stating a time.
 * @returns The time as "HHMM", or undefined when the input is empty or no hour
 *          can be recognised.
 */
function parseFrenchClockToHHMM(input) {
    const twoDigits = (value) => String(value).padStart(2, "0");
    const clamp = (value, low, high) => Math.min(high, Math.max(low, value));
    const text = (input || "")
        .toLowerCase()
        .normalize("NFKD")
        .replace(/[\u0300-\u036f]/g, "")
        .trim();
    if (text === "") {
        return undefined;
    }
    // Numeric form: "15 heures", "15 heures 30"
    const numeric = text.match(/(\d{1,2})\s*heures?(?:\s*(\d{1,2}))?/);
    if (numeric) {
        const hh = clamp(parseInt(numeric[1], 10), 0, 24);
        const mm = numeric[2] ? clamp(parseInt(numeric[2], 10), 0, 59) : 0;
        return twoDigits(hh) + twoDigits(mm);
    }
    // Spelled-out hours, with and without hyphens.
    const HOUR_WORDS = new Map(Object.entries({
        "zero": 0, "une": 1, "un": 1, "deux": 2, "trois": 3, "quatre": 4, "cinq": 5, "six": 6,
        "sept": 7, "huit": 8, "neuf": 9, "dix": 10, "onze": 11, "douze": 12, "treize": 13,
        "quatorze": 14, "quinze": 15, "seize": 16, "dix-sept": 17, "dix sept": 17, "dix-huit": 18,
        "dix huit": 18, "dix-neuf": 19, "dix neuf": 19, "vingt": 20, "vingt et une": 21,
        "vingt-et-une": 21, "vingt et un": 21, "vingt-et-un": 21, "vingt-deux": 22, "vingt deux": 22,
        "vingt-trois": 23, "vingt trois": 23, "vingt-quatre": 24, "vingt quatre": 24,
    }));
    const spelledHour = text.match(/([a-z\- ]+?)\s*heures?/);
    if (!spelledHour) {
        return undefined;
    }
    const hourWord = spelledHour[1].trim();
    // Second lookup tolerates runs of whitespace inside the hour words.
    const hour = HOUR_WORDS.get(hourWord) ?? HOUR_WORDS.get(hourWord.replace(/\s+/g, " "));
    if (hour == null) {
        return undefined;
    }
    if (/\bet (demie|demi)\b/.test(text)) {
        return twoDigits(hour) + twoDigits(30);
    }
    if (/\bet quart\b/.test(text)) {
        return twoDigits(hour) + twoDigits(15);
    }
    if (/\bmoins le quart\b/.test(text)) {
        // A quarter to H is (H - 1):45, wrapping around midnight.
        return twoDigits((hour + 23) % 24) + twoDigits(45);
    }
    // Spelled-out minutes following "heure(s)", multiples of five only.
    const MINUTE_WORDS = new Map(Object.entries({
        "cinq": 5, "dix": 10, "quinze": 15, "vingt": 20, "vingt-cinq": 25, "vingt cinq": 25,
        "trente": 30, "trente-cinq": 35, "trente cinq": 35, "quarante": 40, "quarante-cinq": 45,
        "quarante cinq": 45, "cinquante": 50, "cinquante-cinq": 55, "cinquante cinq": 55,
    }));
    const tail = text.match(/heures?\s+([a-z\- ]+?)(?:[).,;]|$)/);
    const spelledMinutes = tail ? MINUTE_WORDS.get(tail[1].trim()) : undefined;
    return twoDigits(hour) + twoDigits(spelledMinutes ?? 0);
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import { AgendaEvent, GroupedReunion, TimeSlot } from "../types/agenda";
|
|
2
|
+
export declare function groupNonSPByTypeOrganeHour(events: AgendaEvent[]): Record<"IDC" | "IDM" | "IDO" | "IDI", GroupedReunion[]>;
|
|
3
|
+
export declare function groupSeancePubliqueBySlot(events: AgendaEvent[]): GroupedReunion[];
|
|
4
|
+
export declare function makeGroupUid(date: string, slot: TimeSlot): string;
|
|
5
|
+
export declare function formatYYYYMMDD(dateYYYYMMDD: string): string;
|
|
6
|
+
export declare function makeReunionUid(agenda: AgendaEvent): string;
|