@tricoteuses/senat 2.11.0 → 2.11.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/loaders.d.ts +5 -0
- package/lib/loaders.js +14 -9
- package/lib/model/commission.d.ts +5 -0
- package/lib/model/commission.js +263 -0
- package/lib/model/{compte_rendu.js → seance.js} +47 -28
- package/lib/model/util.d.ts +1 -0
- package/lib/model/util.js +19 -1
- package/lib/scripts/retrieve_cr_commission.d.ts +1 -0
- package/lib/scripts/retrieve_cr_commission.js +291 -0
- package/lib/scripts/{retrieve_comptes_rendus.js → retrieve_cr_seance.js} +1 -1
- package/lib/utils/cr_spliting.d.ts +22 -1
- package/lib/utils/cr_spliting.js +273 -12
- package/lib/utils/reunion_grouping.d.ts +3 -0
- package/lib/utils/reunion_grouping.js +1 -1
- package/package.json +3 -2
- package/lib/raw_types/db.d.ts +0 -11389
- package/lib/raw_types/db.js +0 -5
- /package/lib/model/{compte_rendu.d.ts → seance.d.ts} +0 -0
- /package/lib/scripts/{retrieve_comptes_rendus.d.ts → retrieve_cr_seance.d.ts} +0 -0
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
import fs from "fs-extra";
|
|
2
|
+
import assert from "assert";
|
|
3
|
+
import path from "path";
|
|
4
|
+
import * as cheerio from "cheerio";
|
|
5
|
+
import { COMMISSION_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER } from "../loaders";
|
|
6
|
+
import { createCommissionGroupIfMissing, loadCommissionAgendaForDate, parseCommissionMetadataFromHtml, } from "../utils/cr_spliting";
|
|
7
|
+
import { parseCommissionCRFromFile } from "../model/commission";
|
|
8
|
+
import commandLineArgs from "command-line-args";
|
|
9
|
+
import { commonOptions } from "./shared/cli_helpers";
|
|
10
|
+
import { sessionStartYearFromDate } from "../model/seance";
|
|
11
|
+
import { getSessionsFromStart } from "../types/sessions";
|
|
12
|
+
import { ensureAndClearDir, fetchWithRetry } from "./shared/util";
|
|
13
|
+
/**
 * Error thrown when a commission report page cannot be downloaded.
 *
 * The message embeds both the failing URL and the underlying reason
 * (the caller in this file passes the HTTP status code as `message`).
 */
class CommissionCRDownloadError extends Error {
    /**
     * @param message Short reason for the failure (e.g. an HTTP status code).
     * @param url URL that was being retrieved.
     */
    constructor(message, url) {
        super(`An error occurred while retrieving Commission CR ${url}: ${message}`);
        // Without this, logs and stack traces label the error with the generic "Error".
        this.name = "CommissionCRDownloadError";
    }
}
|
|
18
|
+
// Command-line options: the shared options plus the download tuning knobs
// and the flag read as options["parseDebats"] by retrieveCommissionCRs.
const optionsDefinitions = [
    ...commonOptions,
    {
        name: "concurrency",
        type: Number,
        defaultValue: 6,
        help: "Max parallel downloads",
    },
    {
        name: "politenessMs",
        type: Number,
        defaultValue: 150,
        help: "Delay per worker (ms)",
    },
    {
        name: "parseDebats",
        type: Boolean,
        help: "parse and convert comptes-rendus des débats into JSON",
    },
];
// Parsed once at module load; read by main() and passed to retrieveCommissionCRs().
const options = commandLineArgs(optionsDefinitions);
|
|
29
|
+
// Hub pages listing the weekly reports of each commission, keyed by commission.
// Every commission has a "current" page and an "_archives" page; the page slug
// matches the key except for "amenagement-developpement-durable".
const COMMISSION_HUBS = Object.fromEntries([
    ["affaires-etrangeres", "affaires-etrangeres"],
    ["affaires-economiques", "affaires-economiques"],
    ["amenagement-developpement-durable", "cadre-de-vie-et-developpement-durable"],
    ["culture", "culture"],
    ["finances", "finances"],
    ["lois", "lois"],
    ["affaires-sociales", "affaires-sociales"],
    ["affaires-europeennes", "affaires-europeennes"],
].map(([commissionKey, pageSlug]) => [
    commissionKey,
    [
        `https://www.senat.fr/compte-rendu-commissions/${pageSlug}.html`,
        `https://www.senat.fr/compte-rendu-commissions/${pageSlug}_archives.html`,
    ],
]));
|
|
63
|
+
/**
 * Fetches one hub page and collects the links to weekly commission reports.
 *
 * A link qualifies when its href ends with
 * `/compte-rendu-commissions/<8 digits>/<slug>.html`. Relative hrefs are
 * resolved against the hub URL.
 *
 * @param hubUrl URL of the hub page to scan.
 * @returns Absolute URLs, de-duplicated, in order of first appearance;
 *          an empty array when the hub response is not OK.
 */
async function harvestWeeklyLinksFromHub(hubUrl) {
    const response = await fetchWithRetry(hubUrl);
    if (!response.ok) {
        return [];
    }
    const $ = cheerio.load(await response.text());
    const weeklyLinkPattern = /\/compte-rendu-commissions\/(\d{8})\/([a-z0-9\-]+)\.html$/i;
    // A Set keeps insertion order, so the first occurrence of each URL wins.
    const uniqueUrls = new Set();
    for (const anchor of $("a[href]").toArray()) {
        const href = ($(anchor).attr("href") || "").trim();
        if (!weeklyLinkPattern.test(href)) {
            continue;
        }
        uniqueUrls.add(href.startsWith("http") ? href : new URL(href, hubUrl).toString());
    }
    return [...uniqueUrls];
}
|
|
80
|
+
/**
 * Walks every hub page in COMMISSION_HUBS and lists the weekly report pages
 * that belong to session `fromSession` or later.
 *
 * The session of a page is derived from the date in its URL: months from
 * October onwards belong to the session starting that year, earlier months to
 * the session that started the previous year.
 *
 * A hub that fails to load is logged and skipped; the others are still used.
 *
 * @param fromSession First session (start year) to keep.
 * @returns One `{ url, yyyymmdd, commissionKey }` entry per distinct URL,
 *          sorted by date string.
 */
async function discoverCommissionWeeklyPages(fromSession) {
    const weeklyPagePattern = /\/compte-rendu-commissions\/(\d{8})\/([a-z0-9\-]+)\.html$/i;
    // Keyed by URL: the same page can be listed by several hubs (e.g. the current
    // and the archive page). Keeping it once avoids two download jobs targeting
    // the same output file.
    const byUrl = new Map();
    for (const [commissionKey, hubs] of Object.entries(COMMISSION_HUBS)) {
        for (const hubUrl of hubs) {
            try {
                const links = await harvestWeeklyLinksFromHub(hubUrl);
                for (const url of links) {
                    if (byUrl.has(url))
                        continue;
                    const m = url.match(weeklyPagePattern);
                    if (!m)
                        continue;
                    const yyyymmdd = m[1];
                    const year = Number(yyyymmdd.slice(0, 4));
                    const month = Number(yyyymmdd.slice(4, 6));
                    const session = month >= 10 ? year : year - 1;
                    if (session < fromSession)
                        continue;
                    byUrl.set(url, { url, yyyymmdd, commissionKey });
                }
            }
            catch (e) {
                console.warn(`[COM-CR][hub-fail] ${hubUrl} → ${e?.message ?? e}`);
            }
        }
    }
    return [...byUrl.values()].sort((a, b) => a.yyyymmdd.localeCompare(b.yyyymmdd));
}
|
|
106
|
+
/**
 * Converts a clock time written "HH:MM" (or "H:MM") into the compact "HHMM" form.
 *
 * @param hhmm Time string; surrounding whitespace is ignored.
 * @returns The four-digit "HHMM" string, or null when the input is empty or
 *          does not look like hours:minutes.
 */
function toHourShort(hhmm) {
    if (!hhmm)
        return null;
    // Accept an unpadded hour ("9:30") and pad it, so the result is always 4 digits.
    const m = String(hhmm).trim().match(/^(\d{1,2}):(\d{2})$/);
    return m ? `${m[1].padStart(2, "0")}${m[2]}` : null;
}
|
|
112
|
+
/**
 * Converts an "HH:MM" clock time into minutes since midnight.
 *
 * Missing or unparsable components count as 0, so "7" gives 420 and "" gives 0.
 *
 * @param hhmm Time string with ":" between hours and minutes.
 * @returns Minutes since midnight; 0 when the input is not a string.
 */
function timeToMinutes(hhmm) {
    // Guard against null/undefined/non-string input instead of throwing on .split().
    if (typeof hhmm !== "string")
        return 0;
    const [h, m] = hhmm.trim().split(":").map((n) => parseInt(n, 10));
    return (h || 0) * 60 + (m || 0);
}
|
|
116
|
+
/**
 * Downloads a URL and returns its body as a Buffer.
 *
 * @param url URL to download (redirects are followed).
 * @returns The response body, or null when the server answers 404.
 * @throws CommissionCRDownloadError for any other non-OK status.
 */
async function tryDownload(url) {
    const res = await fetch(url, { redirect: "follow" });
    if (!res.ok) {
        // The body is not read on this path: cancel it so the underlying
        // connection is released instead of waiting for garbage collection.
        try {
            await res.body?.cancel();
        }
        catch {
            // Best effort only; the status handling below is what matters.
        }
        if (res.status === 404)
            return null;
        throw new CommissionCRDownloadError(String(res.status), url);
    }
    const ab = await res.arrayBuffer();
    return Buffer.from(ab);
}
|
|
125
|
+
// Words that carry no signal when matching a commission name against agenda titles.
const ORGANE_STOP_WORDS = ["commission", "des", "de", "du", "d", "la", "le", "les", "et"];
// Largest gap (minutes) between a report's opening time and an agenda start
// time for the two to be treated as the same meeting.
const MAX_TIME_DELTA_MIN = 120;

/**
 * Picks the agenda group that best matches one day of a commission report.
 *
 * Strategy: (a) the group whose title contains the most organe keywords, if at
 * least one keyword matches; (b) otherwise the group whose start time is
 * closest to the report's opening time, within MAX_TIME_DELTA_MIN.
 *
 * @returns The selected group, or null when nothing matches.
 */
function pickAgendaGroup(hits, organeKeywords, openTime) {
    if (!hits.length)
        return null;
    // a) score by title and organe keywords
    if (organeKeywords.length) {
        const scored = hits
            .map((h) => {
            const t = (h.titre ?? "").toLowerCase();
            const s = organeKeywords.reduce((acc, kw) => acc + (t.includes(kw) ? 1 : 0), 0);
            return { h, s };
        })
            .sort((a, b) => b.s - a.s);
        if (scored[0]?.s > 0)
            return scored[0].h;
    }
    // b) otherwise score by time proximity
    if (openTime) {
        const candidates = hits
            .filter((h) => !!h.startTime)
            .map((h) => ({ h, d: Math.abs(timeToMinutes(h.startTime) - timeToMinutes(openTime)) }))
            .sort((a, b) => a.d - b.d);
        if (candidates[0] && candidates[0].d <= MAX_TIME_DELTA_MIN)
            return candidates[0].h;
    }
    return null;
}

/**
 * Downloads the weekly commission report pages and converts them to JSON.
 *
 * Steps:
 *  1. Clear the "original" folder, discover the weekly pages for sessions
 *     >= options.fromSession, and download them with a pool of workers.
 *  2. For every downloaded HTML file, read its metadata and, for each day it
 *     contains, look for a matching agenda group. When one is found the file
 *     is parsed and written as `<uid>.json` in the "transformed" folder;
 *     otherwise a placeholder agenda group is created.
 *
 * NOTE(review): step 2 runs whether or not `parseDebats` is set; the flag only
 * controls whether the transformed folder is cleared first. Confirm this is
 * intended.
 *
 * @param options Parsed CLI options. Reads `dataDir`, `fromSession`,
 *        `concurrency`, `politenessMs`, `parseDebats` and `silent`.
 */
async function retrieveCommissionCRs(options = {}) {
    const dataDir = options["dataDir"];
    const fromSession = Number(options["fromSession"]);
    const requestedConcurrency = Number(options["concurrency"] ?? 6);
    // A NaN length would make Array.from() build zero workers and silently
    // download nothing, so fall back to the default in that case.
    const concurrency = Number.isFinite(requestedConcurrency) ? Math.max(1, Math.floor(requestedConcurrency)) : 6;
    const politenessMs = Number(options["politenessMs"] ?? 150);
    const commissionsRootDir = path.join(dataDir, COMMISSION_FOLDER);
    const originalRoot = path.join(commissionsRootDir, DATA_ORIGINAL_FOLDER);
    const transformedRoot = path.join(commissionsRootDir, DATA_TRANSFORMED_FOLDER);
    ensureAndClearDir(originalRoot);
    const discovered = await discoverCommissionWeeklyPages(fromSession);
    console.log(`[COM-CR][discover] ${discovered.length} links (>= session ${fromSession})`);
    // One job per discovered page; files are stored per session as <yyyymmdd>.<slug>.html.
    const jobs = discovered.map(({ url, yyyymmdd }) => {
        const d = new Date(Number(yyyymmdd.slice(0, 4)), Number(yyyymmdd.slice(4, 6)) - 1, Number(yyyymmdd.slice(6, 8)));
        const session = sessionStartYearFromDate(d);
        const dir = path.join(originalRoot, String(session));
        fs.ensureDirSync(dir);
        const slug = url.replace(/^.*\/(\d{8})\/([^\/]+)\.html$/i, "$2");
        const outPath = path.join(dir, `${yyyymmdd}.${slug}.html`);
        return { url, outPath, yyyymmdd };
    });
    console.log(`[COM-CR] Downloading ${jobs.length} links → ${path.relative(process.cwd(), originalRoot)}`);
    let completed = 0, saved = 0, skipped = 0, notFound = 0;
    // Each worker pulls jobs from the shared queue until it is empty.
    const workers = Array.from({ length: concurrency }, async () => {
        let job;
        while ((job = jobs.shift())) {
            const { url, outPath, yyyymmdd } = job;
            try {
                if (await fs.pathExists(outPath)) {
                    skipped++;
                }
                else {
                    const buf = await tryDownload(url);
                    if (!buf) {
                        notFound++;
                        console.warn(`[COM-CR][404] ${url} → week=${yyyymmdd}`);
                    }
                    else {
                        await fs.writeFile(outPath, buf);
                        saved++;
                    }
                }
            }
            catch (e) {
                console.error(`[COM-CR][err] ${url} → ${e?.message || e}`);
            }
            finally {
                completed++;
                if (politenessMs > 0)
                    await new Promise((r) => setTimeout(r, politenessMs));
            }
        }
    });
    await Promise.all(workers);
    console.log(`[COM-CR] done: saved=${saved} | skipped=${skipped} | 404=${notFound} | total=${completed}`);
    const sessions = getSessionsFromStart(options["fromSession"]);
    if (options["parseDebats"])
        ensureAndClearDir(transformedRoot);
    for (const session of sessions) {
        const originalSessionDir = path.join(originalRoot, String(session));
        const transformedSessionDir = path.join(transformedRoot, String(session));
        fs.ensureDirSync(transformedSessionDir);
        if (!(await fs.pathExists(originalSessionDir)))
            continue;
        const htmlFiles = (await fs.readdir(originalSessionDir)).filter((f) => /\.html?$/i.test(f)).sort();
        for (const f of htmlFiles) {
            const htmlPath = path.join(originalSessionDir, f);
            let meta;
            try {
                const raw = await fs.readFile(htmlPath, "utf8");
                meta = parseCommissionMetadataFromHtml(raw, f);
            }
            catch (e) {
                console.warn(`[COM-CR][PRE][${session}] Cannot read/parse ${f}:`, e);
                continue;
            }
            // Significant words of the commission name, used to score agenda titles.
            const organeKeywords = (meta.organeDetected ?? meta.organeTitleRaw ?? "")
                .toLowerCase()
                .replace(/[’']/g, "'")
                .split(/\W+/)
                .filter((x) => x.length >= 3 && !ORGANE_STOP_WORDS.includes(x));
            for (const day of meta.days) {
                const yyyymmdd = day.date.replace(/-/g, "");
                const dt = new Date(Number(day.date.slice(0, 4)), Number(day.date.slice(5, 7)) - 1, Number(day.date.slice(8, 10)));
                const daySession = sessionStartYearFromDate(dt);
                const hits = await loadCommissionAgendaForDate(dataDir, yyyymmdd, daySession);
                const best = pickAgendaGroup(hits, organeKeywords, day.openTime);
                if (best) {
                    const cr = parseCommissionCRFromFile(htmlPath, best);
                    if (!cr) {
                        console.warn(`[COM-CR][TRANSFORM] parse failed for ${f} → ${best.uid}`);
                    }
                    else {
                        const fileUid = cr.uid;
                        const outPath = path.join(transformedSessionDir, `${fileUid}.json`);
                        await fs.writeJSON(outPath, cr, { spaces: 2 });
                        const npts = Array.isArray(cr.contenu.point) ? cr.contenu.point.length : cr.contenu.point ? 1 : 0;
                        if (!options["silent"]) {
                            console.log(`[COM-CR][TRANSFORM] saved ${path.basename(outPath)} (points=${npts})`);
                        }
                    }
                }
                else {
                    // No agenda entry matches this day: register a placeholder group.
                    const hourShort = toHourShort(day.openTime) ?? "NA";
                    const titreGuess = meta.organeDetected || meta.organeTitleRaw || "Commission";
                    const { uid, filePath } = await createCommissionGroupIfMissing(dataDir, day.date, meta.organeDetected ?? null, hourShort, titreGuess);
                    if (!options["silent"]) {
                        console.log(`[COM-CR][PRE-SPLIT][${session}] ${f} | ${day.date}` +
                            (day.openTime ? ` ${day.openTime}` : ``) +
                            ` → NO-MATCH → CREATED uid=${uid} file=${path.basename(filePath)}`);
                    }
                }
            }
        }
    }
}
|
|
277
|
+
/**
 * Script entry point: checks that a data directory was given, runs the
 * retrieval, and reports the elapsed time unless `silent` is set.
 */
async function main() {
    assert(options["dataDir"], "Missing argument: data directory");
    const timerLabel = "CRI processing time";
    console.time(timerLabel);
    await retrieveCommissionCRs(options);
    if (options["silent"]) {
        return;
    }
    console.timeEnd(timerLabel);
}
// Exit code 0 on success; on failure log the error and exit with code 1.
(async () => {
    try {
        await main();
        process.exit(0);
    }
    catch (error) {
        console.error(error);
        process.exit(1);
    }
})();
|
|
@@ -11,7 +11,7 @@ import StreamZip from "node-stream-zip";
|
|
|
11
11
|
import * as cheerio from "cheerio";
|
|
12
12
|
import { AGENDA_FOLDER, COMPTES_RENDUS_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER, } from "../loaders";
|
|
13
13
|
import { commonOptions } from "./shared/cli_helpers";
|
|
14
|
-
import { deriveTitreObjetFromSommaire, parseCompteRenduSlotFromFile, parseYYYYMMDD, sessionStartYearFromDate } from "../model/
|
|
14
|
+
import { deriveTitreObjetFromSommaire, parseCompteRenduSlotFromFile, parseYYYYMMDD, sessionStartYearFromDate } from "../model/seance";
|
|
15
15
|
import { makeGroupUid } from "../utils/reunion_grouping";
|
|
16
16
|
import { getSessionsFromStart } from "../types/sessions";
|
|
17
17
|
import { ensureAndClearDir, fetchWithRetry } from "./shared/util";
|
|
@@ -1,7 +1,28 @@
|
|
|
1
|
-
import { TimeSlot } from "../types/agenda";
|
|
1
|
+
import { GroupedReunion, TimeSlot } from "../types/agenda";
|
|
2
2
|
import * as cheerio from "cheerio";
|
|
3
3
|
export declare function computeIntervalsBySlot($: cheerio.CheerioAPI, idx: Map<any, number>, firstSlotOfDay?: TimeSlot): {
|
|
4
4
|
slot: TimeSlot;
|
|
5
5
|
start: number;
|
|
6
6
|
end: number;
|
|
7
7
|
}[];
|
|
8
|
+
/**
 * Extracts metadata from the HTML of a weekly commission report page.
 *
 * Returned fields:
 * - `sourceFile`: the `sourceFileName` argument, or null when omitted.
 * - `organeTitleRaw`: trimmed page title (`h1.page-title`, else the
 *   `og:title` meta or `<title>`), or null when none is found.
 * - `organeDetected`: commission label derived from that title, or null.
 * - `organeCode`: first `a[name]` anchor made of 3 to 6 uppercase letters, or null.
 * - `weekStart`: ISO date parsed from "semaine du …" in the head title,
 *   falling back to the first detected day; null when neither exists.
 * - `days`: one entry per `h2` holding a weekday and a French date, with the
 *   ISO `date`, the `openTime` ("HH:MM") when an opening sentence follows the
 *   heading, and `h2Index`, the heading's position among all `h2` elements.
 */
export declare function parseCommissionMetadataFromHtml(html: string, sourceFileName?: string): {
|
|
9
|
+
sourceFile: string | null;
|
|
10
|
+
organeTitleRaw: string | null;
|
|
11
|
+
organeDetected: string | null;
|
|
12
|
+
organeCode: string | null;
|
|
13
|
+
weekStart: string | null;
|
|
14
|
+
days: {
|
|
15
|
+
date: string;
|
|
16
|
+
openTime?: string;
|
|
17
|
+
h2Index: number;
|
|
18
|
+
}[];
|
|
19
|
+
};
|
|
20
|
+
/**
 * Loads the agenda groups stored for one date in the transformed agenda
 * folder of the given session. Only JSON files whose name starts with
 * `RUSN<yyyymmdd>IDC` are read; unreadable or malformed files are ignored.
 * Resolves to an empty array when the session folder does not exist.
 */
export declare function loadCommissionAgendaForDate(dataDir: string, yyyymmdd: string, session: number): Promise<GroupedReunion[]>;
|
|
21
|
+
/**
 * Ensures a placeholder commission group exists in the transformed agenda
 * folder for the given date, organe and hour.
 *
 * Resolves to the group `uid`, the JSON `filePath` it is stored in, and
 * `created`, which is true when the group was written by this call.
 */
export declare function createCommissionGroupIfMissing(dataDir: string, dateISO: string, // "YYYY-MM-DD"
|
|
22
|
+
organeDetected: string | null, // e.g. "Commission des finances"
|
|
23
|
+
hourShort: string | null, // "HHMM" | "NA"
|
|
24
|
+
titreGuess?: string | null): Promise<{
|
|
25
|
+
uid: string;
|
|
26
|
+
filePath: string;
|
|
27
|
+
created: boolean;
|
|
28
|
+
}>;
|
package/lib/utils/cr_spliting.js
CHANGED
|
@@ -1,7 +1,13 @@
|
|
|
1
|
+
import path from "path";
|
|
2
|
+
import * as cheerio from "cheerio";
|
|
3
|
+
import { AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER } from "../loaders";
|
|
4
|
+
import fs from "fs-extra";
|
|
5
|
+
import { makeTypeGroupUid } from "./reunion_grouping";
|
|
6
|
+
import { sessionStartYearFromDate } from "../model/seance";
|
|
1
7
|
export function computeIntervalsBySlot($, idx, firstSlotOfDay) {
|
|
2
8
|
const all = $("body *").toArray();
|
|
3
9
|
const cuts = [{ pos: 0, hhmm: undefined }];
|
|
4
|
-
$(
|
|
10
|
+
$("a[name]").each((_, a) => {
|
|
5
11
|
const name = (a.attribs?.["name"] || "").trim();
|
|
6
12
|
if (!/^su/i.test(name))
|
|
7
13
|
return;
|
|
@@ -30,7 +36,7 @@ export function computeIntervalsBySlot($, idx, firstSlotOfDay) {
|
|
|
30
36
|
continue;
|
|
31
37
|
// i=0 initialSlot
|
|
32
38
|
// i>0 : if current cut has SU -> slotOfHHMM, otherwise lastSlot
|
|
33
|
-
const slot = i === 0 ? initialSlot :
|
|
39
|
+
const slot = i === 0 ? initialSlot : cuts[i].hhmm ? slotOfHHMM(cuts[i].hhmm) : lastSlot;
|
|
34
40
|
intervals.push({ slot, start, end });
|
|
35
41
|
lastSlot = slot;
|
|
36
42
|
}
|
|
@@ -70,7 +76,11 @@ function extractOpeningHHMM($) {
|
|
|
70
76
|
}
|
|
71
77
|
// Convert "quinze heures trente", "15 heures 30", "dix-sept heures moins le quart", etc. en "HHMM"
|
|
72
78
|
function parseFrenchClockToHHMM(input) {
|
|
73
|
-
const s = (input || "")
|
|
79
|
+
const s = (input || "")
|
|
80
|
+
.toLowerCase()
|
|
81
|
+
.normalize("NFKD")
|
|
82
|
+
.replace(/[\u0300-\u036f]/g, "")
|
|
83
|
+
.trim();
|
|
74
84
|
if (!s)
|
|
75
85
|
return undefined;
|
|
76
86
|
const digitMatch = s.match(/(\d{1,2})\s*heures?(?:\s*(\d{1,2}))?/);
|
|
@@ -80,12 +90,41 @@ function parseFrenchClockToHHMM(input) {
|
|
|
80
90
|
return `${String(h).padStart(2, "0")}${String(m).padStart(2, "0")}`;
|
|
81
91
|
}
|
|
82
92
|
const NUM = new Map([
|
|
83
|
-
["zero", 0],
|
|
84
|
-
["
|
|
85
|
-
["
|
|
86
|
-
["
|
|
87
|
-
["
|
|
88
|
-
["
|
|
93
|
+
["zero", 0],
|
|
94
|
+
["une", 1],
|
|
95
|
+
["un", 1],
|
|
96
|
+
["deux", 2],
|
|
97
|
+
["trois", 3],
|
|
98
|
+
["quatre", 4],
|
|
99
|
+
["cinq", 5],
|
|
100
|
+
["six", 6],
|
|
101
|
+
["sept", 7],
|
|
102
|
+
["huit", 8],
|
|
103
|
+
["neuf", 9],
|
|
104
|
+
["dix", 10],
|
|
105
|
+
["onze", 11],
|
|
106
|
+
["douze", 12],
|
|
107
|
+
["treize", 13],
|
|
108
|
+
["quatorze", 14],
|
|
109
|
+
["quinze", 15],
|
|
110
|
+
["seize", 16],
|
|
111
|
+
["dix-sept", 17],
|
|
112
|
+
["dix sept", 17],
|
|
113
|
+
["dix-huit", 18],
|
|
114
|
+
["dix huit", 18],
|
|
115
|
+
["dix-neuf", 19],
|
|
116
|
+
["dix neuf", 19],
|
|
117
|
+
["vingt", 20],
|
|
118
|
+
["vingt et une", 21],
|
|
119
|
+
["vingt-et-une", 21],
|
|
120
|
+
["vingt et un", 21],
|
|
121
|
+
["vingt-et-un", 21],
|
|
122
|
+
["vingt-deux", 22],
|
|
123
|
+
["vingt deux", 22],
|
|
124
|
+
["vingt-trois", 23],
|
|
125
|
+
["vingt trois", 23],
|
|
126
|
+
["vingt-quatre", 24],
|
|
127
|
+
["vingt quatre", 24],
|
|
89
128
|
]);
|
|
90
129
|
const hourWordMatch = s.match(/([a-z\- ]+?)\s*heures?/);
|
|
91
130
|
if (!hourWordMatch)
|
|
@@ -109,9 +148,21 @@ function parseFrenchClockToHHMM(input) {
|
|
|
109
148
|
}
|
|
110
149
|
else {
|
|
111
150
|
const MIN = new Map([
|
|
112
|
-
["cinq", 5],
|
|
113
|
-
["
|
|
114
|
-
["
|
|
151
|
+
["cinq", 5],
|
|
152
|
+
["dix", 10],
|
|
153
|
+
["quinze", 15],
|
|
154
|
+
["vingt", 20],
|
|
155
|
+
["vingt-cinq", 25],
|
|
156
|
+
["vingt cinq", 25],
|
|
157
|
+
["trente", 30],
|
|
158
|
+
["trente-cinq", 35],
|
|
159
|
+
["trente cinq", 35],
|
|
160
|
+
["quarante", 40],
|
|
161
|
+
["quarante-cinq", 45],
|
|
162
|
+
["quarante cinq", 45],
|
|
163
|
+
["cinquante", 50],
|
|
164
|
+
["cinquante-cinq", 55],
|
|
165
|
+
["cinquante cinq", 55],
|
|
115
166
|
]);
|
|
116
167
|
const minWordMatch = s.match(/heures?\s+([a-z\- ]+?)(?:[).,;]|$)/);
|
|
117
168
|
if (minWordMatch) {
|
|
@@ -123,3 +174,213 @@ function parseFrenchClockToHHMM(input) {
|
|
|
123
174
|
}
|
|
124
175
|
return `${String(hour).padStart(2, "0")}${String(minutes).padStart(2, "0")}`;
|
|
125
176
|
}
|
|
177
|
+
// Local, self-contained helpers
/**
 * Converts a French long-form date ("12 mars 2024", "1er octobre 2024")
 * into an ISO "YYYY-MM-DD" string.
 *
 * Month names are matched case-insensitively, with or without accents.
 *
 * @param s Date text; non-breaking spaces and repeated whitespace are tolerated.
 * @returns The ISO date, or undefined when the text is empty, is not
 *          "<day> <month> <year>", names an unknown month, or has a day
 *          outside 1–31.
 */
function frDateToISO(s) {
    if (!s)
        return;
    // A Map (rather than a plain object) so that inherited keys such as
    // "constructor" can never be mistaken for a month.
    const months = new Map([
        ["janvier", 1],
        ["fevrier", 2],
        ["mars", 3],
        ["avril", 4],
        ["mai", 5],
        ["juin", 6],
        ["juillet", 7],
        ["aout", 8],
        ["septembre", 9],
        ["octobre", 10],
        ["novembre", 11],
        ["decembre", 12],
    ]);
    const m = String(s)
        .replace(/\u00A0/g, " ")
        .replace(/\s+/g, " ")
        .trim()
        // "(?:er)?" accepts the ordinal used for the first of the month ("1er").
        .match(/^(\d{1,2})(?:er)?\s+(\S+)\s+(\d{4})$/i);
    if (!m)
        return;
    const day = parseInt(m[1], 10);
    if (day < 1 || day > 31)
        return;
    // Strip accents (precomposed or decomposed) before the lookup.
    const monthKey = m[2]
        .toLowerCase()
        .normalize("NFD")
        .replace(/[\u0300-\u036f]/g, "");
    const mon = months.get(monthKey);
    if (!mon)
        return;
    return `${m[3]}-${String(mon).padStart(2, "0")}-${String(day).padStart(2, "0")}`;
}
|
|
212
|
+
function extractWeekStartFromHead($) {
|
|
213
|
+
const og = $('meta[property="og:title"]').attr("content") || $("title").text();
|
|
214
|
+
const m = (og ?? "").toLowerCase().match(/semaine du\s+(\d{1,2}\s+\w+\s+\d{4})/i);
|
|
215
|
+
if (m)
|
|
216
|
+
return frDateToISO(m[1]);
|
|
217
|
+
return undefined;
|
|
218
|
+
}
|
|
219
|
+
/**
 * Derives a commission label from a page title.
 *
 * The title is lower-cased and searched for "commission [des|de] <name>"
 * running to the end of the string with no colon in <name>. The label is
 * rebuilt as "Commission <name>" with whitespace collapsed.
 *
 * @param s Page title (may be null/undefined).
 * @returns `organeTitleRaw` (the trimmed title) and `organeDetected` (the
 *          label); both undefined for an empty title, and `organeDetected`
 *          undefined when the pattern is not found.
 */
function detectOrganeFromTitle(s) {
    const title = (s ?? "").trim();
    if (title === "") {
        return { organeTitleRaw: undefined, organeDetected: undefined };
    }
    const found = /commission(?:\s+des|\s+de|)\s+([^:]+)$/i.exec(title.toLowerCase());
    if (!found || !found[1]) {
        return { organeTitleRaw: title, organeDetected: undefined };
    }
    const label = `Commission ${found[1]}`
        .replace(/\s+/g, " ")
        .replace(/\s+:? comptes? rendus?$/i, "")
        .trim();
    return {
        organeTitleRaw: title,
        organeDetected: label.charAt(0).toUpperCase() + label.slice(1),
    };
}
|
|
235
|
+
function extractDaysAndOpenings($) {
|
|
236
|
+
const days = [];
|
|
237
|
+
const h2s = $("h2").toArray();
|
|
238
|
+
for (let i = 0; i < h2s.length; i++) {
|
|
239
|
+
const h = h2s[i];
|
|
240
|
+
const txt = $(h).text().trim();
|
|
241
|
+
const m = txt.match(/(?:Lundi|Mardi|Mercredi|Jeudi|Vendredi|Samedi|Dimanche)\s+(.+)$/i);
|
|
242
|
+
if (!m)
|
|
243
|
+
continue;
|
|
244
|
+
const iso = frDateToISO(m[1]);
|
|
245
|
+
if (!iso)
|
|
246
|
+
continue;
|
|
247
|
+
let openTime;
|
|
248
|
+
let cur = $(h).next();
|
|
249
|
+
while (cur.length && cur[0].tagName !== "h2") {
|
|
250
|
+
const t = cur.text().replace(/\s+/g, " ").trim();
|
|
251
|
+
const mt = t.match(/La réunion est ouverte à\s+(\d{1,2})(?:h(?:\s*(\d{2}))?)?/i);
|
|
252
|
+
if (mt) {
|
|
253
|
+
openTime = `${mt[1].padStart(2, "0")}:${(mt[2] ?? "00").padStart(2, "0")}`;
|
|
254
|
+
break;
|
|
255
|
+
}
|
|
256
|
+
cur = cur.next();
|
|
257
|
+
}
|
|
258
|
+
days.push({ date: iso, openTime, h2Index: i });
|
|
259
|
+
}
|
|
260
|
+
return days;
|
|
261
|
+
}
|
|
262
|
+
function extractOrganeCode($) {
|
|
263
|
+
const names = $("a[name]")
|
|
264
|
+
.toArray()
|
|
265
|
+
.map((a) => ($(a).attr("name") || "").trim());
|
|
266
|
+
return names.find((n) => /^[A-Z]{3,6}$/.test(n));
|
|
267
|
+
}
|
|
268
|
+
/**
 * Extracts metadata from the HTML of a weekly commission report page.
 *
 * The organe is detected from the `h1.page-title` text, or from the head
 * title (`og:title` meta, else `<title>`) when the heading is empty. The week
 * start comes from the head title and falls back to the first detected day.
 *
 * @param html Page HTML.
 * @param sourceFileName Optional file name echoed back as `sourceFile`.
 * @returns `{ sourceFile, organeTitleRaw, organeDetected, organeCode,
 *          weekStart, days }`; missing scalar values are null.
 */
export function parseCommissionMetadataFromHtml(html, sourceFileName) {
    const $ = cheerio.load(html);
    const pageHeading = $("h1.page-title").first().text().trim() || undefined;
    const headTitle = $('meta[property="og:title"]').attr("content") || $("title").text() || undefined;
    const organe = detectOrganeFromTitle(pageHeading || headTitle);
    const headWeekStart = extractWeekStartFromHead($);
    const days = extractDaysAndOpenings($);
    // Fall back to the first day of the report when the head gives no week start.
    const weekStart = !headWeekStart && days.length > 0 ? days[0].date : headWeekStart;
    const organeCode = extractOrganeCode($);
    const orNull = (value) => value ?? null;
    return {
        sourceFile: orNull(sourceFileName),
        organeTitleRaw: orNull(organe.organeTitleRaw),
        organeDetected: orNull(organe.organeDetected),
        organeCode: orNull(organeCode),
        weekStart: orNull(weekStart),
        days, // [{date, openTime?, h2Index}]
    };
}
|
|
287
|
+
/**
 * Minimal shape check for a grouped reunion read from JSON.
 *
 * @param o Any parsed JSON value.
 * @returns true when `o` is a non-null object whose `uid` and `date` are
 *          strings, false otherwise (always a boolean).
 */
function isGroupedReunion(o) {
    // "o !== null" is explicit so that null/undefined/0 yield false rather
    // than being returned as-is.
    return typeof o === "object" && o !== null && typeof o.uid === "string" && typeof o.date === "string";
}
|
|
290
|
+
/**
 * Loads the commission agenda groups stored for one date.
 *
 * Reads every `RUSN<yyyymmdd>IDC*.json` file in the transformed agenda folder
 * of the session. A file may hold a single group or an array of groups (the
 * latter is what createCommissionGroupIfMissing writes). Entries that are not
 * group-shaped, or whose uid does not carry the expected prefix, are dropped.
 *
 * @param dataDir Root data directory.
 * @param yyyymmdd Date as eight digits.
 * @param session Session start year (sub-folder name).
 * @returns The matching groups, in file-name order; an empty array when the
 *          session folder does not exist.
 */
export async function loadCommissionAgendaForDate(dataDir, yyyymmdd, session) {
    const baseDir = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, String(session));
    if (!(await fs.pathExists(baseDir)))
        return [];
    const uidPrefix = `RUSN${yyyymmdd}IDC`;
    // Sorted so that tie-breaking between equally good candidates does not
    // depend on the directory listing order of the platform.
    const files = (await fs.readdir(baseDir))
        .filter((f) => f.startsWith(uidPrefix) && f.toLowerCase().endsWith(".json"))
        .sort();
    const out = [];
    for (const f of files) {
        try {
            const parsed = JSON.parse(await fs.readFile(path.join(baseDir, f), "utf8"));
            const entries = Array.isArray(parsed) ? parsed : [parsed];
            for (const entry of entries) {
                if (isGroupedReunion(entry) && entry.uid.startsWith(uidPrefix)) {
                    out.push(entry);
                }
            }
        }
        catch {
            // Best effort: an unreadable or malformed file is skipped.
        }
    }
    return out;
}
|
|
315
|
+
/**
 * Expands a compact "HHMM" hour into "HH:MM".
 *
 * @param hourShort Four-digit hour, the placeholder "NA", or null.
 * @returns "HH:MM", or null when the input is empty, "NA", or not exactly
 *          four digits.
 */
function hourShortToStartTime(hourShort) {
    const isFourDigits = Boolean(hourShort) && hourShort !== "NA" && /^\d{4}$/.test(hourShort);
    return isFourDigits ? `${hourShort.substring(0, 2)}:${hourShort.substring(2, 4)}` : null;
}
|
|
324
|
+
/**
 * Ensures a placeholder commission group exists in the transformed agenda
 * folder for the given date, organe and hour.
 *
 * The group is stored in `<uid>.json` inside the session sub-folder. When the
 * file already describes this uid (as an array entry, or as a single group
 * object) nothing is written.
 *
 * @param dataDir Root data directory.
 * @param dateISO Meeting date, "YYYY-MM-DD".
 * @param organeDetected Commission label (e.g. "Commission des finances"), or null.
 * @param hourShort Opening hour as "HHMM", or "NA"/null when unknown.
 * @param titreGuess Title to store on the group, if any.
 * @returns `{ uid, filePath, created }`, where `created` is true when this
 *          call wrote the group.
 */
export async function createCommissionGroupIfMissing(dataDir, dateISO, organeDetected, hourShort, titreGuess) {
    const uid = makeTypeGroupUid(dateISO, "COM", hourShort ?? "NA", organeDetected ?? undefined);
    // Build a local-time date, as the callers of sessionStartYearFromDate in
    // this package do: `new Date("YYYY-MM-DD")` is UTC midnight, which is the
    // previous local day in time zones behind UTC.
    const ymd = /^(\d{4})-(\d{2})-(\d{2})$/.exec(dateISO);
    const day = ymd ? new Date(Number(ymd[1]), Number(ymd[2]) - 1, Number(ymd[3])) : new Date(dateISO);
    const session = sessionStartYearFromDate(day);
    // Same folder constants as loadCommissionAgendaForDate, so that what is
    // written here is looked up in the same place.
    const dir = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, String(session));
    await fs.ensureDir(dir);
    const filePath = path.join(dir, `${uid}.json`);
    let groups = [];
    if (await fs.pathExists(filePath)) {
        try {
            const parsed = JSON.parse(await fs.readFile(filePath, "utf8"));
            if (Array.isArray(parsed)) {
                groups = parsed;
            }
            else if (parsed && typeof parsed === "object" && parsed.uid === uid) {
                // The file already holds this group as a single object (the shape
                // loadCommissionAgendaForDate reads): leave it untouched.
                return { uid, filePath, created: false };
            }
        }
        catch {
            // Unreadable or malformed file: start again from an empty list.
            groups = [];
        }
    }
    if (groups.some((g) => g?.uid === uid)) {
        return { uid, filePath, created: false };
    }
    groups.push({
        uid,
        chambre: "SN",
        date: dateISO,
        slot: null,
        type: organeDetected ?? "Commission",
        startTime: hourShortToStartTime(hourShort),
        endTime: null,
        captationVideo: false,
        titre: titreGuess ?? null,
        objet: null,
        reunions: [],
        compteRenduRefUid: null,
    });
    await fs.writeJSON(filePath, groups, { spaces: 2 });
    return { uid, filePath, created: true };
}
|
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
import { AgendaEvent, GroupedReunion, TimeSlot } from "../types/agenda";
|
|
2
|
+
type KnownType = "SP" | "COM" | "MC" | "OD" | "ID";
|
|
2
3
|
export declare function groupNonSPByTypeOrganeHour(events: AgendaEvent[]): Record<"IDC" | "IDM" | "IDO" | "IDI", GroupedReunion[]>;
|
|
3
4
|
export declare function groupSeancePubliqueBySlot(events: AgendaEvent[]): GroupedReunion[];
|
|
5
|
+
export declare function makeTypeGroupUid(dateISO: string, kind: KnownType, hourShort: string | null, organe?: string | null): string;
|
|
4
6
|
export declare function makeGroupUid(date: string, slot: TimeSlot): string;
|
|
5
7
|
export declare function formatYYYYMMDD(dateYYYYMMDD: string): string;
|
|
6
8
|
export declare function makeReunionUid(agenda: AgendaEvent): string;
|
|
9
|
+
export {};
|
|
@@ -243,7 +243,7 @@ function organeInitials(input, maxLen = 8) {
|
|
|
243
243
|
const out = letters.join("");
|
|
244
244
|
return out.slice(0, maxLen);
|
|
245
245
|
}
|
|
246
|
-
function makeTypeGroupUid(dateISO, kind, hourShort, organe) {
|
|
246
|
+
export function makeTypeGroupUid(dateISO, kind, hourShort, organe) {
|
|
247
247
|
const ymd = dateISO ? formatYYYYMMDD(dateISO) : "00000000";
|
|
248
248
|
const suffix = typeToSuffixStrict(kind);
|
|
249
249
|
const hh = hourShort ?? "NA";
|