@tricoteuses/senat 2.11.0 → 2.11.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,291 @@
1
+ import fs from "fs-extra";
2
+ import assert from "assert";
3
+ import path from "path";
4
+ import * as cheerio from "cheerio";
5
+ import { COMMISSION_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER } from "../loaders";
6
+ import { createCommissionGroupIfMissing, loadCommissionAgendaForDate, parseCommissionMetadataFromHtml, } from "../utils/cr_spliting";
7
+ import { parseCommissionCRFromFile } from "../model/commission";
8
+ import commandLineArgs from "command-line-args";
9
+ import { commonOptions } from "./shared/cli_helpers";
10
+ import { sessionStartYearFromDate } from "../model/seance";
11
+ import { getSessionsFromStart } from "../types/sessions";
12
+ import { ensureAndClearDir, fetchWithRetry } from "./shared/util";
13
/**
 * Error raised when a commission report page cannot be downloaded.
 *
 * The message embeds both the URL and the failure description supplied by the
 * caller (tryDownload passes the HTTP status code as a string).
 */
class CommissionCRDownloadError extends Error {
  /** URL whose retrieval failed. */
  url;
  /**
   * @param message Short description of the failure.
   * @param url URL whose retrieval failed.
   */
  constructor(message, url) {
    super(`An error occurred while retrieving Commission CR ${url}: ${message}`);
    // Without an explicit name, logs and stack traces only show a generic "Error".
    this.name = "CommissionCRDownloadError";
    // Keep the URL accessible to handlers without having to parse the message.
    this.url = url;
  }
}
18
// Command-line options: the shared options plus the ones specific to this script.
const optionsDefinitions = [
  ...commonOptions,
  // Number of download workers running in parallel.
  { name: "concurrency", type: Number, defaultValue: 6, help: "Max parallel downloads" },
  // Pause observed by each worker after every job, in milliseconds.
  { name: "politenessMs", type: Number, defaultValue: 150, help: "Delay per worker (ms)" },
  { name: "parseDebats", type: Boolean, help: "parse and convert comptes-rendus des débats into JSON" },
];
// Parsed once at module load; main() reads the result.
const options = commandLineArgs(optionsDefinitions);
29
// Root under which senat.fr publishes the commission report hub pages.
const COMMISSION_HUB_BASE_URL = "https://www.senat.fr/compte-rendu-commissions";
// [commission key, page slug] pairs. The slug only differs from the key for
// "amenagement-developpement-durable".
const COMMISSION_HUB_SLUGS = [
  ["affaires-etrangeres", "affaires-etrangeres"],
  ["affaires-economiques", "affaires-economiques"],
  ["amenagement-developpement-durable", "cadre-de-vie-et-developpement-durable"],
  ["culture", "culture"],
  ["finances", "finances"],
  ["lois", "lois"],
  ["affaires-sociales", "affaires-sociales"],
  ["affaires-europeennes", "affaires-europeennes"],
];
// Commission key → [current hub page, archive hub page].
const COMMISSION_HUBS = Object.fromEntries(
  COMMISSION_HUB_SLUGS.map(([key, slug]) => [
    key,
    [`${COMMISSION_HUB_BASE_URL}/${slug}.html`, `${COMMISSION_HUB_BASE_URL}/${slug}_archives.html`],
  ]),
);
63
/**
 * Fetches a commission hub page and collects the weekly report pages it links to.
 *
 * Only anchors whose href ends in `/compte-rendu-commissions/<8 digits>/<slug>.html`
 * are kept. Relative hrefs are resolved against `hubUrl`.
 *
 * @param hubUrl URL of the hub page to scan.
 * @returns De-duplicated absolute URLs in document order; an empty array when
 *          the response is not OK.
 */
async function harvestWeeklyLinksFromHub(hubUrl) {
  const response = await fetchWithRetry(hubUrl);
  if (!response.ok) {
    return [];
  }
  const $ = cheerio.load(await response.text());
  const weeklyPagePattern = /\/compte-rendu-commissions\/(\d{8})\/([a-z0-9\-]+)\.html$/i;
  // A Set keeps first-seen order while discarding repeated links.
  const uniqueUrls = new Set();
  for (const anchor of $("a[href]").toArray()) {
    const href = ($(anchor).attr("href") || "").trim();
    if (!weeklyPagePattern.test(href)) {
      continue;
    }
    uniqueUrls.add(href.startsWith("http") ? href : new URL(href, hubUrl).toString());
  }
  return Array.from(uniqueUrls);
}
80
/**
 * Walks every hub page listed in COMMISSION_HUBS and returns the weekly report
 * pages belonging to `fromSession` or a later session.
 *
 * A hub that throws while being harvested is logged and skipped; the other hubs
 * are still processed.
 *
 * @param fromSession First session (start year) to keep.
 * @returns `{ url, yyyymmdd, commissionKey }` entries sorted by date, each URL
 *          appearing once.
 */
async function discoverCommissionWeeklyPages(fromSession) {
  const weeklyPagePattern = /\/compte-rendu-commissions\/(\d{8})\/([a-z0-9\-]+)\.html$/i;
  const results = [];
  // The same weekly page can be linked from both the current hub and its
  // archive page; keep only the first occurrence so it is downloaded once.
  const seenUrls = new Set();
  for (const [commissionKey, hubs] of Object.entries(COMMISSION_HUBS)) {
    for (const hubUrl of hubs) {
      try {
        const links = await harvestWeeklyLinksFromHub(hubUrl);
        for (const url of links) {
          if (seenUrls.has(url)) {
            continue;
          }
          const m = url.match(weeklyPagePattern);
          if (!m) {
            continue;
          }
          const yyyymmdd = m[1];
          const year = Number(yyyymmdd.slice(0, 4));
          const month = Number(yyyymmdd.slice(4, 6));
          // Dates from October onwards count towards the session starting that
          // year; earlier months belong to the session started the year before.
          const session = month >= 10 ? year : year - 1;
          if (session < fromSession) {
            continue;
          }
          seenUrls.add(url);
          results.push({ url, yyyymmdd, commissionKey });
        }
      } catch (e) {
        console.warn(`[COM-CR][hub-fail] ${hubUrl} → ${e?.message ?? e}`);
      }
    }
  }
  return results.sort((a, b) => a.yyyymmdd.localeCompare(b.yyyymmdd));
}
106
/**
 * Converts a strict "HH:MM" string into its compact "HHMM" form.
 *
 * @param hhmm Time string; may be empty or missing.
 * @returns "HHMM", or null when the input is falsy or not exactly two digits,
 *          a colon and two digits.
 */
function toHourShort(hhmm) {
  const match = hhmm ? hhmm.match(/^(\d{2}):(\d{2})$/) : null;
  if (!match) {
    return null;
  }
  const [, hours, minutes] = match;
  return hours + minutes;
}
112
/**
 * Converts an "H:M" style string into a number of minutes since midnight.
 *
 * Only the first two colon-separated parts are used; a part that does not
 * parse as an integer counts as 0.
 *
 * @param hhmm Time string such as "14:30".
 * @returns hours * 60 + minutes.
 */
function timeToMinutes(hhmm) {
  const parts = hhmm.split(":");
  const hours = parseInt(parts[0], 10) || 0;
  const minutes = parseInt(parts[1], 10) || 0;
  return hours * 60 + minutes;
}
116
/**
 * Downloads a URL and returns its body as a Buffer.
 *
 * @param url Address to fetch (redirects are followed).
 * @returns The response body, or null when the server answers 404.
 * @throws CommissionCRDownloadError for any other non-OK status.
 */
async function tryDownload(url) {
  const response = await fetch(url, { redirect: "follow" });
  if (response.ok) {
    return Buffer.from(await response.arrayBuffer());
  }
  // A missing page is an expected outcome, reported to the caller as null.
  if (response.status === 404) {
    return null;
  }
  throw new CommissionCRDownloadError(String(response.status), url);
}
125
// Largest gap, in minutes, between the opening time read from a report and an
// agenda entry's start time for the two to be paired on time alone.
const COMMISSION_CR_MAX_TIME_DELTA_MIN = 120;
// Words ignored when comparing an organe label with agenda titles.
const COMMISSION_CR_STOP_WORDS = new Set(["commission", "des", "de", "du", "d", "la", "le", "les", "et"]);

/**
 * Splits an organe label into lowercase keywords (3+ characters, stop words removed).
 *
 * The split is Unicode-aware: `\W` treats accented letters as separators, which
 * turned "étrangères" into the fragments "trang" and "res".
 */
function commissionKeywordsFromLabel(label) {
  return (label ?? "")
    .normalize("NFC")
    .toLowerCase()
    .replace(/[’']/g, "'")
    .split(/[^\p{L}\p{N}_]+/u)
    .filter((word) => word.length >= 3 && !COMMISSION_CR_STOP_WORDS.has(word));
}

/**
 * Picks the agenda group that best matches one day of a report: first by the
 * number of organe keywords found in its title, otherwise by start-time proximity.
 * Returns null when nothing matches.
 */
function pickAgendaGroupForDay(hits, organeKeywords, openTime) {
  if (!hits.length) {
    return null;
  }
  // a) score by title and organe keywords
  if (organeKeywords.length) {
    const scored = hits
      .map((hit) => {
        const title = (hit.titre ?? "").normalize("NFC").toLowerCase();
        const score = organeKeywords.reduce((acc, kw) => acc + (title.includes(kw) ? 1 : 0), 0);
        return { hit, score };
      })
      .sort((a, b) => b.score - a.score);
    if (scored[0].score > 0) {
      return scored[0].hit;
    }
  }
  // b) otherwise score by time proximity
  if (openTime) {
    const opening = timeToMinutes(openTime);
    const byProximity = hits
      .filter((hit) => !!hit.startTime)
      .map((hit) => ({ hit, delta: Math.abs(timeToMinutes(hit.startTime) - opening) }))
      .sort((a, b) => a.delta - b.delta);
    if (byProximity[0] && byProximity[0].delta <= COMMISSION_CR_MAX_TIME_DELTA_MIN) {
      return byProximity[0].hit;
    }
  }
  return null;
}

/**
 * Runs the download jobs with a fixed number of workers sharing one queue and
 * returns the counters used for the final summary line.
 */
async function downloadCommissionPages(jobs, concurrency, politenessMs) {
  const queue = [...jobs];
  const stats = { completed: 0, saved: 0, skipped: 0, notFound: 0 };
  const worker = async () => {
    for (let job = queue.shift(); job; job = queue.shift()) {
      const { url, outPath, yyyymmdd } = job;
      try {
        if (await fs.pathExists(outPath)) {
          stats.skipped++;
        } else {
          const buf = await tryDownload(url);
          if (!buf) {
            stats.notFound++;
            console.warn(`[COM-CR][404] ${url} → week=${yyyymmdd}`);
          } else {
            await fs.writeFile(outPath, buf);
            stats.saved++;
          }
        }
      } catch (e) {
        // One failed page must not stop the remaining downloads.
        console.error(`[COM-CR][err] ${url} → ${e?.message || e}`);
      } finally {
        stats.completed++;
        if (politenessMs > 0) {
          await new Promise((r) => setTimeout(r, politenessMs));
        }
      }
    }
  };
  await Promise.all(Array.from({ length: concurrency }, () => worker()));
  return stats;
}

/**
 * Downloads the weekly commission report pages and pairs each reported day
 * with an agenda group.
 *
 * Steps:
 *  1. clear the "original" folder, discover weekly pages from the hubs and
 *     download them into `<original>/<session>/<yyyymmdd>.<slug>.html`;
 *  2. for every downloaded file, read its metadata and, for each day found,
 *     look for a matching agenda group (title keywords first, then opening time);
 *  3. on a match, parse the report and write `<transformed>/<session>/<uid>.json`;
 *     otherwise create a placeholder agenda group.
 *
 * @param options Parsed CLI options; reads `dataDir`, `fromSession`,
 *                `concurrency`, `politenessMs`, `parseDebats` and `silent`.
 */
async function retrieveCommissionCRs(options = {}) {
  const dataDir = options["dataDir"];
  const fromSession = Number(options["fromSession"]);
  const requestedConcurrency = Number(options["concurrency"] ?? 6);
  // A non-numeric value would otherwise yield NaN and start no worker at all.
  const concurrency = Number.isFinite(requestedConcurrency) ? Math.max(1, Math.floor(requestedConcurrency)) : 6;
  const politenessMs = Number(options["politenessMs"] ?? 150);
  const commissionsRootDir = path.join(dataDir, COMMISSION_FOLDER);
  const originalRoot = path.join(commissionsRootDir, DATA_ORIGINAL_FOLDER);
  ensureAndClearDir(originalRoot);

  const discovered = await discoverCommissionWeeklyPages(fromSession);
  console.log(`[COM-CR][discover] ${discovered.length} links (>= session ${fromSession})`);

  const jobs = [];
  // Two links resolving to the same output file would race between workers.
  const plannedPaths = new Set();
  for (const { url, yyyymmdd } of discovered) {
    const d = new Date(Number(yyyymmdd.slice(0, 4)), Number(yyyymmdd.slice(4, 6)) - 1, Number(yyyymmdd.slice(6, 8)));
    const session = sessionStartYearFromDate(d);
    const dir = path.join(originalRoot, String(session));
    fs.ensureDirSync(dir);
    const slug = url.replace(/^.*\/(\d{8})\/([^\/]+)\.html$/i, "$2");
    const outPath = path.join(dir, `${yyyymmdd}.${slug}.html`);
    if (plannedPaths.has(outPath)) {
      continue;
    }
    plannedPaths.add(outPath);
    jobs.push({ url, outPath, yyyymmdd });
  }
  console.log(`[COM-CR] Downloading ${jobs.length} links → ${path.relative(process.cwd(), originalRoot)}`);

  const { completed, saved, skipped, notFound } = await downloadCommissionPages(jobs, concurrency, politenessMs);
  console.log(`[COM-CR] done: saved=${saved} | skipped=${skipped} | 404=${notFound} | total=${completed}`);

  const sessions = getSessionsFromStart(options["fromSession"]);
  const transformedRoot = path.join(commissionsRootDir, DATA_TRANSFORMED_FOLDER);
  // NOTE(review): only the clearing is conditional on parseDebats; the loop
  // below runs either way, as in the previous implementation — confirm intent.
  if (options["parseDebats"]) {
    ensureAndClearDir(transformedRoot);
  }

  for (const session of sessions) {
    const originalSessionDir = path.join(originalRoot, String(session));
    const transformedSessionDir = path.join(transformedRoot, String(session));
    fs.ensureDirSync(transformedSessionDir);
    if (!(await fs.pathExists(originalSessionDir))) {
      continue;
    }
    const htmlFiles = (await fs.readdir(originalSessionDir)).filter((f) => /\.html?$/i.test(f)).sort();
    for (const f of htmlFiles) {
      const htmlPath = path.join(originalSessionDir, f);
      let meta;
      try {
        const raw = await fs.readFile(htmlPath, "utf8");
        meta = parseCommissionMetadataFromHtml(raw, f);
      } catch (e) {
        console.warn(`[COM-CR][PRE][${session}] Cannot read/parse ${f}:`, e);
        continue;
      }
      const organeKeywords = commissionKeywordsFromLabel(meta.organeDetected ?? meta.organeTitleRaw);
      for (const day of meta.days) {
        const yyyymmdd = day.date.replace(/-/g, "");
        const dt = new Date(Number(day.date.slice(0, 4)), Number(day.date.slice(5, 7)) - 1, Number(day.date.slice(8, 10)));
        const daySession = sessionStartYearFromDate(dt);
        const hits = await loadCommissionAgendaForDate(dataDir, yyyymmdd, daySession);
        const best = pickAgendaGroupForDay(hits, organeKeywords, day.openTime);
        if (best) {
          const cr = parseCommissionCRFromFile(htmlPath, best);
          if (!cr) {
            console.warn(`[COM-CR][TRANSFORM] parse failed for ${f} → ${best.uid}`);
            continue;
          }
          const outPath = path.join(transformedSessionDir, `${cr.uid}.json`);
          await fs.writeJSON(outPath, cr, { spaces: 2 });
          const npts = Array.isArray(cr.contenu.point) ? cr.contenu.point.length : cr.contenu.point ? 1 : 0;
          if (!options["silent"]) {
            console.log(`[COM-CR][TRANSFORM] saved ${path.basename(outPath)} (points=${npts})`);
          }
        } else {
          // No agenda entry matches this day: register a placeholder group.
          const hourShort = toHourShort(day.openTime) ?? "NA";
          const titreGuess = meta.organeDetected || meta.organeTitleRaw || "Commission";
          const { uid, filePath } = await createCommissionGroupIfMissing(dataDir, day.date, meta.organeDetected ?? null, hourShort, titreGuess);
          if (!options["silent"]) {
            console.log(`[COM-CR][PRE-SPLIT][${session}] ${f} | ${day.date}` +
              (day.openTime ? ` ${day.openTime}` : ``) +
              ` → NO-MATCH → CREATED uid=${uid} file=${path.basename(filePath)}`);
          }
        }
      }
    }
  }
}
277
/**
 * Script entry point: checks that a data directory was given, then runs the
 * retrieval and reports the elapsed time unless `silent` is set.
 *
 * @throws AssertionError when the `dataDir` option is missing.
 */
async function main() {
  const dataDir = options["dataDir"];
  assert(dataDir, "Missing argument: data directory");
  const verbose = !options["silent"];
  const timerLabel = "CRI processing time";
  // Start the timer only when it will also be ended and printed.
  if (verbose) {
    console.time(timerLabel);
  }
  await retrieveCommissionCRs(options);
  if (verbose) {
    console.timeEnd(timerLabel);
  }
}
286
// Run the script; the exit code is 0 on success and 1 after logging any error.
void (async () => {
  try {
    await main();
  } catch (error) {
    console.error(error);
    process.exit(1);
  }
  process.exit(0);
})();
@@ -11,7 +11,7 @@ import StreamZip from "node-stream-zip";
11
11
  import * as cheerio from "cheerio";
12
12
  import { AGENDA_FOLDER, COMPTES_RENDUS_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER, } from "../loaders";
13
13
  import { commonOptions } from "./shared/cli_helpers";
14
- import { deriveTitreObjetFromSommaire, parseCompteRenduSlotFromFile, parseYYYYMMDD, sessionStartYearFromDate } from "../model/compte_rendu";
14
+ import { deriveTitreObjetFromSommaire, parseCompteRenduSlotFromFile, parseYYYYMMDD, sessionStartYearFromDate } from "../model/seance";
15
15
  import { makeGroupUid } from "../utils/reunion_grouping";
16
16
  import { getSessionsFromStart } from "../types/sessions";
17
17
  import { ensureAndClearDir, fetchWithRetry } from "./shared/util";
@@ -1,7 +1,28 @@
1
- import { TimeSlot } from "../types/agenda";
1
+ import { GroupedReunion, TimeSlot } from "../types/agenda";
2
2
  import * as cheerio from "cheerio";
3
3
  export declare function computeIntervalsBySlot($: cheerio.CheerioAPI, idx: Map<any, number>, firstSlotOfDay?: TimeSlot): {
4
4
  slot: TimeSlot;
5
5
  start: number;
6
6
  end: number;
7
7
  }[];
8
/**
 * Extracts descriptive metadata from the HTML of a commission report page.
 *
 * @param html Full HTML markup of the page.
 * @param sourceFileName Optional name of the file the markup came from; echoed
 *        back as `sourceFile`.
 * @returns The detected fields. Scalar fields are `null` when they could not
 *          be determined; `days` is empty when no dated heading was found.
 */
export declare function parseCommissionMetadataFromHtml(html: string, sourceFileName?: string): {
    sourceFile: string | null;
    organeTitleRaw: string | null;
    organeDetected: string | null;
    organeCode: string | null;
    weekStart: string | null;
    days: {
        // ISO date, "YYYY-MM-DD".
        date: string;
        // Opening time as "HH:MM", when one was found after the heading.
        openTime?: string;
        // Position of the day's heading among the page's <h2> elements.
        h2Index: number;
    }[];
};
20
/**
 * Loads the commission agenda groups stored for a given date.
 *
 * Reads the JSON files of the session's transformed agenda folder whose name
 * starts with `RUSN<yyyymmdd>IDC`.
 *
 * @param dataDir Root data directory.
 * @param yyyymmdd Date as eight digits.
 * @param session Session start year, used as sub-folder name.
 * @returns The matching groups; an empty array when the folder does not exist.
 */
export declare function loadCommissionAgendaForDate(dataDir: string, yyyymmdd: string, session: number): Promise<GroupedReunion[]>;
21
/**
 * Ensures that an agenda group file exists for a commission meeting.
 *
 * The group's uid is derived from the date, the hour and the organe; the group
 * is written to `<uid>.json` in the session's transformed agenda folder unless
 * a group with that uid is already stored there.
 *
 * @returns The uid, the path of the JSON file, and whether the file was written
 *          during this call.
 */
export declare function createCommissionGroupIfMissing(dataDir: string, dateISO: string, // "YYYY-MM-DD"
organeDetected: string | null, // ex. "Commission des finances"
hourShort: string | null, // "HHMM" | "NA"
titreGuess?: string | null): Promise<{
    uid: string;
    filePath: string;
    created: boolean;
}>;
@@ -1,7 +1,13 @@
1
+ import path from "path";
2
+ import * as cheerio from "cheerio";
3
+ import { AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER } from "../loaders";
4
+ import fs from "fs-extra";
5
+ import { makeTypeGroupUid } from "./reunion_grouping";
6
+ import { sessionStartYearFromDate } from "../model/seance";
1
7
  export function computeIntervalsBySlot($, idx, firstSlotOfDay) {
2
8
  const all = $("body *").toArray();
3
9
  const cuts = [{ pos: 0, hhmm: undefined }];
4
- $('a[name]').each((_, a) => {
10
+ $("a[name]").each((_, a) => {
5
11
  const name = (a.attribs?.["name"] || "").trim();
6
12
  if (!/^su/i.test(name))
7
13
  return;
@@ -30,7 +36,7 @@ export function computeIntervalsBySlot($, idx, firstSlotOfDay) {
30
36
  continue;
31
37
  // i=0 initialSlot
32
38
  // i>0 : if current cut has SU -> slotOfHHMM, otherwise lastSlot
33
- const slot = i === 0 ? initialSlot : (cuts[i].hhmm ? slotOfHHMM(cuts[i].hhmm) : lastSlot);
39
+ const slot = i === 0 ? initialSlot : cuts[i].hhmm ? slotOfHHMM(cuts[i].hhmm) : lastSlot;
34
40
  intervals.push({ slot, start, end });
35
41
  lastSlot = slot;
36
42
  }
@@ -70,7 +76,11 @@ function extractOpeningHHMM($) {
70
76
  }
71
77
  // Convert "quinze heures trente", "15 heures 30", "dix-sept heures moins le quart", etc. to "HHMM"
72
78
  function parseFrenchClockToHHMM(input) {
73
- const s = (input || "").toLowerCase().normalize("NFKD").replace(/[\u0300-\u036f]/g, "").trim();
79
+ const s = (input || "")
80
+ .toLowerCase()
81
+ .normalize("NFKD")
82
+ .replace(/[\u0300-\u036f]/g, "")
83
+ .trim();
74
84
  if (!s)
75
85
  return undefined;
76
86
  const digitMatch = s.match(/(\d{1,2})\s*heures?(?:\s*(\d{1,2}))?/);
@@ -80,12 +90,41 @@ function parseFrenchClockToHHMM(input) {
80
90
  return `${String(h).padStart(2, "0")}${String(m).padStart(2, "0")}`;
81
91
  }
82
92
  const NUM = new Map([
83
- ["zero", 0], ["une", 1], ["un", 1], ["deux", 2], ["trois", 3], ["quatre", 4], ["cinq", 5], ["six", 6],
84
- ["sept", 7], ["huit", 8], ["neuf", 9], ["dix", 10], ["onze", 11], ["douze", 12], ["treize", 13],
85
- ["quatorze", 14], ["quinze", 15], ["seize", 16], ["dix-sept", 17], ["dix sept", 17], ["dix-huit", 18],
86
- ["dix huit", 18], ["dix-neuf", 19], ["dix neuf", 19], ["vingt", 20], ["vingt et une", 21],
87
- ["vingt-et-une", 21], ["vingt et un", 21], ["vingt-et-un", 21], ["vingt-deux", 22], ["vingt deux", 22],
88
- ["vingt-trois", 23], ["vingt trois", 23], ["vingt-quatre", 24], ["vingt quatre", 24],
93
+ ["zero", 0],
94
+ ["une", 1],
95
+ ["un", 1],
96
+ ["deux", 2],
97
+ ["trois", 3],
98
+ ["quatre", 4],
99
+ ["cinq", 5],
100
+ ["six", 6],
101
+ ["sept", 7],
102
+ ["huit", 8],
103
+ ["neuf", 9],
104
+ ["dix", 10],
105
+ ["onze", 11],
106
+ ["douze", 12],
107
+ ["treize", 13],
108
+ ["quatorze", 14],
109
+ ["quinze", 15],
110
+ ["seize", 16],
111
+ ["dix-sept", 17],
112
+ ["dix sept", 17],
113
+ ["dix-huit", 18],
114
+ ["dix huit", 18],
115
+ ["dix-neuf", 19],
116
+ ["dix neuf", 19],
117
+ ["vingt", 20],
118
+ ["vingt et une", 21],
119
+ ["vingt-et-une", 21],
120
+ ["vingt et un", 21],
121
+ ["vingt-et-un", 21],
122
+ ["vingt-deux", 22],
123
+ ["vingt deux", 22],
124
+ ["vingt-trois", 23],
125
+ ["vingt trois", 23],
126
+ ["vingt-quatre", 24],
127
+ ["vingt quatre", 24],
89
128
  ]);
90
129
  const hourWordMatch = s.match(/([a-z\- ]+?)\s*heures?/);
91
130
  if (!hourWordMatch)
@@ -109,9 +148,21 @@ function parseFrenchClockToHHMM(input) {
109
148
  }
110
149
  else {
111
150
  const MIN = new Map([
112
- ["cinq", 5], ["dix", 10], ["quinze", 15], ["vingt", 20], ["vingt-cinq", 25], ["vingt cinq", 25],
113
- ["trente", 30], ["trente-cinq", 35], ["trente cinq", 35], ["quarante", 40], ["quarante-cinq", 45],
114
- ["quarante cinq", 45], ["cinquante", 50], ["cinquante-cinq", 55], ["cinquante cinq", 55],
151
+ ["cinq", 5],
152
+ ["dix", 10],
153
+ ["quinze", 15],
154
+ ["vingt", 20],
155
+ ["vingt-cinq", 25],
156
+ ["vingt cinq", 25],
157
+ ["trente", 30],
158
+ ["trente-cinq", 35],
159
+ ["trente cinq", 35],
160
+ ["quarante", 40],
161
+ ["quarante-cinq", 45],
162
+ ["quarante cinq", 45],
163
+ ["cinquante", 50],
164
+ ["cinquante-cinq", 55],
165
+ ["cinquante cinq", 55],
115
166
  ]);
116
167
  const minWordMatch = s.match(/heures?\s+([a-z\- ]+?)(?:[).,;]|$)/);
117
168
  if (minWordMatch) {
@@ -123,3 +174,213 @@ function parseFrenchClockToHHMM(input) {
123
174
  }
124
175
  return `${String(hour).padStart(2, "0")}${String(minutes).padStart(2, "0")}`;
125
176
  }
177
+ // Local (self-contained) helpers
178
/**
 * Converts a French date such as "3 février 2025" or "1er octobre 2024" into
 * ISO "YYYY-MM-DD".
 *
 * Month names are accepted with or without accents, in any letter case.
 *
 * @param s Date text; non-breaking spaces and repeated spaces are tolerated.
 * @returns The ISO date, or undefined when the text is empty, does not have
 *          the "<day> <month> <year>" shape, names an unknown month, or names
 *          a day that does not exist in that month.
 */
function frDateToISO(s) {
  if (!s) {
    return;
  }
  const months = {
    janvier: 1,
    février: 2,
    fevrier: 2,
    mars: 3,
    avril: 4,
    mai: 5,
    juin: 6,
    juillet: 7,
    août: 8,
    aout: 8,
    septembre: 9,
    octobre: 10,
    novembre: 11,
    décembre: 12,
    decembre: 12,
  };
  const m = s
    // Compose "e" + combining accent into "é" so the month lookup succeeds.
    .normalize("NFC")
    .trim()
    .replace(/\u00A0/g, " ")
    .replace(/ +/g, " ")
    // The first of the month is written "1er" in French.
    .match(/^(\d{1,2})(?:er)?\s+([a-zéèêîïôûùç]+)\s+(\d{4})$/i);
  if (!m) {
    return;
  }
  const day = parseInt(m[1], 10);
  const mon = months[m[2].toLowerCase()];
  if (!mon) {
    return;
  }
  // Reject days such as "31 février": the date must survive a round trip.
  const probe = new Date(Date.UTC(parseInt(m[3], 10), mon - 1, day));
  if (probe.getUTCMonth() !== mon - 1 || probe.getUTCDate() !== day) {
    return;
  }
  return `${m[3]}-${String(mon).padStart(2, "0")}-${String(day).padStart(2, "0")}`;
}
212
/**
 * Reads the "semaine du <date>" mention from the page's og:title (or <title>)
 * and returns that date in ISO form.
 *
 * @param $ Cheerio document.
 * @returns "YYYY-MM-DD", or undefined when no such mention can be parsed.
 */
function extractWeekStartFromHead($) {
  const og = $('meta[property="og:title"]').attr("content") || $("title").text();
  const m = (og ?? "")
    .normalize("NFC")
    .toLowerCase()
    // \p{L}+ rather than \w+: \w stops at accented letters, so "février",
    // "août" and "décembre" never matched. "1er" is accepted for the 1st.
    .match(/semaine du\s+(\d{1,2})(?:er)?\s+(\p{L}+)\s+(\d{4})/u);
  if (!m) {
    return undefined;
  }
  // Rebuild a plain "<day> <month> <year>" string for the date parser.
  return frDateToISO(`${m[1]} ${m[2]} ${m[3]}`);
}
219
/**
 * Derives an organe label from a page title.
 *
 * The title is lowercased and searched for "commission", optionally followed
 * by "des" or "de", then any text without a colon running to the end of the
 * title. The label is rebuilt as "Commission <that text>" — the "des"/"de"
 * connector is not kept — with whitespace collapsed.
 *
 * @param s Page title; may be missing.
 * @returns `organeTitleRaw` (the trimmed title) and `organeDetected`; both are
 *          undefined for an empty title, and `organeDetected` alone is
 *          undefined when the pattern does not match.
 */
function detectOrganeFromTitle(s) {
  const title = (s ?? "").trim();
  if (title === "") {
    return { organeTitleRaw: undefined, organeDetected: undefined };
  }
  const match = title.toLowerCase().match(/commission(?:\s+des|\s+de|)\s+([^:]+)$/i);
  if (!match || !match[1]) {
    return { organeTitleRaw: title, organeDetected: undefined };
  }
  const label = `Commission ${match[1]}`
    .replace(/\s+/g, " ")
    .replace(/\s+:? comptes? rendus?$/i, "")
    .trim();
  return { organeTitleRaw: title, organeDetected: label[0].toUpperCase() + label.slice(1) };
}
235
/**
 * Lists the days covered by a report page, with the meeting opening time when
 * one is stated.
 *
 * A day is an <h2> whose text contains a French weekday name followed by a date
 * that frDateToISO can parse. The opening time is looked up in the siblings
 * that follow the heading, up to the next <h2>.
 *
 * @param $ Cheerio document.
 * @returns `{ date, openTime, h2Index }` entries in document order; `openTime`
 *          is "HH:MM" or undefined, `h2Index` is the heading's position among
 *          all <h2> elements.
 */
function extractDaysAndOpenings($) {
  const dayHeading = /(?:Lundi|Mardi|Mercredi|Jeudi|Vendredi|Samedi|Dimanche)\s+(.+)$/i;
  // Accepts "9h30", "9 h 30", "9 heures 30", "9 heures" and a bare "9".
  // Requiring the "h" right after the digits dropped the minutes of "9 h 30".
  const openingSentence = /La réunion est ouverte à\s+(\d{1,2})(?:\s*h(?:eures?)?(?:\s*(\d{2}))?)?/i;
  const days = [];
  const h2s = $("h2").toArray();
  for (let i = 0; i < h2s.length; i++) {
    const h = h2s[i];
    const txt = $(h).text().trim();
    const m = txt.match(dayHeading);
    if (!m) {
      continue;
    }
    const iso = frDateToISO(m[1]);
    if (!iso) {
      continue;
    }
    let openTime;
    // Walk the following siblings until the next day heading.
    let cur = $(h).next();
    while (cur.length && cur[0].tagName !== "h2") {
      // NFC so that a decomposed "é" / "à" still matches the pattern.
      const t = cur.text().normalize("NFC").replace(/\s+/g, " ").trim();
      const mt = t.match(openingSentence);
      if (mt) {
        openTime = `${mt[1].padStart(2, "0")}:${(mt[2] ?? "00").padStart(2, "0")}`;
        break;
      }
      cur = cur.next();
    }
    days.push({ date: iso, openTime, h2Index: i });
  }
  return days;
}
262
/**
 * Returns the first named anchor whose name is made of 3 to 6 uppercase ASCII
 * letters, or undefined when the page has none.
 *
 * @param $ Cheerio document.
 */
function extractOrganeCode($) {
  for (const anchor of $("a[name]").toArray()) {
    const name = ($(anchor).attr("name") || "").trim();
    if (/^[A-Z]{3,6}$/.test(name)) {
      return name;
    }
  }
  return undefined;
}
268
/**
 * Extracts descriptive metadata from the HTML of a commission report page.
 *
 * The organe is detected from the `h1.page-title` text, falling back to the
 * og:title / <title>. When the head does not state the week's start date, the
 * first dated heading is used instead.
 *
 * @param html Full HTML markup of the page.
 * @param sourceFileName Optional name of the originating file.
 * @returns The metadata object; scalar fields that could not be determined are null.
 */
export function parseCommissionMetadataFromHtml(html, sourceFileName) {
  const $ = cheerio.load(html);
  const pageHeading = $("h1.page-title").first().text().trim();
  const headTitle = $('meta[property="og:title"]').attr("content") || $("title").text();
  const organe = detectOrganeFromTitle(pageHeading || headTitle || undefined);
  const weekStartFromHead = extractWeekStartFromHead($);
  const days = extractDaysAndOpenings($);
  const firstDay = days.length > 0 ? days[0].date : undefined;
  const organeCode = extractOrganeCode($);
  return {
    sourceFile: sourceFileName ?? null,
    organeTitleRaw: organe.organeTitleRaw ?? null,
    organeDetected: organe.organeDetected ?? null,
    organeCode: organeCode ?? null,
    weekStart: (weekStartFromHead || firstDay) ?? null,
    days, // [{date, openTime?, h2Index}]
  };
}
287
/**
 * Tells whether a parsed JSON value has the minimal shape of a grouped réunion:
 * a non-null object with string `uid` and `date` properties.
 *
 * @param o Any value.
 * @returns Always a boolean (never the falsy input itself).
 */
function isGroupedReunion(o) {
  return typeof o === "object" && o !== null && typeof o.uid === "string" && typeof o.date === "string";
}
290
/**
 * Loads the commission agenda groups stored for a given date.
 *
 * Scans `<dataDir>/<agenda>/<transformed>/<session>` for JSON files whose name
 * starts with `RUSN<yyyymmdd>IDC`. A file may hold a single group or an array
 * of groups (the shape written by createCommissionGroupIfMissing); only entries
 * whose uid carries the same prefix are returned.
 *
 * @param dataDir Root data directory.
 * @param yyyymmdd Date as eight digits.
 * @param session Session start year, used as sub-folder name.
 * @returns The matching groups; an empty array when the folder does not exist.
 */
export async function loadCommissionAgendaForDate(dataDir, yyyymmdd, session) {
  const baseDir = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, String(session));
  if (!(await fs.pathExists(baseDir))) {
    return [];
  }
  const uidPrefix = `RUSN${yyyymmdd}IDC`;
  const files = (await fs.readdir(baseDir)).filter((f) => f.startsWith(uidPrefix) && f.toLowerCase().endsWith(".json"));
  const out = [];
  for (const f of files) {
    const p = path.join(baseDir, f);
    let parsed;
    try {
      parsed = JSON.parse(await fs.readFile(p, "utf8"));
    } catch (e) {
      // Best effort: a broken file is reported and skipped, not fatal.
      console.warn(`[COM-CR][agenda] Skipping unreadable ${p}: ${e?.message ?? e}`);
      continue;
    }
    const candidates = Array.isArray(parsed) ? parsed : [parsed];
    for (const candidate of candidates) {
      if (isGroupedReunion(candidate) && candidate.uid.startsWith(uidPrefix)) {
        out.push(candidate);
      }
    }
  }
  return out;
}
315
/**
 * Turns a compact "HHMM" hour into "HH:MM".
 *
 * @param hourShort Four-digit hour, "NA", or a missing value.
 * @returns "HH:MM", or null when the input is falsy, "NA" or not exactly four digits.
 */
function hourShortToStartTime(hourShort) {
  const usable = Boolean(hourShort) && hourShort !== "NA" && /^\d{4}$/.test(hourShort);
  if (!usable) {
    return null;
  }
  return [hourShort.slice(0, 2), hourShort.slice(2, 4)].join(":");
}
324
/**
 * Ensures that an agenda group exists for a commission meeting that has no
 * matching agenda entry.
 *
 * The uid comes from makeTypeGroupUid (date, "COM", hour, organe) and the group
 * is stored in `<uid>.json` under the session's transformed agenda folder.
 * Nothing is written when the file already holds a group with that uid.
 *
 * @param dataDir Root data directory.
 * @param dateISO Meeting date, "YYYY-MM-DD".
 * @param organeDetected Organe label, e.g. "Commission des finances", or null.
 * @param hourShort Opening hour as "HHMM", or "NA" / null when unknown.
 * @param titreGuess Title stored on the created group.
 * @returns The uid, the file path, and whether the file was written by this call.
 */
export async function createCommissionGroupIfMissing(dataDir, dateISO, // "YYYY-MM-DD"
organeDetected, // ex. "Commission des finances"
hourShort, // "HHMM" | "NA"
titreGuess) {
  const uid = makeTypeGroupUid(dateISO, "COM", hourShort ?? "NA", organeDetected ?? undefined);
  // NOTE(review): a date-only ISO string is parsed as UTC midnight, whereas the
  // download script builds local dates for the same helper — confirm which
  // getters sessionStartYearFromDate relies on.
  const session = sessionStartYearFromDate(new Date(dateISO));
  // Same folder constants as loadCommissionAgendaForDate, so both functions
  // always look at the same directory.
  const dir = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, String(session));
  await fs.ensureDir(dir);
  const filePath = path.join(dir, `${uid}.json`);

  let groups = [];
  if (await fs.pathExists(filePath)) {
    try {
      const parsed = JSON.parse(await fs.readFile(filePath, "utf8"));
      if (Array.isArray(parsed)) {
        groups = parsed;
      } else if (parsed && typeof parsed === "object") {
        // Keep a file holding a single group instead of discarding its content.
        groups = [parsed];
      }
    } catch {
      // Unreadable or invalid JSON: start again from an empty list.
      groups = [];
    }
  }

  if (groups.some((g) => g?.uid === uid)) {
    return { uid, filePath, created: false };
  }

  groups.push({
    uid,
    chambre: "SN",
    date: dateISO,
    slot: null,
    type: organeDetected ?? "Commission",
    startTime: hourShortToStartTime(hourShort),
    endTime: null,
    captationVideo: false,
    titre: titreGuess ?? null,
    objet: null,
    reunions: [],
    compteRenduRefUid: null,
  });
  await fs.writeJSON(filePath, groups, { spaces: 2 });
  return { uid, filePath, created: true };
}
@@ -1,6 +1,9 @@
1
1
  import { AgendaEvent, GroupedReunion, TimeSlot } from "../types/agenda";
2
+ type KnownType = "SP" | "COM" | "MC" | "OD" | "ID";
2
3
  export declare function groupNonSPByTypeOrganeHour(events: AgendaEvent[]): Record<"IDC" | "IDM" | "IDO" | "IDI", GroupedReunion[]>;
3
4
  export declare function groupSeancePubliqueBySlot(events: AgendaEvent[]): GroupedReunion[];
5
+ export declare function makeTypeGroupUid(dateISO: string, kind: KnownType, hourShort: string | null, organe?: string | null): string;
4
6
  export declare function makeGroupUid(date: string, slot: TimeSlot): string;
5
7
  export declare function formatYYYYMMDD(dateYYYYMMDD: string): string;
6
8
  export declare function makeReunionUid(agenda: AgendaEvent): string;
9
+ export {};
@@ -243,7 +243,7 @@ function organeInitials(input, maxLen = 8) {
243
243
  const out = letters.join("");
244
244
  return out.slice(0, maxLen);
245
245
  }
246
- function makeTypeGroupUid(dateISO, kind, hourShort, organe) {
246
+ export function makeTypeGroupUid(dateISO, kind, hourShort, organe) {
247
247
  const ymd = dateISO ? formatYYYYMMDD(dateISO) : "00000000";
248
248
  const suffix = typeToSuffixStrict(kind);
249
249
  const hh = hourShort ?? "NA";