@tricoteuses/senat 2.10.0 → 2.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,69 +1,153 @@
1
- import { JSDOM } from "jsdom";
1
+ import fs from "fs";
2
2
  import * as cheerio from "cheerio";
3
- const norm = (s) => s.replace(/\u00A0/g, " ").replace(/\s+/g, " ").trim();
4
- const toTexte = (s) => ({ _: s });
5
- function extractSommaire($) {
6
- const root = $("#wysiwyg").length ? $("#wysiwyg") : $("#cri");
7
- const sommaire = {
8
- presidentSeance: toTexte(""),
9
- sommaire1: [],
10
- };
11
- // (1) presidency line (e.g., "Présidence de Mme …")
12
- const pres = root.find("p.tm2").filter((_, el) => /présidence/i.test($(el).text())).first();
13
- if (pres.length) {
14
- sommaire.presidentSeance = toTexte(norm(pres.text()));
3
+ import path from "path";
4
+ import { computeIntervalsBySlot } from "../utils/cr_spliting";
5
+ import { norm } from "./util";
6
+ const asArray = (x) => x == null ? [] : Array.isArray(x) ? x : [x];
7
+ const toInt = (s) => Number.isFinite(Number(s)) ? Number(s) : Number.POSITIVE_INFINITY;
8
+ export async function parseCompteRenduSlotFromFile(xmlFilePath, wantedSlot, firstSlotOfDay) {
9
+ try {
10
+ const raw = fs.readFileSync(xmlFilePath, "utf8");
11
+ const $ = cheerio.load(raw, { xml: false });
12
+ const metadonnees = extractMetadonnees($, xmlFilePath);
13
+ const order = $("body *").toArray();
14
+ const idx = new Map(order.map((el, i) => [el, i]));
15
+ const intervalsAll = computeIntervalsBySlot($, idx, firstSlotOfDay);
16
+ const intervals = intervalsAll.filter(iv => iv.slot === wantedSlot);
17
+ if (intervals.length === 0) {
18
+ console.warn(`[CRI] no intervals for ${path.basename(xmlFilePath)} [${wantedSlot}]`);
19
+ return null;
20
+ }
21
+ metadonnees.sommaire = extractSommaireForIntervals($, idx, intervals);
22
+ const points = [];
23
+ let ordre = 0;
24
+ const addPoint = (p) => points.push({ ...p, ordre_absolu_seance: String(++ordre) });
25
+ // Titles
26
+ $("cri\\:titreS1 p.titre_S1").each((_, el) => {
27
+ if (!elementInAnyInterval(el, idx, intervals))
28
+ return;
29
+ const t = normalizeTitle(norm($(el).text() || ""));
30
+ if (t)
31
+ addPoint({ code_grammaire: "TITRE_TEXTE_DISCUSSION", texte: { _: t }, code_style: "Titre" });
32
+ });
33
+ // Interventions
34
+ $("div.intervenant").each((_, block) => {
35
+ if (!elementInAnyInterval(block, idx, intervals))
36
+ return;
37
+ const $block = $(block);
38
+ $block.find([
39
+ "p[class^='titre_S']",
40
+ "p.mention_titre",
41
+ "p.intitule_titre",
42
+ "p.mention_chapitre",
43
+ "p.intitule_chapitre",
44
+ "p.mention_article",
45
+ "p.intitule_article",
46
+ "p.mention_section",
47
+ "p.intitule_section",
48
+ ].join(",")).remove();
49
+ const firstP = $block.find("p").first();
50
+ const speakerLabelRaw = firstP.find(".orateur_nom").text() || firstP.find("a.lien_senfic").text() || "";
51
+ const speakerLabel = dedupeSpeaker(speakerLabelRaw);
52
+ const { mat, nom: nomCRI, qua: quaCRI } = readIntervenantMeta($block);
53
+ const qualFromSpans = extractAndRemoveLeadingQualite($, $block);
54
+ const qualite = norm(decodeHtmlEntities(quaCRI || "")) || qualFromSpans;
55
+ const canonicalName = dedupeSpeaker(nomCRI || speakerLabel);
56
+ const role = roleForSpeaker(speakerLabel) || roleForSpeaker(qualite) || roleForSpeaker(quaCRI || "");
57
+ const speechHtml = sanitizeInterventionHtml($, $block);
58
+ if (!norm(cheerio.load(speechHtml).text() || ""))
59
+ return;
60
+ addPoint({
61
+ code_grammaire: "PAROLE_GENERIQUE",
62
+ roledebat: role,
63
+ orateurs: { orateur: { nom: canonicalName, id: mat || "", qualite } },
64
+ texte: { _: speechHtml },
65
+ });
66
+ });
67
+ const contenu = {
68
+ quantiemes: { journee: metadonnees.dateSeance, session: metadonnees.session },
69
+ point: points,
70
+ };
71
+ return {
72
+ uid: "CRSSN" + xmlFilePath.replace(/^.*?(\d{8}).*$/i, "$1") + `-${wantedSlot}`,
73
+ seanceRef: "RUSN" + xmlFilePath.replace(/^.*?(\d{8}).*$/i, "$1") + "IDS-" + wantedSlot,
74
+ sessionRef: metadonnees.session,
75
+ metadonnees,
76
+ contenu,
77
+ };
15
78
  }
16
- // (2) extra info lines like "Secrétaires :" (tm5)
17
- const paras = [];
18
- root.find("p.tm5").each((_, el) => {
19
- const t = norm($(el).text());
20
- if (t)
21
- paras.push(toTexte(t));
22
- });
23
- if (paras.length) {
24
- sommaire.para = paras.length === 1 ? paras[0] : paras;
79
+ catch (e) {
80
+ console.error(`[CRI] parseSlot error file=${xmlFilePath} slot=${wantedSlot}:`, e);
81
+ return null;
25
82
  }
26
- // (3) first-level items (tm3)
27
- const items = [];
28
- root.find("p.tm3").each((_, el) => {
29
- const $p = $(el);
30
- const full = norm($p.text());
31
- // try to extract the numeric order at the start: "1. ..." or "2 – ..." etc.
32
- const numMatch = full.match(/^(\d+)\s*[.\-–—]/);
33
- const valeur = numMatch ? numMatch[1] : undefined;
34
- // prefer the linked title text; fallback to full text
35
- const a = $p.find("a").first();
36
- const intitule = norm(a.text() || full.replace(/^(\d+)\s*[.\-–—]\s*/, ""));
37
- // id_syceron = href target without '#' ? TODO verify
38
- const href = a.attr("href") || "";
39
- const idSyceron = href.startsWith("#") ? href.slice(1) : href;
40
- const titreStruct = {
41
- id_syceron: idSyceron || "",
42
- intitule,
43
- };
44
- const elem = {
45
- valeur_pts_odj: valeur,
46
- titreStruct,
47
- // sommaire2/3 undefined (first level only)
83
+ }
84
+ export function sessionStartYearFromDate(d) {
85
+ // Session (1th oct N → 30 sept N+1)
86
+ const m = d.getMonth();
87
+ const y = d.getFullYear();
88
+ return m >= 9 ? y : y - 1;
89
+ }
90
+ export function parseYYYYMMDD(yyyymmdd) {
91
+ if (!/^\d{8}$/.test(yyyymmdd))
92
+ return null;
93
+ const y = Number(yyyymmdd.slice(0, 4));
94
+ const m = Number(yyyymmdd.slice(4, 6)) - 1;
95
+ const d = Number(yyyymmdd.slice(6, 8));
96
+ const dt = new Date(y, m, d);
97
+ return Number.isFinite(dt.getTime()) ? dt : null;
98
+ }
99
+ export function deriveTitreObjetFromSommaire(sommaire, slot) {
100
+ const items = extractLevel1Items(sommaire);
101
+ const meaningful = items.filter(it => !isBoilerplate(it.label));
102
+ if (meaningful.length === 0) {
103
+ return {
104
+ titre: `Séance publique ${slotLabel(slot)}`,
105
+ objet: "",
48
106
  };
49
- items.push(elem);
50
- });
51
- if (items.length) {
52
- sommaire.sommaire1 = items;
53
107
  }
54
- return sommaire;
108
+ const titre = meaningful[0].label;
109
+ const objet = meaningful.slice(0, 3).map(it => it.label).join(" ; ");
110
+ return { titre, objet };
111
+ }
112
+ function slotLabel(slot) {
113
+ switch (slot) {
114
+ case "MATIN": return "du matin";
115
+ case "APRES-MIDI": return "de l’après-midi";
116
+ case "SOIR": return "du soir";
117
+ default: return "";
118
+ }
55
119
  }
56
- function stripTrailingPunct(s) {
57
- return s.replace(/\s*([:,.;])\s*$/u, "").trim();
120
+ const BOILERPLATE_PATTERNS = [
121
+ /proc(?:è|e)s-?verbal/i,
122
+ /hommages?/i,
123
+ /désignation des vice-?président/i,
124
+ /candidatures? aux?/i,
125
+ /ordre du jour/i,
126
+ /rappels? au règlement/i,
127
+ /communications?/i,
128
+ /dépôts?/i,
129
+ /proclamation/i,
130
+ /présidence de/i,
131
+ /questions? diverses?/i,
132
+ /ouverture de la séance/i,
133
+ /clo(?:t|̂)ure de la séance/i,
134
+ ];
135
+ const isBoilerplate = (label) => !label?.trim() || BOILERPLATE_PATTERNS.some(rx => rx.test(label));
136
+ function extractLevel1Items(sommaire) {
137
+ const level1 = asArray(sommaire?.sommaire1);
138
+ return level1
139
+ .map(el => ({
140
+ numero: toInt(el?.valeur_pts_odj),
141
+ label: String(el?.titreStruct?.intitule ?? "").trim(),
142
+ }))
143
+ .filter(it => !!it.label)
144
+ .sort((a, b) => a.numero - b.numero);
58
145
  }
146
+ function stripTrailingPunct(s) { return s.replace(/\s*([:,.;])\s*$/u, "").trim(); }
59
147
  function dedupeSpeaker(raw) {
60
148
  let s = norm(raw);
61
149
  s = stripTrailingPunct(s);
62
- const dupPatterns = [
63
- /^(.+?)\s*[.]\s*\1$/u,
64
- /^(.+?)\s*,\s*\1,?$/u,
65
- /^(.+?)\s+\1$/u,
66
- ];
150
+ const dupPatterns = [/^(.+?)\s*[.]\s*\1$/u, /^(.+?)\s*,\s*\1,?$/u, /^(.+?)\s+\1$/u];
67
151
  for (const re of dupPatterns) {
68
152
  const m = s.match(re);
69
153
  if (m) {
@@ -74,12 +158,10 @@ function dedupeSpeaker(raw) {
74
158
  return s.replace(/\.\s*$/, "");
75
159
  }
76
160
  function decodeHtmlEntities(s) {
77
- return s
78
- .replace(/&#(\d+);/g, (_, d) => String.fromCharCode(parseInt(d, 10)))
161
+ return s.replace(/&#(\d+);/g, (_, d) => String.fromCharCode(parseInt(d, 10)))
79
162
  .replace(/&#x([0-9a-fA-F]+);/g, (_, h) => String.fromCharCode(parseInt(h, 16)));
80
163
  }
81
164
  function fixApostrophes(s) {
82
- // Tighten spacing around French apostrophes and punctuation
83
165
  let out = s;
84
166
  out = out.replace(/\s*’\s*/g, "’");
85
167
  out = out.replace(/\b([dljctmsn])\s*’/gi, (_, m) => m + "’");
@@ -87,43 +169,33 @@ function fixApostrophes(s) {
87
169
  out = out.replace(/\s+([,;:.!?])/g, "$1");
88
170
  return out;
89
171
  }
90
- function normalizeTitle(text) {
91
- return text.replace(/^PR[ÉE]SIDENCE DE\b/i, "Présidence de ");
92
- }
172
+ function normalizeTitle(text) { return text.replace(/^PR[ÉE]SIDENCE DE\b/i, "Présidence de "); }
93
173
  function roleForSpeaker(labelOrQualite) {
94
- const s = labelOrQualite.toLowerCase();
95
- if (/^(m\.|mme)?\s*(le|la)\s+pr[ée]sident(e)?\b/.test(s) ||
96
- /\bpr[ée]sident[e]?\s+de\s+séance\b/.test(s)) {
174
+ const s = (labelOrQualite || "").toLowerCase();
175
+ if (/^(m\.|mme)?\s*(le|la)\s+pr[ée]sident(e)?\b/.test(s) || /\bpr[ée]sident[e]?\s+de\s+séance\b/.test(s))
97
176
  return "président";
98
- }
99
177
  return "";
100
178
  }
101
- // ---------------- DOM helpers ----------------
102
- function parseCriIntervenantComment(html) {
103
- // From <!-- cri:intervenant mat="..." nom="..." qua="..." ... -->
179
+ function readIntervenantMeta($block) {
180
+ const int = $block.find('cri\\:intervenant').first();
181
+ if (int.length)
182
+ return { mat: int.attr("mat") || undefined, nom: int.attr("nom") || undefined, qua: int.attr("qua") || undefined };
183
+ const html = $block.html() || "";
104
184
  const m = html.match(/<!--\s*cri:intervenant\b([^>]+)-->/i);
105
185
  if (!m)
106
186
  return {};
107
- const attrs = m[1];
108
187
  const out = {};
109
188
  const re = /(\w+)="([^"]*)"/g;
110
189
  let a;
111
- while ((a = re.exec(attrs))) {
190
+ while ((a = re.exec(m[1])))
112
191
  out[a[1]] = decodeHtmlEntities(a[2]);
113
- }
114
192
  return { mat: out["mat"], nom: out["nom"], qua: out["qua"] };
115
193
  }
116
- /**
117
- * Extract leading .orateur_qualite chunks from the FIRST <p> only,
118
- * concatenate them, clean punctuation/apostrophes, and REMOVE those nodes
119
- * (and .orateur_nom) from the first paragraph so the speech starts cleanly.
120
- */
121
194
  function extractAndRemoveLeadingQualite($, $block) {
122
195
  const firstP = $block.find("p").first();
123
196
  if (firstP.length === 0)
124
197
  return "";
125
198
  const parts = [];
126
- // Iterate over the first <p>'s children from the start
127
199
  let stop = false;
128
200
  firstP.contents().each((_, node) => {
129
201
  if (stop)
@@ -131,7 +203,6 @@ function extractAndRemoveLeadingQualite($, $block) {
131
203
  if (node.type === "tag") {
132
204
  const $node = $(node);
133
205
  if ($node.hasClass("orateur_nom")) {
134
- // speaker label node — remove it
135
206
  $node.remove();
136
207
  return;
137
208
  }
@@ -140,66 +211,95 @@ function extractAndRemoveLeadingQualite($, $block) {
140
211
  $node.remove();
141
212
  return;
142
213
  }
143
- // Non-qualite tag: if it has meaningful text, we reached the speech
144
214
  const t = norm($node.text() || "");
145
- if (t) {
215
+ if (t)
146
216
  stop = true;
147
- }
148
- else {
149
- // empty-ish node; remove to avoid stray punctuation
217
+ else
150
218
  $node.remove();
151
- }
152
219
  }
153
220
  else if (node.type === "text") {
154
221
  const t = norm(node.data || "");
155
- if (!t) {
156
- // whitespace only — drop it
157
- ;
158
- node.data = "";
159
- return;
160
- }
161
- // boundary punctuation like ":" just after label — drop it
162
- if (/^[:.,;–—-]+$/.test(t)) {
163
- ;
222
+ if (!t || /^[:.,;–—-]+$/.test(t)) {
164
223
  node.data = "";
165
224
  return;
166
225
  }
167
- // any other text means speech starts here
168
226
  stop = true;
169
227
  }
170
- else {
171
- // comment or others — ignore
172
- }
173
228
  });
174
- const qual = fixApostrophes(norm(parts.join(" ")));
175
- return qual;
229
+ return fixApostrophes(norm(parts.join(" ")));
176
230
  }
177
231
  function sanitizeInterventionHtml($, $block) {
178
- // Clone to avoid mutating outer tree order
179
232
  const $clone = $block.clone();
180
- // Remove navigation / anchors / images
181
233
  $clone.find('a[name]').remove();
182
234
  $clone.find('div[align="right"]').remove();
183
235
  $clone.find('a.link').remove();
184
236
  $clone.find('img').remove();
185
- // Remove technical anchors inside interventions
186
237
  $clone.find('a#ameli_amendement_cri_phrase, a#ameli_amendement_cra_contenu, a#ameli_amendement_cra_objet').remove();
187
- // Remove any remaining speaker label / quality spans anywhere
188
- $clone.find(".orateur_nom").remove();
189
- $clone.find(".orateur_qualite").remove();
190
- // Strip HTML comments
238
+ $clone.find(".orateur_nom, .orateur_qualite").remove();
191
239
  let html = $clone.html() || "";
192
240
  html = html.replace(/<!--[\s\S]*?-->/g, "");
193
241
  return html.trim();
194
242
  }
195
- function extractMetadonnees($) {
196
- const headerText = norm($("h1.page-title").text() || "");
197
- const dateMatch = headerText.match(/\b(\d{1,2}\s+\w+\s+\d{4})\b/i);
198
- const bodyText = norm($("#cri").text() || "");
199
- const sessionMatch = bodyText.match(/\bsession\s+(\d{4}-\d{4})\b/i);
243
+ function extractSommaireForIntervals($, idx, intervals) {
244
+ const inIv = (el) => elementInAnyInterval(el, idx, intervals);
245
+ const root = $("body");
246
+ const sommaire = { presidentSeance: { _: "" }, sommaire1: [] };
247
+ // (1) Présidence (tm2) — première ligne dans l’intervalle
248
+ const pres = root.find("p.tm2").filter((_, el) => inIv(el)).first();
249
+ if (pres.length)
250
+ sommaire.presidentSeance = { _: norm(pres.text()) };
251
+ // (2) Paras tm5 présents dans l’intervalle
252
+ const paras = [];
253
+ root.find("p.tm5").each((_, el) => {
254
+ if (!inIv(el))
255
+ return;
256
+ const t = norm($(el).text());
257
+ if (t)
258
+ paras.push({ _: t });
259
+ });
260
+ if (paras.length)
261
+ sommaire.para = paras.length === 1 ? paras[0] : paras;
262
+ // (3) Items de 1er niveau (tm3) présents dans l’intervalle
263
+ const items = [];
264
+ root.find("p.tm3").each((_, el) => {
265
+ if (!inIv(el))
266
+ return;
267
+ const $p = $(el);
268
+ const full = norm($p.text() || "");
269
+ if (!full)
270
+ return;
271
+ const numMatch = full.match(/^(\d+)\s*[.\-–—]\s*/);
272
+ const valeur = numMatch ? numMatch[1] : undefined;
273
+ // prefere intitule in ancre <a> if present
274
+ const a = $p.find("a").first();
275
+ const intituleRaw = a.length ? a.text() : full.replace(/^(\d+)\s*[.\-–—]\s*/, "");
276
+ const intitule = norm(intituleRaw);
277
+ // id_syceron from href="#Niv1_SOMx"
278
+ const href = (a.attr("href") || "").trim();
279
+ const idSyceron = href.startsWith("#") ? href.slice(1) : href;
280
+ const titreStruct = { id_syceron: idSyceron || "", intitule };
281
+ items.push({ valeur_pts_odj: valeur, titreStruct });
282
+ });
283
+ if (items.length)
284
+ sommaire.sommaire1 = items;
285
+ return sommaire;
286
+ }
287
+ function extractMetadonnees($, filePath) {
288
+ let dateText = norm($("h1, h2, .page-title").first().text() || "");
289
+ if (!dateText)
290
+ dateText = norm($("p").first().text() || "");
291
+ const dateMatch = dateText.match(/\b(\d{1,2}\s+\w+\s+\d{4})\b/i);
292
+ const allText = norm($("body").text() || "");
293
+ const sessionMatch = allText.match(/\bsession\s+(\d{4}-\d{4})\b/i);
294
+ let dateSeance = dateMatch?.[1] || "";
295
+ if (!dateSeance) {
296
+ const m = filePath.match(/d(\d{4})(\d{2})(\d{2})\.xml$/i);
297
+ if (m)
298
+ dateSeance = `${m[1]}-${m[2]}-${m[3]}`;
299
+ }
200
300
  return {
201
- dateSeance: dateMatch?.[1] || "",
202
- dateSeanceJour: dateMatch?.[1] || "",
301
+ dateSeance,
302
+ dateSeanceJour: dateSeance,
203
303
  numSeanceJour: "",
204
304
  numSeance: "",
205
305
  typeAssemblee: "SN",
@@ -211,103 +311,15 @@ function extractMetadonnees($) {
211
311
  diffusion: "",
212
312
  version: "1.0",
213
313
  environnement: "",
214
- heureGeneration: new Date(),
215
- sommaire: extractSommaire($)
314
+ heureGeneration: new Date()
216
315
  };
217
316
  }
218
- // ---------------- main transform ----------------
219
- export async function parseCompteRenduFromFile(htmlFilePath) {
220
- try {
221
- const { window } = await JSDOM.fromFile(htmlFilePath, { contentType: "text/html" });
222
- const $ = cheerio.load(window.document.documentElement.outerHTML);
223
- const metadonnees = extractMetadonnees($);
224
- const points = [];
225
- let ordre = 0;
226
- const addPoint = (p) => points.push({ ...p, ordre_absolu_seance: String(++ordre) });
227
- // (1) Global section titles (common high-level headings)
228
- let lastTitle = "";
229
- $("#cri p[class^='titre_S']").each((_, el) => {
230
- const t = normalizeTitle(norm($(el).text() || ""));
231
- if (t && t !== lastTitle) {
232
- addPoint({ code_grammaire: "TITRE_TEXTE_DISCUSSION", texte: { _: t }, code_style: "Titre" });
233
- lastTitle = t;
234
- }
235
- });
236
- // (2) Interventions
237
- $("#cri div.intervenant").each((_, block) => {
238
- const $block = $(block);
239
- // (2.a) Extract internal structural titles inside this block (and remove them)
240
- const structuralSel = [
241
- "p[class^='titre_S']",
242
- "p.mention_titre",
243
- "p.intitule_titre",
244
- "p.mention_chapitre",
245
- "p.intitule_chapitre",
246
- "p.mention_article",
247
- "p.intitule_article",
248
- "p.mention_section",
249
- "p.intitule_section",
250
- ].join(",");
251
- $block.find(structuralSel).each((__, el) => {
252
- const title = normalizeTitle(norm($(el).text() || ""));
253
- if (title && title !== lastTitle) {
254
- addPoint({ code_grammaire: "TITRE_TEXTE_DISCUSSION", texte: { _: title }, code_style: "Titre" });
255
- lastTitle = title;
256
- }
257
- $(el).remove();
258
- });
259
- // (2.b) Speaker label & quality
260
- const firstP = $block.find("p").first();
261
- const speakerLabelRaw = firstP.find(".orateur_nom").text() ||
262
- firstP.find("a.lien_senfic").text() ||
263
- "";
264
- const speakerLabel = dedupeSpeaker(speakerLabelRaw);
265
- // Prefer <!--cri:intervenant ...--> for id/name/qualite when available
266
- const rawHtml = $block.html() || "";
267
- const { mat, nom: nomFromComment, qua: quaFromCommentRaw } = parseCriIntervenantComment(rawHtml);
268
- // Extract and remove leading .orateur_qualite chunks from first <p>
269
- const qualFromSpans = extractAndRemoveLeadingQualite($, $block);
270
- const qualite = norm(decodeHtmlEntities(quaFromCommentRaw || "")) ||
271
- qualFromSpans;
272
- const canonicalName = dedupeSpeaker(nomFromComment || speakerLabel);
273
- const role = roleForSpeaker(speakerLabel) ||
274
- roleForSpeaker(qualite) ||
275
- roleForSpeaker(quaFromCommentRaw || "");
276
- // (2.c) Build cleaned speech HTML
277
- let speechHtml = sanitizeInterventionHtml($, $block);
278
- // If nothing meaningful remains, skip
279
- if (!norm(cheerio.load(speechHtml).text() || ""))
280
- return;
281
- addPoint({
282
- code_grammaire: "PAROLE_GENERIQUE",
283
- roledebat: role,
284
- orateurs: {
285
- orateur: {
286
- nom: canonicalName,
287
- id: mat || "",
288
- qualite: qualite,
289
- },
290
- },
291
- texte: { _: speechHtml },
292
- });
293
- });
294
- const contenu = {
295
- quantiemes: {
296
- journee: metadonnees.dateSeance,
297
- session: metadonnees.session,
298
- },
299
- point: points,
300
- };
301
- return {
302
- uid: htmlFilePath.replace(/^.*?(\d{8}).*$/i, "$1"),
303
- seanceRef: htmlFilePath.replace(/^.*?(\d{8}).*$/i, "$1"),
304
- sessionRef: metadonnees.session,
305
- metadonnees,
306
- contenu,
307
- };
308
- }
309
- catch (e) {
310
- console.error("Could not parse compte-rendu with error", e);
311
- return null;
312
- }
317
+ function elementInAnyInterval(el, idx, intervals) {
318
+ const p = idx.get(el);
319
+ if (p == null)
320
+ return false;
321
+ for (const iv of intervals)
322
+ if (p >= iv.start && p < iv.end)
323
+ return true;
324
+ return false;
313
325
  }
@@ -6,3 +6,4 @@ export declare function removeSubstring(expr: Expression<string | null | undefin
6
6
  export declare function replace(expr: Expression<string | null | undefined>, pattern: Expression<string>, replacement: Expression<string>): import("kysely").RawBuilder<string>;
7
7
  export declare function rtrim(expr: Expression<string | null | undefined>): import("kysely").RawBuilder<string>;
8
8
  export declare function toDateString(expr: Expression<Date | null | undefined>, format?: Expression<string>): import("kysely").RawBuilder<string>;
9
+ export declare function norm(s?: string | null): string;
package/lib/model/util.js CHANGED
@@ -21,3 +21,6 @@ export function rtrim(expr) {
21
21
  export function toDateString(expr, format = sql.val(STANDARD_DATE_FORMAT)) {
22
22
  return sql `to_char(${expr}, ${format})`;
23
23
  }
24
+ export function norm(s) {
25
+ return (s || "").replace(/\u00A0/g, " ").replace(/\s+/g, " ").trim();
26
+ }
@@ -8,7 +8,8 @@ import { parseAgendaFromFile } from "../model/agenda";
8
8
  import { getSessionsFromStart } from "../types/sessions";
9
9
  import { ID_DATE_FORMAT } from "./datautil";
10
10
  import { commonOptions } from "./shared/cli_helpers";
11
- import { ensureAndClearDir } from "./shared/util";
11
+ import { ensureAndClearDir, fetchWithRetry } from "./shared/util";
12
+ import { groupNonSPByTypeOrganeHour, groupSeancePubliqueBySlot } from "../utils/reunion_grouping";
12
13
  const optionsDefinitions = [
13
14
  ...commonOptions,
14
15
  {
@@ -64,7 +65,7 @@ async function downloadAgenda(agendaName, agendaPath) {
64
65
  if (!options["silent"]) {
65
66
  console.log(`Downloading Agenda ${agendaUrl}…`);
66
67
  }
67
- const response = await fetch(agendaUrl);
68
+ const response = await fetchWithRetry(agendaUrl);
68
69
  if (!response.ok) {
69
70
  if (response.status === 404) {
70
71
  console.warn(`Agenda ${agendaUrl} not found`);
@@ -80,15 +81,33 @@ async function downloadAgenda(agendaName, agendaPath) {
80
81
  }
81
82
  fs.writeFileSync(agendaPath, Buffer.from(agendaContent));
82
83
  }
84
+ function writeGroupsAsFiles(dir, groups) {
85
+ for (const g of groups) {
86
+ const outPath = path.join(dir, `${g.uid}.json`);
87
+ fs.writeJSONSync(outPath, g, { spaces: 2 });
88
+ }
89
+ }
83
90
  async function parseAgenda(transformedAgendaSessionDir, agendaFileName, agendaPath) {
84
- if (!options["silent"]) {
91
+ if (!options["silent"])
85
92
  console.log(`Parsing Agenda ${agendaPath}…`);
86
- }
87
93
  const parsedAgendaEvents = await parseAgendaFromFile(agendaPath);
88
- if (!parsedAgendaEvents || parsedAgendaEvents.length === 0) {
94
+ if (!parsedAgendaEvents?.length)
89
95
  return;
96
+ const flatPath = path.join(transformedAgendaSessionDir, `${agendaFileName}.json`);
97
+ fs.writeJSONSync(flatPath, parsedAgendaEvents, { spaces: 2 });
98
+ // 1) SP → groubed by (date, slot)
99
+ const spGrouped = groupSeancePubliqueBySlot(parsedAgendaEvents);
100
+ if (spGrouped.length > 0) {
101
+ writeGroupsAsFiles(transformedAgendaSessionDir, spGrouped);
102
+ }
103
+ // 2) NON-SP → groubed by (date, organe, hour)
104
+ const groupedBySuffix = groupNonSPByTypeOrganeHour(parsedAgendaEvents);
105
+ for (const suffix of ["IDC", "IDM", "IDO", "IDI"]) {
106
+ const groups = groupedBySuffix[suffix] || [];
107
+ if (groups.length > 0) {
108
+ writeGroupsAsFiles(transformedAgendaSessionDir, groups);
109
+ }
90
110
  }
91
- fs.writeJSONSync(path.join(transformedAgendaSessionDir, `${agendaFileName}.json`), parsedAgendaEvents, { spaces: 2 });
92
111
  }
93
112
  async function main() {
94
113
  const dataDir = options["dataDir"];
@@ -1 +1,6 @@
1
- export {};
1
+ /**
2
+ * Needs to be run after retrieve_agenda.ts !
3
+ * - downloads the ZIP of comptes-rendus des débats (CRI) from data.senat.fr
4
+ * - extracts XML files, distributes them by session/year
5
+ */
6
+ export declare function retrieveCriXmlDump(dataDir: string, options?: Record<string, any>): Promise<void>;