@tricoteuses/senat 2.10.0 → 2.10.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.md +22 -22
- package/README.md +116 -116
- package/lib/loaders.d.ts +6 -1
- package/lib/loaders.js +54 -0
- package/lib/model/agenda.js +0 -2
- package/lib/model/compte_rendu.d.ts +9 -2
- package/lib/model/compte_rendu.js +223 -211
- package/lib/model/util.d.ts +1 -0
- package/lib/model/util.js +3 -0
- package/lib/scripts/retrieve_agenda.js +25 -6
- package/lib/scripts/retrieve_comptes_rendus.d.ts +6 -1
- package/lib/scripts/retrieve_comptes_rendus.js +230 -77
- package/lib/scripts/retrieve_comptes_rendus_seance.d.ts +6 -0
- package/lib/scripts/retrieve_comptes_rendus_seance.js +273 -0
- package/lib/scripts/retrieve_videos.js +1 -9
- package/lib/types/agenda.d.ts +19 -2
- package/lib/types/compte_rendu.d.ts +1 -1
- package/lib/utils/cr_spliting.d.ts +7 -0
- package/lib/utils/cr_spliting.js +125 -0
- package/lib/utils/reunion_grouping.d.ts +6 -0
- package/lib/utils/reunion_grouping.js +359 -0
- package/lib/validators/senat.d.ts +0 -0
- package/lib/validators/senat.js +24 -0
- package/package.json +98 -98
- package/lib/raw_types/kysely-table-types.d.ts +0 -5
- package/lib/raw_types/kysely-table-types.js +0 -1
|
@@ -1,69 +1,153 @@
|
|
|
1
|
-
import
|
|
1
|
+
import fs from "fs";
|
|
2
2
|
import * as cheerio from "cheerio";
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
3
|
+
import path from "path";
|
|
4
|
+
import { computeIntervalsBySlot } from "../utils/cr_spliting";
|
|
5
|
+
import { norm } from "./util";
|
|
6
|
+
/**
 * Wraps a value in an array unless it already is one.
 * `null` and `undefined` become an empty array; an existing array is returned as-is.
 */
const asArray = (x) => {
    if (x === null || x === undefined)
        return [];
    return Array.isArray(x) ? x : [x];
};
|
|
7
|
+
/**
 * Converts a value to a number usable as a sort key.
 * Anything that does not carry a number (null, undefined, a blank string, or a value
 * `Number()` cannot turn into a finite number) maps to +Infinity.
 */
const toInt = (s) => {
    // Number(null) and Number("") are both 0; treat them as "no number" instead.
    if (s == null || (typeof s === "string" && s.trim() === ""))
        return Number.POSITIVE_INFINITY;
    const n = Number(s);
    return Number.isFinite(n) ? n : Number.POSITIVE_INFINITY;
};
|
|
8
|
+
/**
 * Parses one slot of a compte-rendu file into a structured record.
 *
 * The file is read synchronously and loaded with cheerio. computeIntervalsBySlot splits
 * the document into intervals; only elements located inside the intervals whose `slot`
 * equals `wantedSlot` contribute to the result.
 *
 * @param xmlFilePath path of the file to parse; its first 8-digit run is embedded in `uid` and `seanceRef`
 * @param wantedSlot slot to extract
 * @param firstSlotOfDay forwarded unchanged to computeIntervalsBySlot
 * @returns the parsed record, or null when no interval matches the slot or when anything throws
 */
export async function parseCompteRenduSlotFromFile(xmlFilePath, wantedSlot, firstSlotOfDay) {
    try {
        const $ = cheerio.load(fs.readFileSync(xmlFilePath, "utf8"), { xml: false });
        const metadonnees = extractMetadonnees($, xmlFilePath);
        // Map every element under <body> to its position in document order.
        const idx = new Map($("body *").toArray().map((el, i) => [el, i]));
        const intervals = computeIntervalsBySlot($, idx, firstSlotOfDay).filter(iv => iv.slot === wantedSlot);
        if (intervals.length === 0) {
            console.warn(`[CRI] no intervals for ${path.basename(xmlFilePath)} [${wantedSlot}]`);
            return null;
        }
        metadonnees.sommaire = extractSommaireForIntervals($, idx, intervals);
        const inSlot = (el) => elementInAnyInterval(el, idx, intervals);
        const points = [];
        // Each point receives a 1-based sequence number, stored as a string.
        const addPoint = (p) => {
            points.push({ ...p, ordre_absolu_seance: String(points.length + 1) });
        };
        // Titles. Note: every title is numbered before any intervention is added.
        for (const el of $("cri\\:titreS1 p.titre_S1").toArray()) {
            if (!inSlot(el))
                continue;
            const title = normalizeTitle(norm($(el).text() || ""));
            if (title)
                addPoint({ code_grammaire: "TITRE_TEXTE_DISCUSSION", texte: { _: title }, code_style: "Titre" });
        }
        // Interventions
        const structuralSelector = [
            "p[class^='titre_S']",
            "p.mention_titre",
            "p.intitule_titre",
            "p.mention_chapitre",
            "p.intitule_chapitre",
            "p.mention_article",
            "p.intitule_article",
            "p.mention_section",
            "p.intitule_section",
        ].join(",");
        for (const block of $("div.intervenant").toArray()) {
            if (!inSlot(block))
                continue;
            const $block = $(block);
            // Structural headings inside the block are dropped from the speech.
            $block.find(structuralSelector).remove();
            // The speaker label must be read before extractAndRemoveLeadingQualite strips it.
            const $firstP = $block.find("p").first();
            const rawLabel = $firstP.find(".orateur_nom").text() || $firstP.find("a.lien_senfic").text() || "";
            const speakerLabel = dedupeSpeaker(rawLabel);
            const meta = readIntervenantMeta($block);
            const qualFromSpans = extractAndRemoveLeadingQualite($, $block);
            const qualite = norm(decodeHtmlEntities(meta.qua || "")) || qualFromSpans;
            const canonicalName = dedupeSpeaker(meta.nom || speakerLabel);
            const role = roleForSpeaker(speakerLabel) || roleForSpeaker(qualite) || roleForSpeaker(meta.qua || "");
            const speechHtml = sanitizeInterventionHtml($, $block);
            // Skip blocks whose cleaned HTML has no text left.
            if (!norm(cheerio.load(speechHtml).text() || ""))
                continue;
            addPoint({
                code_grammaire: "PAROLE_GENERIQUE",
                roledebat: role,
                orateurs: { orateur: { nom: canonicalName, id: meta.mat || "", qualite } },
                texte: { _: speechHtml },
            });
        }
        const contenu = {
            quantiemes: { journee: metadonnees.dateSeance, session: metadonnees.session },
            point: points,
        };
        // First 8-digit run of the path; the whole path is kept when there is none.
        const dateToken = xmlFilePath.replace(/^.*?(\d{8}).*$/i, "$1");
        return {
            uid: `CRSSN${dateToken}-${wantedSlot}`,
            seanceRef: `RUSN${dateToken}IDS-${wantedSlot}`,
            sessionRef: metadonnees.session,
            metadonnees,
            contenu,
        };
    }
    catch (e) {
        console.error(`[CRI] parseSlot error file=${xmlFilePath} slot=${wantedSlot}:`, e);
        return null;
    }
}
|
|
84
|
+
/**
 * Returns the year in which the session containing `d` started.
 * A session runs from 1 October of year N to 30 September of year N+1, so October to
 * December map to the date's own year and January to September to the previous year.
 *
 * @param d a Date, read in local time
 * @returns the session's starting year
 */
export function sessionStartYearFromDate(d) {
    const OCTOBER = 9; // getMonth() is zero-based
    const year = d.getFullYear();
    if (d.getMonth() < OCTOBER)
        return year - 1;
    return year;
}
|
|
90
|
+
/**
 * Parses an 8-digit `YYYYMMDD` string into a local-time Date.
 *
 * @param yyyymmdd the string to parse
 * @returns the Date, or null when the input is not exactly eight digits or does not
 *          name a real calendar day (for example month 13 or 30 February)
 */
export function parseYYYYMMDD(yyyymmdd) {
    if (typeof yyyymmdd !== "string" || !/^\d{8}$/.test(yyyymmdd))
        return null;
    const y = Number(yyyymmdd.slice(0, 4));
    const m = Number(yyyymmdd.slice(4, 6)) - 1;
    const d = Number(yyyymmdd.slice(6, 8));
    // setFullYear (unlike the Date constructor) does not remap years 0-99 to 1900-1999.
    const dt = new Date(2000, 0, 1);
    dt.setFullYear(y, m, d);
    // Date silently rolls out-of-range fields over (31 April becomes 1 May), so the result
    // is always finite; comparing the fields back is what detects an invalid date.
    const isRealDay = dt.getFullYear() === y && dt.getMonth() === m && dt.getDate() === d;
    return isRealDay ? dt : null;
}
|
|
99
|
+
/**
 * Derives a title and an object line from a sommaire.
 *
 * Level-1 items are read with extractLevel1Items and those whose label isBoilerplate
 * flags are discarded. The first remaining label becomes the title; the first three,
 * joined with " ; ", become the object.
 *
 * @param sommaire the sommaire to read
 * @param slot slot used to build the fallback title
 * @returns `{ titre, objet }`; when no item remains, a generic "Séance publique …" title and an empty object
 */
export function deriveTitreObjetFromSommaire(sommaire, slot) {
    const meaningful = extractLevel1Items(sommaire).filter(it => !isBoilerplate(it.label));
    if (meaningful.length === 0) {
        return {
            // slotLabel returns "" for an unrecognised slot; trim so the title has no trailing space.
            titre: `Séance publique ${slotLabel(slot)}`.trim(),
            objet: "",
        };
    }
    const titre = meaningful[0].label;
    const objet = meaningful.slice(0, 3).map(it => it.label).join(" ; ");
    return { titre, objet };
}
|
|
112
|
+
/**
 * Returns the French phrase naming a slot ("du matin", "de l’après-midi", "du soir"),
 * or an empty string for any other value.
 */
function slotLabel(slot) {
    const labels = new Map([
        ["MATIN", "du matin"],
        ["APRES-MIDI", "de l’après-midi"],
        ["SOIR", "du soir"],
    ]);
    return labels.get(slot) ?? "";
}
|
|
56
|
-
|
|
57
|
-
|
|
120
|
+
/**
 * Patterns identifying procedural sommaire entries.
 * Each one is tested case-insensitively and unanchored against an item label.
 */
const BOILERPLATE_PATTERNS = [
    /proc(?:è|e)s-?verbal/i,
    /hommages?/i,
    /désignation des vice-?président/i,
    /candidatures? aux?/i,
    /ordre du jour/i,
    /rappels? au règlement/i,
    /communications?/i,
    /dépôts?/i,
    /proclamation/i,
    /présidence de/i,
    /questions? diverses?/i,
    /ouverture de la séance/i,
    // Accepts "clôture" (precomposed ô), "cloture", and "o" followed by a combining circumflex.
    /cl(?:ô|o\u0302?)ture de la séance/i,
];
/** True when the label is missing, blank, or matches one of BOILERPLATE_PATTERNS. */
const isBoilerplate = (label) => !label?.trim() || BOILERPLATE_PATTERNS.some(rx => rx.test(label));
|
|
136
|
+
/**
 * Lists the level-1 entries of a sommaire as `{ numero, label }` pairs.
 * Entries whose trimmed `titreStruct.intitule` is empty are dropped, and the remainder
 * is sorted by ascending `numero` (the toInt conversion of `valeur_pts_odj`).
 */
function extractLevel1Items(sommaire) {
    const items = [];
    for (const el of asArray(sommaire?.sommaire1)) {
        const label = String(el?.titreStruct?.intitule ?? "").trim();
        if (!label)
            continue;
        items.push({ numero: toInt(el?.valeur_pts_odj), label });
    }
    return items.sort((a, b) => a.numero - b.numero);
}
|
|
146
|
+
/**
 * Removes one trailing `:`, `,`, `.` or `;` together with the whitespace around it,
 * then trims the result.
 */
function stripTrailingPunct(s) {
    const trailingPunct = /\s*([:,.;])\s*$/u;
    const withoutPunct = s.replace(trailingPunct, "");
    return withoutPunct.trim();
}
|
|
59
147
|
function dedupeSpeaker(raw) {
|
|
60
148
|
let s = norm(raw);
|
|
61
149
|
s = stripTrailingPunct(s);
|
|
62
|
-
const dupPatterns = [
|
|
63
|
-
/^(.+?)\s*[.]\s*\1$/u,
|
|
64
|
-
/^(.+?)\s*,\s*\1,?$/u,
|
|
65
|
-
/^(.+?)\s+\1$/u,
|
|
66
|
-
];
|
|
150
|
+
const dupPatterns = [/^(.+?)\s*[.]\s*\1$/u, /^(.+?)\s*,\s*\1,?$/u, /^(.+?)\s+\1$/u];
|
|
67
151
|
for (const re of dupPatterns) {
|
|
68
152
|
const m = s.match(re);
|
|
69
153
|
if (m) {
|
|
@@ -74,12 +158,10 @@ function dedupeSpeaker(raw) {
|
|
|
74
158
|
return s.replace(/\.\s*$/, "");
|
|
75
159
|
}
|
|
76
160
|
/**
 * Decodes numeric HTML character references: decimal (`&#233;`) first, then
 * hexadecimal (`&#xE9;`). Named entities such as `&amp;` are left untouched.
 *
 * @param s text that may contain numeric character references
 * @returns the text with every valid numeric reference replaced by its character
 */
function decodeHtmlEntities(s) {
    const decode = (entity, digits, radix) => {
        const codePoint = parseInt(digits, radix);
        // fromCodePoint handles characters above U+FFFF (fromCharCode would truncate them)
        // but throws beyond U+10FFFF, so such references are kept verbatim.
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    };
    return s
        .replace(/&#(\d+);/g, (entity, d) => decode(entity, d, 10))
        .replace(/&#x([0-9a-fA-F]+);/g, (entity, h) => decode(entity, h, 16));
}
|
|
81
164
|
function fixApostrophes(s) {
|
|
82
|
-
// Tighten spacing around French apostrophes and punctuation
|
|
83
165
|
let out = s;
|
|
84
166
|
out = out.replace(/\s*’\s*/g, "’");
|
|
85
167
|
out = out.replace(/\b([dljctmsn])\s*’/gi, (_, m) => m + "’");
|
|
@@ -87,43 +169,33 @@ function fixApostrophes(s) {
|
|
|
87
169
|
out = out.replace(/\s+([,;:.!?])/g, "$1");
|
|
88
170
|
return out;
|
|
89
171
|
}
|
|
90
|
-
function normalizeTitle(text) {
|
|
91
|
-
return text.replace(/^PR[ÉE]SIDENCE DE\b/i, "Présidence de ");
|
|
92
|
-
}
|
|
172
|
+
/**
 * Rewrites a leading "PRÉSIDENCE DE" / "PRESIDENCE DE" (any case) as "Présidence de ".
 * Text that does not start with that heading is returned unchanged.
 */
function normalizeTitle(text) {
    // Also consume the whitespace after the heading: the replacement brings its own space,
    // and leaving the original one in place would produce a double space.
    return text.replace(/^PR[ÉE]SIDENCE DE\b\s*/i, "Présidence de ");
}
|
|
93
173
|
/**
 * Returns "président" when the label or qualité designates the chair
 * ("M. le président", "Mme la présidente", "président de séance", …), otherwise "".
 * A missing value is treated as an empty string.
 */
function roleForSpeaker(labelOrQualite) {
    const lowered = (labelOrQualite || "").toLowerCase();
    const chairPatterns = [
        /^(m\.|mme)?\s*(le|la)\s+pr[ée]sident(e)?\b/,
        /\bpr[ée]sident[e]?\s+de\s+séance\b/,
    ];
    return chairPatterns.some(rx => rx.test(lowered)) ? "président" : "";
}
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
179
|
+
function readIntervenantMeta($block) {
|
|
180
|
+
const int = $block.find('cri\\:intervenant').first();
|
|
181
|
+
if (int.length)
|
|
182
|
+
return { mat: int.attr("mat") || undefined, nom: int.attr("nom") || undefined, qua: int.attr("qua") || undefined };
|
|
183
|
+
const html = $block.html() || "";
|
|
104
184
|
const m = html.match(/<!--\s*cri:intervenant\b([^>]+)-->/i);
|
|
105
185
|
if (!m)
|
|
106
186
|
return {};
|
|
107
|
-
const attrs = m[1];
|
|
108
187
|
const out = {};
|
|
109
188
|
const re = /(\w+)="([^"]*)"/g;
|
|
110
189
|
let a;
|
|
111
|
-
while ((a = re.exec(
|
|
190
|
+
while ((a = re.exec(m[1])))
|
|
112
191
|
out[a[1]] = decodeHtmlEntities(a[2]);
|
|
113
|
-
}
|
|
114
192
|
return { mat: out["mat"], nom: out["nom"], qua: out["qua"] };
|
|
115
193
|
}
|
|
116
|
-
/**
|
|
117
|
-
* Extract leading .orateur_qualite chunks from the FIRST <p> only,
|
|
118
|
-
* concatenate them, clean punctuation/apostrophes, and REMOVE those nodes
|
|
119
|
-
* (and .orateur_nom) from the first paragraph so the speech starts cleanly.
|
|
120
|
-
*/
|
|
121
194
|
function extractAndRemoveLeadingQualite($, $block) {
|
|
122
195
|
const firstP = $block.find("p").first();
|
|
123
196
|
if (firstP.length === 0)
|
|
124
197
|
return "";
|
|
125
198
|
const parts = [];
|
|
126
|
-
// Iterate over the first <p>'s children from the start
|
|
127
199
|
let stop = false;
|
|
128
200
|
firstP.contents().each((_, node) => {
|
|
129
201
|
if (stop)
|
|
@@ -131,7 +203,6 @@ function extractAndRemoveLeadingQualite($, $block) {
|
|
|
131
203
|
if (node.type === "tag") {
|
|
132
204
|
const $node = $(node);
|
|
133
205
|
if ($node.hasClass("orateur_nom")) {
|
|
134
|
-
// speaker label node — remove it
|
|
135
206
|
$node.remove();
|
|
136
207
|
return;
|
|
137
208
|
}
|
|
@@ -140,66 +211,95 @@ function extractAndRemoveLeadingQualite($, $block) {
|
|
|
140
211
|
$node.remove();
|
|
141
212
|
return;
|
|
142
213
|
}
|
|
143
|
-
// Non-qualite tag: if it has meaningful text, we reached the speech
|
|
144
214
|
const t = norm($node.text() || "");
|
|
145
|
-
if (t)
|
|
215
|
+
if (t)
|
|
146
216
|
stop = true;
|
|
147
|
-
|
|
148
|
-
else {
|
|
149
|
-
// empty-ish node; remove to avoid stray punctuation
|
|
217
|
+
else
|
|
150
218
|
$node.remove();
|
|
151
|
-
}
|
|
152
219
|
}
|
|
153
220
|
else if (node.type === "text") {
|
|
154
221
|
const t = norm(node.data || "");
|
|
155
|
-
if (!t) {
|
|
156
|
-
// whitespace only — drop it
|
|
157
|
-
;
|
|
158
|
-
node.data = "";
|
|
159
|
-
return;
|
|
160
|
-
}
|
|
161
|
-
// boundary punctuation like ":" just after label — drop it
|
|
162
|
-
if (/^[:.,;–—-]+$/.test(t)) {
|
|
163
|
-
;
|
|
222
|
+
if (!t || /^[:.,;–—-]+$/.test(t)) {
|
|
164
223
|
node.data = "";
|
|
165
224
|
return;
|
|
166
225
|
}
|
|
167
|
-
// any other text means speech starts here
|
|
168
226
|
stop = true;
|
|
169
227
|
}
|
|
170
|
-
else {
|
|
171
|
-
// comment or others — ignore
|
|
172
|
-
}
|
|
173
228
|
});
|
|
174
|
-
|
|
175
|
-
return qual;
|
|
229
|
+
return fixApostrophes(norm(parts.join(" ")));
|
|
176
230
|
}
|
|
177
231
|
function sanitizeInterventionHtml($, $block) {
|
|
178
|
-
// Clone to avoid mutating outer tree order
|
|
179
232
|
const $clone = $block.clone();
|
|
180
|
-
// Remove navigation / anchors / images
|
|
181
233
|
$clone.find('a[name]').remove();
|
|
182
234
|
$clone.find('div[align="right"]').remove();
|
|
183
235
|
$clone.find('a.link').remove();
|
|
184
236
|
$clone.find('img').remove();
|
|
185
|
-
// Remove technical anchors inside interventions
|
|
186
237
|
$clone.find('a#ameli_amendement_cri_phrase, a#ameli_amendement_cra_contenu, a#ameli_amendement_cra_objet').remove();
|
|
187
|
-
|
|
188
|
-
$clone.find(".orateur_nom").remove();
|
|
189
|
-
$clone.find(".orateur_qualite").remove();
|
|
190
|
-
// Strip HTML comments
|
|
238
|
+
$clone.find(".orateur_nom, .orateur_qualite").remove();
|
|
191
239
|
let html = $clone.html() || "";
|
|
192
240
|
html = html.replace(/<!--[\s\S]*?-->/g, "");
|
|
193
241
|
return html.trim();
|
|
194
242
|
}
|
|
195
|
-
function
|
|
196
|
-
const
|
|
197
|
-
const
|
|
198
|
-
const
|
|
199
|
-
|
|
243
|
+
/**
 * Builds the sommaire restricted to the elements located inside the given intervals.
 *
 * - `presidentSeance`: text of the first `p.tm2` inside the intervals (empty when none)
 * - `para`: non-empty `p.tm5` texts (a single object when there is one, an array otherwise; absent when none)
 * - `sommaire1`: one entry per non-empty `p.tm3`, carrying its leading number and its title
 *
 * @param $ cheerio root of the document
 * @param idx map from element to document-order position
 * @param intervals intervals of the wanted slot
 * @returns the sommaire object
 */
function extractSommaireForIntervals($, idx, intervals) {
    const isInSlot = (el) => elementInAnyInterval(el, idx, intervals);
    const $body = $("body");
    const sommaire = { presidentSeance: { _: "" }, sommaire1: [] };
    // (1) Chair line: first p.tm2 inside the intervals.
    const $chair = $body.find("p.tm2").filter((_, el) => isInSlot(el)).first();
    if ($chair.length)
        sommaire.presidentSeance = { _: norm($chair.text()) };
    // (2) p.tm5 paragraphs inside the intervals.
    const paras = $body.find("p.tm5").toArray()
        .filter(el => isInSlot(el))
        .map(el => norm($(el).text()))
        .filter(text => text)
        .map(text => ({ _: text }));
    if (paras.length)
        sommaire.para = paras.length === 1 ? paras[0] : paras;
    // (3) Level-1 items (p.tm3) inside the intervals.
    const numberPrefix = /^(\d+)\s*[.\-–—]\s*/;
    const entries = [];
    for (const el of $body.find("p.tm3").toArray()) {
        if (!isInSlot(el))
            continue;
        const $item = $(el);
        const fullText = norm($item.text() || "");
        if (!fullText)
            continue;
        const valeur = fullText.match(numberPrefix)?.[1];
        // Prefer the anchor's text as the title; otherwise use the line without its number prefix.
        const $anchor = $item.find("a").first();
        const intitule = norm($anchor.length ? $anchor.text() : fullText.replace(numberPrefix, ""));
        // id_syceron is the anchor's href without its leading "#".
        const href = ($anchor.attr("href") || "").trim();
        const idSyceron = href.startsWith("#") ? href.slice(1) : href;
        entries.push({ valeur_pts_odj: valeur, titreStruct: { id_syceron: idSyceron || "", intitule } });
    }
    if (entries.length)
        sommaire.sommaire1 = entries;
    return sommaire;
}
|
|
287
|
+
function extractMetadonnees($, filePath) {
|
|
288
|
+
let dateText = norm($("h1, h2, .page-title").first().text() || "");
|
|
289
|
+
if (!dateText)
|
|
290
|
+
dateText = norm($("p").first().text() || "");
|
|
291
|
+
const dateMatch = dateText.match(/\b(\d{1,2}\s+\w+\s+\d{4})\b/i);
|
|
292
|
+
const allText = norm($("body").text() || "");
|
|
293
|
+
const sessionMatch = allText.match(/\bsession\s+(\d{4}-\d{4})\b/i);
|
|
294
|
+
let dateSeance = dateMatch?.[1] || "";
|
|
295
|
+
if (!dateSeance) {
|
|
296
|
+
const m = filePath.match(/d(\d{4})(\d{2})(\d{2})\.xml$/i);
|
|
297
|
+
if (m)
|
|
298
|
+
dateSeance = `${m[1]}-${m[2]}-${m[3]}`;
|
|
299
|
+
}
|
|
200
300
|
return {
|
|
201
|
-
dateSeance
|
|
202
|
-
dateSeanceJour:
|
|
301
|
+
dateSeance,
|
|
302
|
+
dateSeanceJour: dateSeance,
|
|
203
303
|
numSeanceJour: "",
|
|
204
304
|
numSeance: "",
|
|
205
305
|
typeAssemblee: "SN",
|
|
@@ -211,103 +311,15 @@ function extractMetadonnees($) {
|
|
|
211
311
|
diffusion: "",
|
|
212
312
|
version: "1.0",
|
|
213
313
|
environnement: "",
|
|
214
|
-
heureGeneration: new Date()
|
|
215
|
-
sommaire: extractSommaire($)
|
|
314
|
+
heureGeneration: new Date()
|
|
216
315
|
};
|
|
217
316
|
}
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
const addPoint = (p) => points.push({ ...p, ordre_absolu_seance: String(++ordre) });
|
|
227
|
-
// (1) Global section titles (common high-level headings)
|
|
228
|
-
let lastTitle = "";
|
|
229
|
-
$("#cri p[class^='titre_S']").each((_, el) => {
|
|
230
|
-
const t = normalizeTitle(norm($(el).text() || ""));
|
|
231
|
-
if (t && t !== lastTitle) {
|
|
232
|
-
addPoint({ code_grammaire: "TITRE_TEXTE_DISCUSSION", texte: { _: t }, code_style: "Titre" });
|
|
233
|
-
lastTitle = t;
|
|
234
|
-
}
|
|
235
|
-
});
|
|
236
|
-
// (2) Interventions
|
|
237
|
-
$("#cri div.intervenant").each((_, block) => {
|
|
238
|
-
const $block = $(block);
|
|
239
|
-
// (2.a) Extract internal structural titles inside this block (and remove them)
|
|
240
|
-
const structuralSel = [
|
|
241
|
-
"p[class^='titre_S']",
|
|
242
|
-
"p.mention_titre",
|
|
243
|
-
"p.intitule_titre",
|
|
244
|
-
"p.mention_chapitre",
|
|
245
|
-
"p.intitule_chapitre",
|
|
246
|
-
"p.mention_article",
|
|
247
|
-
"p.intitule_article",
|
|
248
|
-
"p.mention_section",
|
|
249
|
-
"p.intitule_section",
|
|
250
|
-
].join(",");
|
|
251
|
-
$block.find(structuralSel).each((__, el) => {
|
|
252
|
-
const title = normalizeTitle(norm($(el).text() || ""));
|
|
253
|
-
if (title && title !== lastTitle) {
|
|
254
|
-
addPoint({ code_grammaire: "TITRE_TEXTE_DISCUSSION", texte: { _: title }, code_style: "Titre" });
|
|
255
|
-
lastTitle = title;
|
|
256
|
-
}
|
|
257
|
-
$(el).remove();
|
|
258
|
-
});
|
|
259
|
-
// (2.b) Speaker label & quality
|
|
260
|
-
const firstP = $block.find("p").first();
|
|
261
|
-
const speakerLabelRaw = firstP.find(".orateur_nom").text() ||
|
|
262
|
-
firstP.find("a.lien_senfic").text() ||
|
|
263
|
-
"";
|
|
264
|
-
const speakerLabel = dedupeSpeaker(speakerLabelRaw);
|
|
265
|
-
// Prefer <!--cri:intervenant ...--> for id/name/qualite when available
|
|
266
|
-
const rawHtml = $block.html() || "";
|
|
267
|
-
const { mat, nom: nomFromComment, qua: quaFromCommentRaw } = parseCriIntervenantComment(rawHtml);
|
|
268
|
-
// Extract and remove leading .orateur_qualite chunks from first <p>
|
|
269
|
-
const qualFromSpans = extractAndRemoveLeadingQualite($, $block);
|
|
270
|
-
const qualite = norm(decodeHtmlEntities(quaFromCommentRaw || "")) ||
|
|
271
|
-
qualFromSpans;
|
|
272
|
-
const canonicalName = dedupeSpeaker(nomFromComment || speakerLabel);
|
|
273
|
-
const role = roleForSpeaker(speakerLabel) ||
|
|
274
|
-
roleForSpeaker(qualite) ||
|
|
275
|
-
roleForSpeaker(quaFromCommentRaw || "");
|
|
276
|
-
// (2.c) Build cleaned speech HTML
|
|
277
|
-
let speechHtml = sanitizeInterventionHtml($, $block);
|
|
278
|
-
// If nothing meaningful remains, skip
|
|
279
|
-
if (!norm(cheerio.load(speechHtml).text() || ""))
|
|
280
|
-
return;
|
|
281
|
-
addPoint({
|
|
282
|
-
code_grammaire: "PAROLE_GENERIQUE",
|
|
283
|
-
roledebat: role,
|
|
284
|
-
orateurs: {
|
|
285
|
-
orateur: {
|
|
286
|
-
nom: canonicalName,
|
|
287
|
-
id: mat || "",
|
|
288
|
-
qualite: qualite,
|
|
289
|
-
},
|
|
290
|
-
},
|
|
291
|
-
texte: { _: speechHtml },
|
|
292
|
-
});
|
|
293
|
-
});
|
|
294
|
-
const contenu = {
|
|
295
|
-
quantiemes: {
|
|
296
|
-
journee: metadonnees.dateSeance,
|
|
297
|
-
session: metadonnees.session,
|
|
298
|
-
},
|
|
299
|
-
point: points,
|
|
300
|
-
};
|
|
301
|
-
return {
|
|
302
|
-
uid: htmlFilePath.replace(/^.*?(\d{8}).*$/i, "$1"),
|
|
303
|
-
seanceRef: htmlFilePath.replace(/^.*?(\d{8}).*$/i, "$1"),
|
|
304
|
-
sessionRef: metadonnees.session,
|
|
305
|
-
metadonnees,
|
|
306
|
-
contenu,
|
|
307
|
-
};
|
|
308
|
-
}
|
|
309
|
-
catch (e) {
|
|
310
|
-
console.error("Could not parse compte-rendu with error", e);
|
|
311
|
-
return null;
|
|
312
|
-
}
|
|
317
|
+
function elementInAnyInterval(el, idx, intervals) {
|
|
318
|
+
const p = idx.get(el);
|
|
319
|
+
if (p == null)
|
|
320
|
+
return false;
|
|
321
|
+
for (const iv of intervals)
|
|
322
|
+
if (p >= iv.start && p < iv.end)
|
|
323
|
+
return true;
|
|
324
|
+
return false;
|
|
313
325
|
}
|
package/lib/model/util.d.ts
CHANGED
|
@@ -6,3 +6,4 @@ export declare function removeSubstring(expr: Expression<string | null | undefin
|
|
|
6
6
|
export declare function replace(expr: Expression<string | null | undefined>, pattern: Expression<string>, replacement: Expression<string>): import("kysely").RawBuilder<string>;
|
|
7
7
|
export declare function rtrim(expr: Expression<string | null | undefined>): import("kysely").RawBuilder<string>;
|
|
8
8
|
export declare function toDateString(expr: Expression<Date | null | undefined>, format?: Expression<string>): import("kysely").RawBuilder<string>;
|
|
9
|
+
export declare function norm(s?: string | null): string;
|
package/lib/model/util.js
CHANGED
|
@@ -8,7 +8,8 @@ import { parseAgendaFromFile } from "../model/agenda";
|
|
|
8
8
|
import { getSessionsFromStart } from "../types/sessions";
|
|
9
9
|
import { ID_DATE_FORMAT } from "./datautil";
|
|
10
10
|
import { commonOptions } from "./shared/cli_helpers";
|
|
11
|
-
import { ensureAndClearDir } from "./shared/util";
|
|
11
|
+
import { ensureAndClearDir, fetchWithRetry } from "./shared/util";
|
|
12
|
+
import { groupNonSPByTypeOrganeHour, groupSeancePubliqueBySlot } from "../utils/reunion_grouping";
|
|
12
13
|
const optionsDefinitions = [
|
|
13
14
|
...commonOptions,
|
|
14
15
|
{
|
|
@@ -64,7 +65,7 @@ async function downloadAgenda(agendaName, agendaPath) {
|
|
|
64
65
|
if (!options["silent"]) {
|
|
65
66
|
console.log(`Downloading Agenda ${agendaUrl}…`);
|
|
66
67
|
}
|
|
67
|
-
const response = await
|
|
68
|
+
const response = await fetchWithRetry(agendaUrl);
|
|
68
69
|
if (!response.ok) {
|
|
69
70
|
if (response.status === 404) {
|
|
70
71
|
console.warn(`Agenda ${agendaUrl} not found`);
|
|
@@ -80,15 +81,33 @@ async function downloadAgenda(agendaName, agendaPath) {
|
|
|
80
81
|
}
|
|
81
82
|
fs.writeFileSync(agendaPath, Buffer.from(agendaContent));
|
|
82
83
|
}
|
|
84
|
+
/**
 * Writes each group to `<dir>/<uid>.json`, pretty-printed with two-space indentation.
 *
 * @param dir destination directory
 * @param groups groups to write; each one must expose a `uid`
 */
function writeGroupsAsFiles(dir, groups) {
    for (const group of groups) {
        fs.writeJSONSync(path.join(dir, `${group.uid}.json`), group, { spaces: 2 });
    }
}
|
|
83
90
|
/**
 * Parses one agenda file and writes the results into `transformedAgendaSessionDir`:
 * the flat event list as `<agendaFileName>.json`, then one file per group for the
 * groups returned by groupSeancePubliqueBySlot, then for the "IDC", "IDM", "IDO" and
 * "IDI" groups returned by groupNonSPByTypeOrganeHour.
 * Nothing is written when parsing yields no events.
 *
 * @param transformedAgendaSessionDir output directory
 * @param agendaFileName base name (without extension) of the flat output file
 * @param agendaPath path of the agenda file to parse
 */
async function parseAgenda(transformedAgendaSessionDir, agendaFileName, agendaPath) {
    const verbose = !options["silent"];
    if (verbose)
        console.log(`Parsing Agenda ${agendaPath}…`);
    const parsedAgendaEvents = await parseAgendaFromFile(agendaPath);
    if (!parsedAgendaEvents?.length)
        return;
    fs.writeJSONSync(path.join(transformedAgendaSessionDir, `${agendaFileName}.json`), parsedAgendaEvents, { spaces: 2 });
    const writeIfAny = (groups) => {
        if (groups.length > 0)
            writeGroupsAsFiles(transformedAgendaSessionDir, groups);
    };
    // 1) Groups produced by groupSeancePubliqueBySlot.
    writeIfAny(groupSeancePubliqueBySlot(parsedAgendaEvents));
    // 2) Groups produced by groupNonSPByTypeOrganeHour, one suffix at a time.
    const groupedBySuffix = groupNonSPByTypeOrganeHour(parsedAgendaEvents);
    for (const suffix of ["IDC", "IDM", "IDO", "IDI"])
        writeIfAny(groupedBySuffix[suffix] || []);
}
|
|
93
112
|
async function main() {
|
|
94
113
|
const dataDir = options["dataDir"];
|
|
@@ -1 +1,6 @@
|
|
|
1
|
-
|
|
1
|
+
/**
|
|
2
|
+
* Needs to be run after retrieve_agenda.ts !
|
|
3
|
+
* - downloads the ZIP of comptes-rendus des débats (CRI) from data.senat.fr
|
|
4
|
+
* - extracts XML files, distributes them by session/year
|
|
5
|
+
*/
|
|
6
|
+
export declare function retrieveCriXmlDump(dataDir: string, options?: Record<string, any>): Promise<void>;
|