@tricoteuses/senat 2.11.0 → 2.11.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/loaders.d.ts +5 -0
- package/lib/loaders.js +14 -9
- package/lib/model/commission.d.ts +5 -0
- package/lib/model/commission.js +263 -0
- package/lib/model/{compte_rendu.js → seance.js} +47 -28
- package/lib/model/util.d.ts +1 -0
- package/lib/model/util.js +19 -1
- package/lib/scripts/retrieve_cr_commission.d.ts +1 -0
- package/lib/scripts/retrieve_cr_commission.js +291 -0
- package/lib/scripts/{retrieve_comptes_rendus.js → retrieve_cr_seance.js} +1 -1
- package/lib/utils/cr_spliting.d.ts +22 -1
- package/lib/utils/cr_spliting.js +273 -12
- package/lib/utils/reunion_grouping.d.ts +3 -0
- package/lib/utils/reunion_grouping.js +1 -1
- package/package.json +3 -2
- package/lib/raw_types/db.d.ts +0 -11389
- package/lib/raw_types/db.js +0 -5
- /package/lib/model/{compte_rendu.d.ts → seance.d.ts} +0 -0
- /package/lib/scripts/{retrieve_comptes_rendus.d.ts → retrieve_cr_seance.d.ts} +0 -0
package/lib/loaders.d.ts
CHANGED
|
@@ -9,6 +9,7 @@ import { CompteRendu } from "./types/compte_rendu";
|
|
|
9
9
|
export { EnabledDatasets } from "./datasets";
|
|
10
10
|
export declare const AGENDA_FOLDER = "agenda";
|
|
11
11
|
export declare const COMPTES_RENDUS_FOLDER = "seances";
|
|
12
|
+
export declare const COMMISSION_FOLDER = "commissions";
|
|
12
13
|
export declare const DOSLEG_DOSSIERS_FOLDER = "dossiers";
|
|
13
14
|
export declare const SCRUTINS_FOLDER = "scrutins";
|
|
14
15
|
export declare const RAPPORT_FOLDER = "rap";
|
|
@@ -68,6 +69,10 @@ export declare function iterLoadSenatComptesRendusSeances(dataDir: string, sessi
|
|
|
68
69
|
compteRendu: CompteRendu;
|
|
69
70
|
session: number;
|
|
70
71
|
}>;
|
|
72
|
+
export declare function iterLoadSenatComptesRendusCommissions(dataDir: string, session: number): Generator<{
|
|
73
|
+
compteRendu: CompteRendu;
|
|
74
|
+
session: number;
|
|
75
|
+
}>;
|
|
71
76
|
export declare function iterLoadSenatDossiersLegislatifs(dataDir: string, session: number | undefined, options?: {}): Generator<IterItem<DossierLegislatifResult>>;
|
|
72
77
|
export declare function iterLoadSenatDossiersLegislatifsRapportUrls(dataDir: string, session: number | undefined): Generator<IterItem<RapportMetadata>>;
|
|
73
78
|
export declare function iterLoadSenatDossiersLegislatifsTexteUrls(dataDir: string, session: number | undefined): Generator<IterItem<TexteMetadata>>;
|
package/lib/loaders.js
CHANGED
|
@@ -6,6 +6,7 @@ import { UNDEFINED_SESSION } from "./types/sessions";
|
|
|
6
6
|
export { EnabledDatasets } from "./datasets";
|
|
7
7
|
export const AGENDA_FOLDER = "agenda";
|
|
8
8
|
export const COMPTES_RENDUS_FOLDER = "seances";
|
|
9
|
+
export const COMMISSION_FOLDER = "commissions";
|
|
9
10
|
export const DOSLEG_DOSSIERS_FOLDER = "dossiers";
|
|
10
11
|
export const SCRUTINS_FOLDER = "scrutins";
|
|
11
12
|
export const RAPPORT_FOLDER = "rap";
|
|
@@ -61,13 +62,13 @@ export function* iterLoadSenatDebats(dataDir, session, options = {}) {
|
|
|
61
62
|
yield debatItem;
|
|
62
63
|
}
|
|
63
64
|
}
|
|
64
|
-
|
|
65
|
-
const basePath = path.join(dataDir,
|
|
66
|
-
if (!fs.existsSync(basePath))
|
|
65
|
+
function* iterLoadSenatComptesRendusGeneric(dataDir, session, subFolder) {
|
|
66
|
+
const basePath = path.join(dataDir, subFolder, DATA_TRANSFORMED_FOLDER, String(session));
|
|
67
|
+
if (!fs.existsSync(basePath)) {
|
|
68
|
+
console.warn(`[SN] Missing basePath → ${basePath}`);
|
|
67
69
|
return;
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
.sort();
|
|
70
|
+
}
|
|
71
|
+
const files = (fs.readdirSync(basePath) || []).filter((f) => f.endsWith(".json")).sort();
|
|
71
72
|
for (const fileName of files) {
|
|
72
73
|
const filePath = path.join(basePath, fileName);
|
|
73
74
|
try {
|
|
@@ -84,6 +85,12 @@ export function* iterLoadSenatComptesRendusSeances(dataDir, session) {
|
|
|
84
85
|
}
|
|
85
86
|
}
|
|
86
87
|
}
|
|
88
|
+
/**
 * Yields the comptes rendus stored for `session` under the
 * COMPTES_RENDUS_FOLDER sub-folder of `dataDir`.
 *
 * All the work is delegated to iterLoadSenatComptesRendusGeneric; every item
 * it produces is forwarded unchanged through `yield*`.
 */
export function* iterLoadSenatComptesRendusSeances(dataDir, session) {
    yield* iterLoadSenatComptesRendusGeneric(dataDir, session, COMPTES_RENDUS_FOLDER);
}
|
|
91
|
+
/**
 * Yields the comptes rendus stored for `session` under the
 * COMMISSION_FOLDER sub-folder of `dataDir`.
 *
 * All the work is delegated to iterLoadSenatComptesRendusGeneric; every item
 * it produces is forwarded unchanged through `yield*`.
 */
export function* iterLoadSenatComptesRendusCommissions(dataDir, session) {
    yield* iterLoadSenatComptesRendusGeneric(dataDir, session, COMMISSION_FOLDER);
}
|
|
87
94
|
export function* iterLoadSenatDossiersLegislatifs(dataDir, session, options = {}) {
|
|
88
95
|
for (const dossierLegislatifItem of iterLoadSenatItems(dataDir, datasets.dosleg.database, session, DOSLEG_DOSSIERS_FOLDER, options)) {
|
|
89
96
|
yield dossierLegislatifItem;
|
|
@@ -193,9 +200,7 @@ export function* iterLoadSenatAgendasGrouped(dataDir, session) {
|
|
|
193
200
|
const baseDir = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, String(session ?? ""));
|
|
194
201
|
if (!fs.existsSync(baseDir))
|
|
195
202
|
return;
|
|
196
|
-
const files = (fs.readdirSync(baseDir) || [])
|
|
197
|
-
.filter((f) => f.startsWith("RUSN") && f.endsWith(".json"))
|
|
198
|
-
.sort();
|
|
203
|
+
const files = (fs.readdirSync(baseDir) || []).filter((f) => f.startsWith("RUSN") && f.endsWith(".json")).sort();
|
|
199
204
|
for (const fileName of files) {
|
|
200
205
|
const filePath = path.join(baseDir, fileName);
|
|
201
206
|
let raw;
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import * as cheerio from "cheerio";
|
|
2
|
+
import { CompteRendu } from "../types/compte_rendu";
|
|
3
|
+
import { GroupedReunion } from "../types/agenda";
|
|
4
|
+
export declare function getRemainingTextAfterSpeakerHeader($: cheerio.CheerioAPI, $p: cheerio.Cheerio<any>): string;
|
|
5
|
+
export declare function parseCommissionCRFromFile(htmlFilePath: string, best: GroupedReunion): CompteRendu | null;
|
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
import * as cheerio from "cheerio";
|
|
2
|
+
import path from "path";
|
|
3
|
+
import fs from "fs";
|
|
4
|
+
import { norm, toCRDate } from "./util";
|
|
5
|
+
// CSS selector for the elements collected as content of a day section:
// paragraphs with the sh_justify / sh_center / sh_marge classes or an align
// attribute, list items, and h3 headings.
const PARA_h3_SEL = "p.sh_justify, p.sh_center, p.sh_marge, p[align], li, h3";
|
|
6
|
+
/**
 * Locates the first <h2> whose text ends with a French weekday followed by a
 * date equal to `targetISO`.
 *
 * @param $ cheerio root of the parsed document
 * @param targetISO date to look for, compared with the output of frDateToISO
 * @returns a cheerio selection holding the matching <h2>, or an empty
 *          selection when no heading matches
 */
function findDayRoot($, targetISO) {
    const weekdayPattern = /(?:Lundi|Mardi|Mercredi|Jeudi|Vendredi|Samedi|Dimanche)\s+(.+)$/i;
    for (const heading of $("h2").toArray()) {
        const headingText = norm($(heading).text());
        const dateMatch = headingText.match(weekdayPattern);
        // A heading without a weekday yields `undefined`, exactly like an unparsable date.
        const headingISO = dateMatch ? frDateToISO(dateMatch[1]) : undefined;
        if (headingISO === targetISO) {
            // Only the first matching heading is kept.
            return $(heading);
        }
    }
    return $();
}
|
|
17
|
+
/**
 * Replaces every no-break space (U+00A0), narrow no-break space (U+202F) and
 * thin space (U+2009) with a regular space. Other characters are untouched.
 */
function normalizeSpaces(s) {
    const specialSpaces = /[\u00A0\u202F\u2009]/g;
    const withPlainSpaces = s.replace(specialSpaces, " ");
    return withPlainSpaces;
}
|
|
20
|
+
/**
 * Removes a leading introduction marker from `s`: optional whitespace, an
 * optional `.`, `:` or `;`, then one or more dashes (en dash, em dash or
 * hyphen), each possibly followed by whitespace. Strings that do not start
 * with such a marker are returned unchanged.
 */
function stripIntroPunct(s) {
    const leadingDashIntro = /^[\s]*[.:;]?\s*(?:[–—-]\s*)+/u;
    return s.replace(leadingDashIntro, "");
}
|
|
23
|
+
function collectLeadingHeaderStrongEls($, $clone) {
|
|
24
|
+
const els = [];
|
|
25
|
+
const nodes = $clone.contents().toArray();
|
|
26
|
+
for (const node of nodes) {
|
|
27
|
+
if (node.type === "text") {
|
|
28
|
+
if (norm(node.data || ""))
|
|
29
|
+
break;
|
|
30
|
+
continue;
|
|
31
|
+
}
|
|
32
|
+
if (node.type === "tag") {
|
|
33
|
+
const $n = $(node);
|
|
34
|
+
if ($n.is("strong, b")) {
|
|
35
|
+
els.push(node);
|
|
36
|
+
continue;
|
|
37
|
+
}
|
|
38
|
+
if ($n.is("a") && $n.children("strong, b").length) {
|
|
39
|
+
$n.children("strong, b").each((_, el) => {
|
|
40
|
+
els.push($(el));
|
|
41
|
+
});
|
|
42
|
+
continue;
|
|
43
|
+
}
|
|
44
|
+
break;
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
return els;
|
|
48
|
+
}
|
|
49
|
+
/**
 * Returns the text of a paragraph once its leading speaker header is removed.
 *
 * Works on a clone of `$p`, so the original paragraph is not modified:
 *  1. the leading bold header elements are removed;
 *  2. the remaining HTML is converted to plain text and special spaces are
 *     replaced with regular ones;
 *  3. the leading punctuation/dash introduction is stripped;
 *  4. the result is normalised with `norm`.
 *
 * @param $ cheerio root of the document
 * @param $p cheerio selection of the paragraph
 * @returns the remaining text, possibly empty
 */
export function getRemainingTextAfterSpeakerHeader($, $p) {
    const $work = $p.clone();
    collectLeadingHeaderStrongEls($, $work).forEach((headerEl) => {
        $(headerEl).remove();
    });
    const innerHtml = $work.html() || "";
    const plainText = normalizeSpaces(cheerio.load(innerHtml).text());
    const withoutIntro = stripIntroPunct(plainText);
    return norm(withoutIntro || "");
}
|
|
63
|
+
/**
 * Turns an ordered list of paragraph/heading selections into "point" objects.
 *
 * Three kinds of points are produced:
 *  - titles (code_grammaire "TITRE_TEXTE_DISCUSSION") for every <h3>;
 *  - italic notes (code_style "Info Italiques") for purely italic paragraphs,
 *    and for paragraphs without bold text met while no speaker is active;
 *  - speeches (code_grammaire "PAROLE_GENERIQUE" with `orateurs`) built by
 *    buffering consecutive paragraphs of the same speaker.
 *
 * Every emitted point receives an increasing `ordre_absolu_seance` (as a
 * string, starting at "1").
 *
 * @param $ cheerio root used to wrap nodes
 * @param paras iterable of cheerio selections, one element each, in document order
 * @returns the array of points
 */
function buildPointsFromParagraphs($, paras) {
    const points = [];
    let ordreAbsoluSeance = 0;
    // Speaker name clean-up: NFKC, single spaces, no trailing ":" or ".".
    const normSpeaker = (s) => s
        .normalize("NFKC")
        .replace(/\s+/g, " ")
        .replace(/[:\.]\s*$/, "")
        .trim();
    // Speaker quality clean-up: NFKC, single spaces, no leading comma,
    // no trailing whitespace or punctuation/dashes.
    const normQual = (s) => s
        .normalize("NFKC")
        .replace(/\s+/g, " ")
        .replace(/^\s*,\s*|\s+$/g, "")
        .replace(/[\s\u00A0]*[.,;:–—-]+$/u, "")
        .trim();
    // Buffer for the speech currently being accumulated.
    let currentOrateur = null;
    let currentQualite = "";
    let currentTexte = "";
    // True when the quality contains the word "président" or "présidente".
    function isPresidentQual(qual) {
        return /\bprésident(e)?\b/i.test(qual);
    }
    // Flush the buffered speaker's text into points[] if any.
    // NOTE: when a speaker is set but no text was buffered, nothing is pushed
    // and the buffer (including currentOrateur) is left untouched.
    function flush() {
        if (!currentOrateur || !currentTexte.trim())
            return;
        ordreAbsoluSeance++;
        points.push({
            code_grammaire: "PAROLE_GENERIQUE",
            roledebat: isPresidentQual(currentQualite) ? "président" : "",
            ordre_absolu_seance: String(ordreAbsoluSeance),
            orateurs: { orateur: { nom: currentOrateur, id: "", qualite: currentQualite || "" } },
            texte: { _: currentTexte.trim() },
        });
        currentOrateur = null;
        currentQualite = "";
        currentTexte = "";
    }
    // Pushes a non-speech point, stamping it with the next order number.
    function addPoint(payload) {
        ordreAbsoluSeance++;
        points.push({ ...payload, ordre_absolu_seance: String(ordreAbsoluSeance) });
    }
    for (const $p of paras) {
        // Anything located inside a table is ignored.
        if ($p.closest("table").length)
            continue;
        const tagName = ($p.prop("tagName") || "").toString().toLowerCase();
        const rawText = ($p.text() || "").replace(/\u00a0/g, " ").trim();
        const text = norm(rawText);
        // Skip empty elements and those of 3 characters or fewer.
        if (!text || text.length <= 3)
            continue;
        const html = ($p.html() || "").trim();
        const italicSpans = $p.find("i, em, span[style*='italic']");
        const firstItalicOuter = italicSpans.length ? $(italicSpans[0]).prop("outerHTML") || "" : "";
        const htmlBeforeFirstItalic = firstItalicOuter ? html.split(firstItalicOuter)[0].trim() : "";
        // "Pure italic": every span/i/em descendant is italic and nothing precedes the first one.
        const isPureItalic = italicSpans.length > 0 && italicSpans.length === $p.find("span,i,em").length && htmlBeforeFirstItalic === "";
        if (tagName === "h3") {
            // A heading closes the current speech and becomes a title point.
            flush();
            addPoint({
                code_style: "Titre",
                code_grammaire: "TITRE_TEXTE_DISCUSSION",
                texte: { _: text },
            });
            continue;
        }
        // All bold text of the paragraph, joined: expected shape "Name, quality".
        const boldSpans = $p.find("strong, b");
        const joinedBold = norm(boldSpans
            .map((_, el) => $(el).text() || "")
            .get()
            .join(""));
        // Only the first two comma-separated parts are kept (name, quality).
        const [namePartRaw, qualPartRaw] = joinedBold.split(/\s*,\s+/, 2);
        const namePart = namePartRaw ? normSpeaker(namePartRaw) : "";
        const qualPart = qualPartRaw ? normQual(qualPartRaw) : "";
        // A speaker header starts with "M." or "Mme" and opens the paragraph text.
        const looksLikeName = namePart.length > 3 && /^(M\.|Mme)[\s\u00A0\u202F]+/i.test(namePart);
        const startsWithName = namePart && text.startsWith(namePart);
        const isNewSpeaker = looksLikeName && startsWithName && namePart !== currentOrateur;
        if (isNewSpeaker) {
            // Close the previous speech and start buffering the new one.
            flush();
            currentOrateur = namePart;
            currentQualite = qualPart;
            const remainingText = getRemainingTextAfterSpeakerHeader($, $p);
            currentTexte = remainingText;
            continue;
        }
        if (isPureItalic || (!joinedBold && !currentOrateur && text)) {
            // Italic note, or plain paragraph met while no speaker is active.
            flush();
            addPoint({
                code_style: "Info Italiques",
                code_grammaire: "PAROLE_GENERIQUE",
                texte: { _: "<i>" + text + "</i>" },
            });
            continue;
        }
        // concat text because same orateur
        if (currentOrateur) {
            const removeOrateurFromText = getRemainingTextAfterSpeakerHeader($, $p);
            currentTexte += (currentTexte ? "<br/><br/>" : "") + removeOrateurFromText;
            continue;
        }
        // Reaching this point means: bold text present, not a recognised
        // speaker header, and no active speaker — the paragraph is dropped.
    }
    // Emit the last buffered speech, if any.
    flush();
    return points;
}
|
|
163
|
+
function frDateToISO(s) {
|
|
164
|
+
if (!s)
|
|
165
|
+
return;
|
|
166
|
+
const months = {
|
|
167
|
+
janvier: 1,
|
|
168
|
+
fevrier: 2,
|
|
169
|
+
février: 2,
|
|
170
|
+
mars: 3,
|
|
171
|
+
avril: 4,
|
|
172
|
+
mai: 5,
|
|
173
|
+
juin: 6,
|
|
174
|
+
juillet: 7,
|
|
175
|
+
aout: 8,
|
|
176
|
+
août: 8,
|
|
177
|
+
septembre: 9,
|
|
178
|
+
octobre: 10,
|
|
179
|
+
novembre: 11,
|
|
180
|
+
decembre: 12,
|
|
181
|
+
décembre: 12,
|
|
182
|
+
};
|
|
183
|
+
const m = norm(s).match(/^(\d{1,2})\s+([A-Za-zéûôîà]+)\s+(\d{4})$/i);
|
|
184
|
+
if (!m)
|
|
185
|
+
return;
|
|
186
|
+
const d = Number(m[1]);
|
|
187
|
+
const mon = months[m[2].toLowerCase()];
|
|
188
|
+
const y = Number(m[3]);
|
|
189
|
+
if (!mon)
|
|
190
|
+
return;
|
|
191
|
+
return `${y}-${String(mon).padStart(2, "0")}-${String(d).padStart(2, "0")}`;
|
|
192
|
+
}
|
|
193
|
+
/**
 * Parses the HTML file of a commission compte rendu and extracts the part
 * that belongs to the day of the given reunion.
 *
 * Steps:
 *  1. load the file and locate the <h2> heading of `best.date`;
 *  2. collect the paragraph-like elements (PARA_h3_SEL) that follow this
 *     heading, up to the next sibling <h2>;
 *  3. convert them to points with buildPointsFromParagraphs;
 *  4. assemble the compte rendu object (uid, references, metadata, content).
 *
 * @param htmlFilePath path of the HTML file to read (UTF-8)
 * @param best reunion the compte rendu is attached to; `date`, `startTime`
 *             and `uid` are read
 * @returns the compte rendu, or null when the day heading is not found or
 *          when any error occurs (the error is logged, never thrown)
 */
export function parseCommissionCRFromFile(htmlFilePath, best) {
    try {
        const raw = fs.readFileSync(htmlFilePath, "utf8");
        const $ = cheerio.load(raw, { xmlMode: false });
        const dateISO = best.date;
        const dateSeance = toCRDate(dateISO, best.startTime);
        const $dayRoot = findDayRoot($, dateISO);
        if ($dayRoot.length === 0) {
            console.warn(`[COM-CR][parse] day root not found for ${dateISO} in ${path.basename(htmlFilePath)}`);
            return null;
        }
        // Take all paragraphs/h3 until next h2
        const dayParas = [];
        let $cursor = $dayRoot.next();
        while ($cursor.length && !$cursor.is("h2")) {
            if ($cursor.is(PARA_h3_SEL)) {
                // PARA_h3_SEL already covers <h3>: each sibling is pushed exactly
                // once, so a heading no longer yields two identical title points.
                dayParas.push($cursor);
            }
            else {
                // Container element: collect the matching descendants instead.
                $cursor.find(PARA_h3_SEL).each((_, p) => {
                    dayParas.push($(p));
                });
            }
            $cursor = $cursor.next();
        }
        const points = buildPointsFromParagraphs($, dayParas);
        // Session label "YYYY-YYYY": dates from October onwards belong to the
        // session starting that year, earlier dates to the one started the year before.
        const year = dateISO.slice(0, 4);
        const session = dateISO.slice(5, 7) >= "10"
            ? `${year}-${Number(year) + 1}`
            : `${Number(year) - 1}-${year}`;
        const contenu = {
            quantiemes: { journee: dateISO, session },
            point: points,
        };
        const metadonnees = {
            dateSeance: dateSeance,
            dateSeanceJour: dateISO,
            numSeanceJour: "",
            numSeance: "",
            typeAssemblee: "SN",
            legislature: "",
            session,
            nomFichierJo: path.basename(htmlFilePath),
            validite: "non-certifie",
            etat: "definitif",
            diffusion: "publique",
            version: "1",
            environnement: "prod",
            heureGeneration: new Date(),
        };
        return {
            // The compte rendu uid mirrors the reunion uid, with the RUSN prefix replaced.
            uid: best.uid.replace(/^RUSN/, "CRC"),
            seanceRef: best.uid,
            sessionRef: session,
            metadonnees,
            contenu,
        };
    }
    catch (e) {
        console.error(`[COM-CR][parse] error file=${path.basename(htmlFilePath)}:`, e);
        return null;
    }
}
|
|
@@ -2,9 +2,9 @@ import fs from "fs";
|
|
|
2
2
|
import * as cheerio from "cheerio";
|
|
3
3
|
import path from "path";
|
|
4
4
|
import { computeIntervalsBySlot } from "../utils/cr_spliting";
|
|
5
|
-
import { norm } from "./util";
|
|
6
|
-
const asArray = (x) => x == null ? [] : Array.isArray(x) ? x : [x];
|
|
7
|
-
const toInt = (s) => Number.isFinite(Number(s)) ? Number(s) : Number.POSITIVE_INFINITY;
|
|
5
|
+
import { norm, toCRDate } from "./util";
|
|
6
|
+
/** Wraps a value into an array: null/undefined give [], arrays are returned as-is. */
const asArray = (x) => {
    if (x == null) {
        return [];
    }
    return Array.isArray(x) ? x : [x];
};
/** Converts to a number; any value that is not a finite number becomes +Infinity. */
const toInt = (s) => {
    const parsed = Number(s);
    return Number.isFinite(parsed) ? parsed : Number.POSITIVE_INFINITY;
};
|
|
8
8
|
export async function parseCompteRenduSlotFromFile(xmlFilePath, wantedSlot, firstSlotOfDay) {
|
|
9
9
|
try {
|
|
10
10
|
const raw = fs.readFileSync(xmlFilePath, "utf8");
|
|
@@ -13,7 +13,7 @@ export async function parseCompteRenduSlotFromFile(xmlFilePath, wantedSlot, firs
|
|
|
13
13
|
const order = $("body *").toArray();
|
|
14
14
|
const idx = new Map(order.map((el, i) => [el, i]));
|
|
15
15
|
const intervalsAll = computeIntervalsBySlot($, idx, firstSlotOfDay);
|
|
16
|
-
const intervals = intervalsAll.filter(iv => iv.slot === wantedSlot);
|
|
16
|
+
const intervals = intervalsAll.filter((iv) => iv.slot === wantedSlot);
|
|
17
17
|
if (intervals.length === 0) {
|
|
18
18
|
console.warn(`[CRI] no intervals for ${path.basename(xmlFilePath)} [${wantedSlot}]`);
|
|
19
19
|
return null;
|
|
@@ -35,7 +35,8 @@ export async function parseCompteRenduSlotFromFile(xmlFilePath, wantedSlot, firs
|
|
|
35
35
|
if (!elementInAnyInterval(block, idx, intervals))
|
|
36
36
|
return;
|
|
37
37
|
const $block = $(block);
|
|
38
|
-
$block
|
|
38
|
+
$block
|
|
39
|
+
.find([
|
|
39
40
|
"p[class^='titre_S']",
|
|
40
41
|
"p.mention_titre",
|
|
41
42
|
"p.intitule_titre",
|
|
@@ -45,7 +46,8 @@ export async function parseCompteRenduSlotFromFile(xmlFilePath, wantedSlot, firs
|
|
|
45
46
|
"p.intitule_article",
|
|
46
47
|
"p.mention_section",
|
|
47
48
|
"p.intitule_section",
|
|
48
|
-
].join(","))
|
|
49
|
+
].join(","))
|
|
50
|
+
.remove();
|
|
49
51
|
const firstP = $block.find("p").first();
|
|
50
52
|
const speakerLabelRaw = firstP.find(".orateur_nom").text() || firstP.find("a.lien_senfic").text() || "";
|
|
51
53
|
const speakerLabel = dedupeSpeaker(speakerLabelRaw);
|
|
@@ -98,7 +100,7 @@ export function parseYYYYMMDD(yyyymmdd) {
|
|
|
98
100
|
}
|
|
99
101
|
export function deriveTitreObjetFromSommaire(sommaire, slot) {
|
|
100
102
|
const items = extractLevel1Items(sommaire);
|
|
101
|
-
const meaningful = items.filter(it => !isBoilerplate(it.label));
|
|
103
|
+
const meaningful = items.filter((it) => !isBoilerplate(it.label));
|
|
102
104
|
if (meaningful.length === 0) {
|
|
103
105
|
return {
|
|
104
106
|
titre: `Séance publique ${slotLabel(slot)}`,
|
|
@@ -106,15 +108,22 @@ export function deriveTitreObjetFromSommaire(sommaire, slot) {
|
|
|
106
108
|
};
|
|
107
109
|
}
|
|
108
110
|
const titre = meaningful[0].label;
|
|
109
|
-
const objet = meaningful
|
|
111
|
+
const objet = meaningful
|
|
112
|
+
.slice(0, 3)
|
|
113
|
+
.map((it) => it.label)
|
|
114
|
+
.join(" ; ");
|
|
110
115
|
return { titre, objet };
|
|
111
116
|
}
|
|
112
117
|
/**
 * Returns the French wording for a time slot ("MATIN", "APRES-MIDI", "SOIR").
 * Any other value gives an empty string.
 */
function slotLabel(slot) {
    const labelBySlot = new Map([
        ["MATIN", "du matin"],
        ["APRES-MIDI", "de l’après-midi"],
        ["SOIR", "du soir"],
    ]);
    return labelBySlot.get(slot) ?? "";
}
|
|
120
129
|
const BOILERPLATE_PATTERNS = [
|
|
@@ -132,18 +141,20 @@ const BOILERPLATE_PATTERNS = [
|
|
|
132
141
|
/ouverture de la séance/i,
|
|
133
142
|
/clo(?:t|̂)ure de la séance/i,
|
|
134
143
|
];
|
|
135
|
-
const isBoilerplate = (label) => !label?.trim() || BOILERPLATE_PATTERNS.some(rx => rx.test(label));
|
|
144
|
+
/**
 * Tells whether a label is boilerplate: missing or blank, or matching at least
 * one of the BOILERPLATE_PATTERNS regular expressions.
 */
const isBoilerplate = (label) => {
    if (!label?.trim()) {
        return true;
    }
    return BOILERPLATE_PATTERNS.some((pattern) => pattern.test(label));
};
|
|
136
145
|
/**
 * Lists the first-level entries (`sommaire1`) of a sommaire as
 * `{ numero, label }` pairs.
 *
 * Entries whose trimmed `titreStruct.intitule` is empty are dropped; the
 * others are sorted by ascending `numero` (read from `valeur_pts_odj`).
 */
function extractLevel1Items(sommaire) {
    const items = [];
    for (const entry of asArray(sommaire?.sommaire1)) {
        const label = String(entry?.titreStruct?.intitule ?? "").trim();
        if (!label) {
            continue;
        }
        items.push({ numero: toInt(entry?.valeur_pts_odj), label });
    }
    items.sort((left, right) => left.numero - right.numero);
    return items;
}
|
|
146
|
-
function stripTrailingPunct(s) {
|
|
155
|
+
/**
 * Removes one trailing punctuation mark (`:`, `,`, `.` or `;`) together with
 * the whitespace around it, then trims the result.
 */
function stripTrailingPunct(s) {
    const trailingPunct = /\s*([:,.;])\s*$/u;
    const withoutPunct = s.replace(trailingPunct, "");
    return withoutPunct.trim();
}
|
|
147
158
|
function dedupeSpeaker(raw) {
|
|
148
159
|
let s = norm(raw);
|
|
149
160
|
s = stripTrailingPunct(s);
|
|
@@ -158,7 +169,8 @@ function dedupeSpeaker(raw) {
|
|
|
158
169
|
return s.replace(/\.\s*$/, "");
|
|
159
170
|
}
|
|
160
171
|
/**
 * Decodes numeric HTML character references, decimal (`&#233;`) and
 * hexadecimal (`&#xE9;`), into the characters they denote.
 *
 * Code points above U+FFFF (emoji, supplementary planes) are decoded
 * correctly thanks to String.fromCodePoint. A reference whose value is not a
 * valid Unicode code point is left as written. Named entities (`&amp;`…) are
 * not handled here.
 *
 * @param s text possibly containing numeric character references
 * @returns the decoded text
 */
function decodeHtmlEntities(s) {
    // Highest valid Unicode code point; String.fromCodePoint throws above it.
    const MAX_CODE_POINT = 0x10ffff;
    const decode = (entity, codePoint) => Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= MAX_CODE_POINT
        ? String.fromCodePoint(codePoint)
        : entity;
    return s
        .replace(/&#(\d+);/g, (entity, d) => decode(entity, parseInt(d, 10)))
        .replace(/&#x([0-9a-fA-F]+);/g, (entity, h) => decode(entity, parseInt(h, 16)));
}
|
|
164
176
|
function fixApostrophes(s) {
|
|
@@ -169,7 +181,9 @@ function fixApostrophes(s) {
|
|
|
169
181
|
out = out.replace(/\s+([,;:.!?])/g, "$1");
|
|
170
182
|
return out;
|
|
171
183
|
}
|
|
172
|
-
function normalizeTitle(text) {
|
|
184
|
+
/**
 * Rewrites a leading "PRÉSIDENCE DE" / "PRESIDENCE DE" (any letter case) as
 * "Présidence de ", leaving the rest of the title untouched.
 *
 * The whitespace following "DE" is consumed by the match, so the result
 * contains a single space before the name and the function is idempotent.
 *
 * @param text title text
 * @returns the title with a normalised "Présidence de " prefix, or `text`
 *          unchanged when it does not start with that prefix
 */
function normalizeTitle(text) {
    return text.replace(/^PR[ÉE]SIDENCE DE\b\s*/i, "Présidence de ");
}
|
|
173
187
|
function roleForSpeaker(labelOrQualite) {
|
|
174
188
|
const s = (labelOrQualite || "").toLowerCase();
|
|
175
189
|
if (/^(m\.|mme)?\s*(le|la)\s+pr[ée]sident(e)?\b/.test(s) || /\bpr[ée]sident[e]?\s+de\s+séance\b/.test(s))
|
|
@@ -177,7 +191,7 @@ function roleForSpeaker(labelOrQualite) {
|
|
|
177
191
|
return "";
|
|
178
192
|
}
|
|
179
193
|
function readIntervenantMeta($block) {
|
|
180
|
-
const int = $block.find(
|
|
194
|
+
const int = $block.find("cri\\:intervenant").first();
|
|
181
195
|
if (int.length)
|
|
182
196
|
return { mat: int.attr("mat") || undefined, nom: int.attr("nom") || undefined, qua: int.attr("qua") || undefined };
|
|
183
197
|
const html = $block.html() || "";
|
|
@@ -220,6 +234,7 @@ function extractAndRemoveLeadingQualite($, $block) {
|
|
|
220
234
|
else if (node.type === "text") {
|
|
221
235
|
const t = norm(node.data || "");
|
|
222
236
|
if (!t || /^[:.,;–—-]+$/.test(t)) {
|
|
237
|
+
;
|
|
223
238
|
node.data = "";
|
|
224
239
|
return;
|
|
225
240
|
}
|
|
@@ -230,11 +245,11 @@ function extractAndRemoveLeadingQualite($, $block) {
|
|
|
230
245
|
}
|
|
231
246
|
function sanitizeInterventionHtml($, $block) {
|
|
232
247
|
const $clone = $block.clone();
|
|
233
|
-
$clone.find(
|
|
248
|
+
$clone.find("a[name]").remove();
|
|
234
249
|
$clone.find('div[align="right"]').remove();
|
|
235
|
-
$clone.find(
|
|
236
|
-
$clone.find(
|
|
237
|
-
$clone.find(
|
|
250
|
+
$clone.find("a.link").remove();
|
|
251
|
+
$clone.find("img").remove();
|
|
252
|
+
$clone.find("a#ameli_amendement_cri_phrase, a#ameli_amendement_cra_contenu, a#ameli_amendement_cra_objet").remove();
|
|
238
253
|
$clone.find(".orateur_nom, .orateur_qualite").remove();
|
|
239
254
|
let html = $clone.html() || "";
|
|
240
255
|
html = html.replace(/<!--[\s\S]*?-->/g, "");
|
|
@@ -244,11 +259,14 @@ function extractSommaireForIntervals($, idx, intervals) {
|
|
|
244
259
|
const inIv = (el) => elementInAnyInterval(el, idx, intervals);
|
|
245
260
|
const root = $("body");
|
|
246
261
|
const sommaire = { presidentSeance: { _: "" }, sommaire1: [] };
|
|
247
|
-
// (1) Présidence (tm2) — première ligne dans l’intervalle
|
|
248
|
-
const pres = root
|
|
262
|
+
// (1) Présidence (tm2) — première ligne dans l’intervalle
|
|
263
|
+
const pres = root
|
|
264
|
+
.find("p.tm2")
|
|
265
|
+
.filter((_, el) => inIv(el))
|
|
266
|
+
.first();
|
|
249
267
|
if (pres.length)
|
|
250
268
|
sommaire.presidentSeance = { _: norm(pres.text()) };
|
|
251
|
-
// (2) Paras tm5 présents dans l’intervalle
|
|
269
|
+
// (2) Paras tm5 présents dans l’intervalle
|
|
252
270
|
const paras = [];
|
|
253
271
|
root.find("p.tm5").each((_, el) => {
|
|
254
272
|
if (!inIv(el))
|
|
@@ -259,7 +277,7 @@ function extractSommaireForIntervals($, idx, intervals) {
|
|
|
259
277
|
});
|
|
260
278
|
if (paras.length)
|
|
261
279
|
sommaire.para = paras.length === 1 ? paras[0] : paras;
|
|
262
|
-
// (3) Items de 1er niveau (tm3) présents dans l’intervalle
|
|
280
|
+
// (3) Items de 1er niveau (tm3) présents dans l’intervalle
|
|
263
281
|
const items = [];
|
|
264
282
|
root.find("p.tm3").each((_, el) => {
|
|
265
283
|
if (!inIv(el))
|
|
@@ -297,6 +315,7 @@ function extractMetadonnees($, filePath) {
|
|
|
297
315
|
if (m)
|
|
298
316
|
dateSeance = `${m[1]}-${m[2]}-${m[3]}`;
|
|
299
317
|
}
|
|
318
|
+
dateSeance = toCRDate(dateSeance, null);
|
|
300
319
|
return {
|
|
301
320
|
dateSeance,
|
|
302
321
|
dateSeanceJour: dateSeance,
|
|
@@ -311,7 +330,7 @@ function extractMetadonnees($, filePath) {
|
|
|
311
330
|
diffusion: "",
|
|
312
331
|
version: "1.0",
|
|
313
332
|
environnement: "",
|
|
314
|
-
heureGeneration: new Date()
|
|
333
|
+
heureGeneration: new Date(),
|
|
315
334
|
};
|
|
316
335
|
}
|
|
317
336
|
function elementInAnyInterval(el, idx, intervals) {
|
package/lib/model/util.d.ts
CHANGED
|
@@ -7,3 +7,4 @@ export declare function replace(expr: Expression<string | null | undefined>, pat
|
|
|
7
7
|
export declare function rtrim(expr: Expression<string | null | undefined>): import("kysely").RawBuilder<string>;
|
|
8
8
|
export declare function toDateString(expr: Expression<Date | null | undefined>, format?: Expression<string>): import("kysely").RawBuilder<string>;
|
|
9
9
|
export declare function norm(s?: string | null): string;
|
|
10
|
+
export declare function toCRDate(dateISO: string, startTime?: string | null): string;
|
package/lib/model/util.js
CHANGED
|
@@ -22,5 +22,23 @@ export function toDateString(expr, format = sql.val(STANDARD_DATE_FORMAT)) {
|
|
|
22
22
|
return sql `to_char(${expr}, ${format})`;
|
|
23
23
|
}
|
|
24
24
|
/**
 * Normalises whitespace in a string: no-break spaces become regular spaces,
 * runs of whitespace collapse to a single space, and both ends are trimmed.
 * null, undefined and the empty string all give "".
 */
export function norm(s) {
    const input = s || "";
    const withoutNbsp = input.replace(/\u00A0/g, " ");
    const singleSpaced = withoutNbsp.replace(/\s+/g, " ");
    return singleSpaced.trim();
}
|
|
30
|
+
/**
 * Builds a compact 17-digit timestamp "YYYYMMDDHHMMSSmmm" from an ISO date
 * and an optional start time.
 *
 * The first "HH:MM" found in `startTime` is used; seconds (":SS") and
 * milliseconds (".mmm") are optional and default to zero. Accepted shapes
 * therefore include "HH:MM", "HH:MM:SS", "HH:MM:SS.mmm" and
 * "HH:MM:SS.mmm+02:00" (the offset is ignored). When `startTime` is
 * missing or contains no time, the time part is all zeros.
 *
 * @param dateISO date as "YYYY-MM-DD" (dashes are simply removed)
 * @param startTime optional time text, may be null or undefined
 * @returns the concatenated timestamp string
 */
export function toCRDate(dateISO, startTime) {
    const yyyymmdd = dateISO.replace(/-/g, ""); // "20250716"
    const timeMatch = startTime
        ? startTime.match(/(\d{2}):(\d{2})(?::(\d{2})(?:\.(\d{3}))?)?/)
        : null;
    // Unmatched optional groups are undefined, so the defaults apply to them too.
    const [, hh = "00", mm = "00", ss = "00", SSS = "000"] = timeMatch ?? [];
    return `${yyyymmdd}${hh}${mm}${ss}${SSS}`;
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|