@tricoteuses/senat 2.9.5 → 2.9.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/loaders.d.ts +4 -0
- package/lib/loaders.js +8 -0
- package/lib/model/compte_rendu.d.ts +1 -2
- package/lib/model/compte_rendu.js +303 -22
- package/lib/scripts/retrieve_comptes_rendus.js +27 -14
- package/lib/types/compte_rendu.d.ts +72 -7
- package/package.json +2 -1
package/lib/loaders.d.ts
CHANGED
|
@@ -5,6 +5,7 @@ import { QuestionResult } from "./model/questions";
|
|
|
5
5
|
import { CirconscriptionResult, OrganismeResult, SenateurResult } from "./model/sens";
|
|
6
6
|
import { AgendaEvent } from "./types/agenda";
|
|
7
7
|
import { FlatTexte } from "./types/texte";
|
|
8
|
+
import { CompteRendu } from "./types/compte_rendu";
|
|
8
9
|
export { EnabledDatasets } from "./datasets";
|
|
9
10
|
export declare const AGENDA_FOLDER = "agenda";
|
|
10
11
|
export declare const COMPTES_RENDUS_FOLDER = "seances";
|
|
@@ -69,6 +70,9 @@ export declare function iterLoadSenatDossiersLegislatifsDocuments(dataDir: strin
|
|
|
69
70
|
export declare function iterLoadSenatDossiersLegislatifsRapports(dataDir: string, session: number | undefined, options?: {}): Generator<IterItem<DossierLegislatifDocumentResult>>;
|
|
70
71
|
export declare function iterLoadSenatDossiersLegislatifsTextes(dataDir: string, session: number | undefined, options?: {}): Generator<IterItem<DossierLegislatifDocumentResult>>;
|
|
71
72
|
export declare function loadSenatTexteContent(dataDir: string, textePathFromDataset: string): IterItem<FlatTexte | null>;
|
|
73
|
+
/**
 * Load the transformed (JSON) compte rendu of one débat.
 *
 * @param dataDir root data directory
 * @param session session the débat belongs to
 * @param debatId identifier of the débat
 * @returns an object whose `item` is the parsed compte rendu, or `null` when none is available
 */
export declare function loadSenatCompteRenduContent(
    dataDir: string,
    session: number,
    debatId: string | number,
): { item: CompteRendu | null };
|
|
72
76
|
export declare function iterLoadSenatAgendas(dataDir: string, session: number | undefined, options?: {}): Generator<IterItem<AgendaEvent[]>>;
|
|
73
77
|
export declare function iterLoadSenatEvenements(dataDir: string, session: number | undefined, options?: {}): Generator<IterItem<AgendaEvent>>;
|
|
74
78
|
export declare function iterLoadSenatCirconscriptions(dataDir: string, options?: {}): Generator<IterItem<CirconscriptionResult>>;
|
package/lib/loaders.js
CHANGED
|
@@ -144,6 +144,14 @@ export function loadSenatTexteContent(dataDir, textePathFromDataset) {
|
|
|
144
144
|
const texteJson = fs.readFileSync(fullTextePath, { encoding: "utf8" });
|
|
145
145
|
return { item: JSON.parse(texteJson) };
|
|
146
146
|
}
|
|
147
|
+
/**
 * Load the transformed (JSON) compte rendu of one débat.
 *
 * The file is looked up at
 * `<dataDir>/<COMPTES_RENDUS_FOLDER>/<DATA_TRANSFORMED_FOLDER>/<session>/<debatId>.json`.
 *
 * @param dataDir root data directory
 * @param session session identifier, used as a sub-directory name
 * @param debatId débat identifier, used as the file base name
 * @returns `{ item }` holding the parsed JSON, or `{ item: null }` when the file does not exist
 */
export function loadSenatCompteRenduContent(dataDir, session, debatId) {
    const fullPath = path.join(dataDir, COMPTES_RENDUS_FOLDER, DATA_TRANSFORMED_FOLDER, String(session), `${debatId}.json`);
    return fs.existsSync(fullPath)
        ? { item: JSON.parse(fs.readFileSync(fullPath, { encoding: "utf8" })) }
        : { item: null };
}
|
|
147
155
|
export function* iterLoadSenatAgendas(dataDir, session, options = {}) {
|
|
148
156
|
for (const evenementsItem of iterLoadSenatItems(dataDir, AGENDA_FOLDER, session, DATA_TRANSFORMED_FOLDER, options)) {
|
|
149
157
|
yield evenementsItem;
|
|
@@ -1,3 +1,2 @@
|
|
|
1
1
|
import { CompteRendu } from "../types/compte_rendu";
|
|
2
|
-
|
|
3
|
-
export declare function parseCompteRenduFromFile(htmlFilePath: string, debat: DebatResult): Promise<CompteRendu | null>;
|
|
2
|
+
/**
 * Parse a downloaded compte rendu HTML file.
 *
 * @param htmlFilePath path of the HTML file to parse
 * @returns the parsed compte rendu, or `null` when parsing failed
 */
export declare function parseCompteRenduFromFile(
    htmlFilePath: string,
): Promise<CompteRendu | null>;
|
|
@@ -1,32 +1,313 @@
|
|
|
1
1
|
import { JSDOM } from "jsdom";
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
2
|
+
import * as cheerio from "cheerio";
|
|
3
|
+
/** Turn non-breaking spaces into plain spaces, collapse every whitespace run to one space and trim both ends. */
const norm = (s) => s
    .replace(/\u00A0/g, " ")
    .replace(/\s+/g, " ")
    .trim();
/** Wrap a string in the `{ _: … }` text shape used by the compte rendu model. */
const toTexte = (s) => {
    return { _: s };
};
|
|
5
|
+
/**
 * Build the sommaire (table of contents) of a compte rendu page.
 *
 * Works on `#wysiwyg` when present, otherwise on `#cri`, and reads:
 *  - the first `p.tm2` whose text mentions "présidence" → `presidentSeance`;
 *  - every non-empty `p.tm5` → `para` (a single object when there is only one);
 *  - every `p.tm3` → one first-level entry in `sommaire1`.
 *
 * @param $ cheerio root of the loaded page
 * @returns the sommaire; `para` is only set when at least one `p.tm5` has text
 */
function extractSommaire($) {
    const wysiwyg = $("#wysiwyg");
    const root = wysiwyg.length ? wysiwyg : $("#cri");
    // Presidency line (e.g. "Présidence de Mme …")
    const presidence = root
        .find("p.tm2")
        .filter((_, el) => /présidence/i.test($(el).text()))
        .first();
    // Extra information lines such as "Secrétaires :"
    const extraLines = root
        .find("p.tm5")
        .toArray()
        .map((el) => norm($(el).text()))
        .filter((text) => text)
        .map((text) => toTexte(text));
    // First-level entries only; sommaire2/sommaire3 are never filled in here
    const entries = root
        .find("p.tm3")
        .toArray()
        .map((el) => {
            const $p = $(el);
            const fullText = norm($p.text());
            // Leading order number: "1. …", "2 – …", etc.
            const orderMatch = fullText.match(/^(\d+)\s*[.\-–—]/);
            // Prefer the link text as title; fall back to the paragraph minus its number
            const link = $p.find("a").first();
            const intitule = norm(link.text() || fullText.replace(/^(\d+)\s*[.\-–—]\s*/, ""));
            // id_syceron is taken from the link target without its leading '#' (unverified upstream)
            const href = link.attr("href") || "";
            const anchor = href.startsWith("#") ? href.slice(1) : href;
            return {
                valeur_pts_odj: orderMatch ? orderMatch[1] : undefined,
                titreStruct: {
                    id_syceron: anchor || "",
                    intitule,
                },
            };
        });
    const sommaire = {
        presidentSeance: toTexte(presidence.length ? norm(presidence.text()) : ""),
        sommaire1: entries,
    };
    if (extraLines.length) {
        sommaire.para = extraLines.length === 1 ? extraLines[0] : extraLines;
    }
    return sommaire;
}
|
|
56
|
+
/**
 * Remove one trailing `:`, `,`, `.` or `;` (with the whitespace around it)
 * from the end of a string, then trim the result.
 *
 * @param s text to clean
 * @returns the text without its final punctuation mark
 */
function stripTrailingPunct(s) {
    const withoutPunct = s.replace(/\s*([:,.;])\s*$/u, "");
    return withoutPunct.trim();
}
|
|
59
|
+
/**
 * Clean a speaker label: normalize whitespace, drop one trailing punctuation
 * mark, and collapse a label that is the same text written twice
 * ("X. X", "X, X" or "X X") into a single "X".
 *
 * @param raw raw speaker label text
 * @returns the cleaned label, without a final period
 */
function dedupeSpeaker(raw) {
    const cleaned = stripTrailingPunct(norm(raw));
    // The first pattern that matches wins, in this order.
    const duplicate = [
        /^(.+?)\s*[.]\s*\1$/u,
        /^(.+?)\s*,\s*\1,?$/u,
        /^(.+?)\s+\1$/u,
    ]
        .map((pattern) => cleaned.match(pattern))
        .find((match) => match !== null);
    const label = duplicate ? duplicate[1] : cleaned;
    return label.replace(/\.\s*$/, "");
}
|
|
23
|
-
|
|
76
|
+
/**
 * Decode numeric HTML character references (`&#233;`, `&#xE9;`) in a string.
 *
 * Named entities (`&amp;`, `&nbsp;`, …) are left untouched.
 *
 * Decoding is done in a single pass, so text produced by one reference is
 * never decoded a second time, and with `String.fromCodePoint`, so code points
 * above U+FFFF (e.g. emoji) come out intact. References outside the Unicode
 * range are left as written.
 *
 * @param s text that may contain numeric character references
 * @returns the text with every valid numeric reference replaced by its character
 */
function decodeHtmlEntities(s) {
    return s.replace(/&#(?:(\d+)|[xX]([0-9a-fA-F]+));/g, (entity, dec, hex) => {
        const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex, 16);
        // String.fromCodePoint throws a RangeError above U+10FFFF.
        if (!(codePoint <= 0x10ffff)) {
            return entity;
        }
        return String.fromCodePoint(codePoint);
    });
}
|
|
81
|
+
/**
 * Tighten spacing around typographic apostrophes (’) and before punctuation.
 *
 * Steps, applied in order:
 *  1. drop whitespace on both sides of every ’;
 *  2. re-attach an elided letter (d, l, j, c, t, m, s, n) to the ’ that follows it;
 *  3. drop whitespace between ’ and a following letter;
 *  4. drop whitespace before `, ; : . ! ?`.
 *
 * @param s text to clean
 * @returns the text with tightened spacing
 */
function fixApostrophes(s) {
    return s
        .replace(/\s*’\s*/g, "’")
        .replace(/\b([dljctmsn])\s*’/gi, (_match, letter) => letter + "’")
        .replace(/’\s+([A-Za-zÀ-ÖØ-öø-ÿ])/g, "’$1")
        .replace(/\s+([,;:.!?])/g, "$1");
}
|
|
90
|
+
/**
 * Rewrite a leading "PRÉSIDENCE DE" / "PRESIDENCE DE" (any case) as
 * "Présidence de ", leaving the rest of the title untouched.
 *
 * The whitespace that follows "DE" is consumed by the match: the replacement
 * already ends with a space, so keeping the original one would give
 * "Présidence de  M. …" with a double space.
 *
 * @param text title text
 * @returns the title with a normalized "Présidence de " prefix, or the text unchanged
 */
function normalizeTitle(text) {
    return text.replace(/^PR[ÉE]SIDENCE DE\b\s*/i, "Présidence de ");
}
|
|
93
|
+
/**
 * Derive the debate role from a speaker label or quality string.
 *
 * @param labelOrQualite speaker label (e.g. "M. le président") or quality text
 * @returns `"président"` when the lower-cased text starts with
 *          "(M.|Mme) le/la président(e)" or contains "président(e) de séance";
 *          an empty string otherwise
 */
function roleForSpeaker(labelOrQualite) {
    const lowered = labelOrQualite.toLowerCase();
    const addressedAsPresident = /^(m\.|mme)?\s*(le|la)\s+pr[ée]sident(e)?\b/.test(lowered);
    const presidentDeSeance = /\bpr[ée]sident[e]?\s+de\s+séance\b/.test(lowered);
    return addressedAsPresident || presidentDeSeance ? "président" : "";
}
|
|
101
|
+
// ---------------- DOM helpers ----------------
|
|
102
|
+
/**
 * Read the attributes of the first `<!-- cri:intervenant … -->` comment found
 * in an HTML fragment.
 *
 * @param html HTML of an intervention block
 * @returns `{ mat, nom, qua }` with entity-decoded values (each `undefined`
 *          when the attribute is absent), or `{}` when there is no such comment
 */
function parseCriIntervenantComment(html) {
    const comment = /<!--\s*cri:intervenant\b([^>]+)-->/i.exec(html);
    if (comment === null) {
        return {};
    }
    const attributes = {};
    // Every key="value" pair; a later duplicate key overwrites an earlier one.
    for (const [, key, value] of comment[1].matchAll(/(\w+)="([^"]*)"/g)) {
        attributes[key] = decodeHtmlEntities(value);
    }
    return { mat: attributes["mat"], nom: attributes["nom"], qua: attributes["qua"] };
}
|
|
116
|
+
/**
 * Collect the speaker quality written at the start of an intervention.
 *
 * Walks the children of the FIRST `<p>` of the block from the start and, until
 * the speech itself begins:
 *  - removes `.orateur_nom` nodes;
 *  - collects the text of `.orateur_qualite` nodes, then removes them;
 *  - removes other tags that carry no text;
 *  - blanks text nodes that are whitespace or boundary punctuation only.
 * The walk ends at the first tag or text node carrying real content.
 * Comment nodes are skipped.
 *
 * Mutates `$block` so that the first paragraph starts directly with the speech.
 *
 * @param $ cheerio root used to wrap nodes
 * @param $block cheerio selection of the intervention block
 * @returns the collected quality text, normalized, or "" when there is no `<p>`
 */
function extractAndRemoveLeadingQualite($, $block) {
    const firstP = $block.find("p").first();
    if (!firstP.length) {
        return "";
    }
    const qualiteChunks = [];
    for (const node of firstP.contents().toArray()) {
        if (node.type === "text") {
            const text = norm(node.data || "");
            // Anything other than whitespace / boundary punctuation is speech.
            if (text && !/^[:.,;–—-]+$/.test(text)) {
                break;
            }
            node.data = "";
            continue;
        }
        if (node.type !== "tag") {
            // comment or other node kinds: leave in place
            continue;
        }
        const $node = $(node);
        if ($node.hasClass("orateur_nom")) {
            $node.remove();
            continue;
        }
        if ($node.hasClass("orateur_qualite")) {
            qualiteChunks.push($node.text() || "");
            $node.remove();
            continue;
        }
        // Any other tag: meaningful text means the speech has started.
        if (norm($node.text() || "")) {
            break;
        }
        // Empty-ish tag: remove it so it leaves no stray punctuation.
        $node.remove();
    }
    return fixApostrophes(norm(qualiteChunks.join(" ")));
}
|
|
177
|
+
/**
 * Produce the cleaned inner HTML of an intervention block.
 *
 * Works on a clone, so `$block` itself is not modified. Removes named anchors,
 * right-aligned divs, `a.link` links, images, the ameli amendment anchors and
 * any speaker name / quality spans, then strips HTML comments.
 *
 * @param $ cheerio root (not used by this function)
 * @param $block cheerio selection of the intervention block
 * @returns the trimmed inner HTML, or "" when nothing is left
 */
function sanitizeInterventionHtml($, $block) {
    const $copy = $block.clone();
    const unwantedSelectors = [
        // navigation / anchors / images
        'a[name]',
        'div[align="right"]',
        'a.link',
        'img',
        // technical anchors inside interventions
        'a#ameli_amendement_cri_phrase, a#ameli_amendement_cra_contenu, a#ameli_amendement_cra_objet',
        // remaining speaker label / quality spans
        ".orateur_nom",
        ".orateur_qualite",
    ];
    for (const selector of unwantedSelectors) {
        $copy.find(selector).remove();
    }
    const markup = $copy.html() || "";
    return markup.replace(/<!--[\s\S]*?-->/g, "").trim();
}
|
|
195
|
+
/**
 * Build the `metadonnees` object of a compte rendu from the loaded page.
 *
 * Only the date (from `h1.page-title`), the session (first "session YYYY-YYYY"
 * found in `#cri`) and the sommaire are read from the page. `typeAssemblee`
 * and `version` are constants, `heureGeneration` is the current time, and the
 * remaining fields are left empty.
 *
 * The month is matched with `\p{L}+` rather than `\w+`: without the `u` flag
 * `\w` is ASCII-only, so "février", "août" and "décembre" could never match.
 * An optional "er" after the day accepts "1er mars 2024".
 *
 * @param $ cheerio root of the loaded page
 * @returns the metadata object; `dateSeance` and `session` are "" when not found
 */
function extractMetadonnees($) {
    const headerText = norm($("h1.page-title").text() || "");
    const dateMatch = headerText.match(/\b(\d{1,2}(?:er)?\s+\p{L}+\s+\d{4})\b/iu);
    const dateSeance = dateMatch?.[1] || "";
    const bodyText = norm($("#cri").text() || "");
    const sessionMatch = bodyText.match(/\bsession\s+(\d{4}-\d{4})\b/i);
    return {
        dateSeance,
        dateSeanceJour: dateSeance,
        numSeanceJour: "",
        numSeance: "",
        typeAssemblee: "SN",
        legislature: "",
        session: sessionMatch?.[1] || "",
        nomFichierJo: "",
        validite: "",
        etat: "",
        diffusion: "",
        version: "1.0",
        environnement: "",
        heureGeneration: new Date(),
        sommaire: extractSommaire($)
    };
}
|
|
218
|
+
// ---------------- main transform ----------------
|
|
219
|
+
/**
 * Parse a downloaded compte rendu HTML file into the compte rendu model.
 *
 * The file is read through JSDOM, re-serialized and handed to cheerio. The
 * JSDOM window is closed as soon as the markup has been copied, so that
 * parsing many files in a row does not keep every window alive.
 *
 * Points are emitted in this order: first every `#cri p[class^='titre_S']`
 * title, then, for each `#cri div.intervenant`, its internal structural titles
 * followed by the intervention itself.
 * NOTE(review): titles are therefore not interleaved with interventions in
 * document order, and a `titre_S` paragraph located inside an intervention
 * block can be emitted by both passes — confirm whether this is intended.
 *
 * `uid` and `seanceRef` are the first run of 8 digits found in `htmlFilePath`
 * (the whole path when it has none).
 *
 * @param htmlFilePath path of the HTML file to parse
 * @returns the parsed compte rendu, or `null` when anything throws (the error is logged)
 */
export async function parseCompteRenduFromFile(htmlFilePath) {
    try {
        const { window } = await JSDOM.fromFile(htmlFilePath, { contentType: "text/html" });
        let $;
        try {
            $ = cheerio.load(window.document.documentElement.outerHTML);
        }
        finally {
            // cheerio works on its own copy of the markup; release the JSDOM window.
            window.close();
        }
        const metadonnees = extractMetadonnees($);
        const points = [];
        let ordre = 0;
        const addPoint = (p) => points.push({ ...p, ordre_absolu_seance: String(++ordre) });
        // Consecutive identical titles are emitted once.
        let lastTitle = "";
        const addTitle = (rawText) => {
            const title = normalizeTitle(norm(rawText || ""));
            if (title && title !== lastTitle) {
                addPoint({ code_grammaire: "TITRE_TEXTE_DISCUSSION", texte: { _: title }, code_style: "Titre" });
                lastTitle = title;
            }
        };
        // (1) Global section titles (common high-level headings)
        $("#cri p[class^='titre_S']").each((_, el) => {
            addTitle($(el).text());
        });
        // Structural titles that may sit inside an intervention block.
        const structuralSel = [
            "p[class^='titre_S']",
            "p.mention_titre",
            "p.intitule_titre",
            "p.mention_chapitre",
            "p.intitule_chapitre",
            "p.mention_article",
            "p.intitule_article",
            "p.mention_section",
            "p.intitule_section",
        ].join(",");
        // (2) Interventions
        $("#cri div.intervenant").each((_, block) => {
            const $block = $(block);
            // (2.a) Emit internal structural titles, then remove them from the block
            $block.find(structuralSel).each((__, el) => {
                addTitle($(el).text());
                $(el).remove();
            });
            // (2.b) Speaker label & quality
            const firstP = $block.find("p").first();
            const speakerLabelRaw = firstP.find(".orateur_nom").text() ||
                firstP.find("a.lien_senfic").text() ||
                "";
            const speakerLabel = dedupeSpeaker(speakerLabelRaw);
            // Prefer <!--cri:intervenant ...--> for id/name/qualite when available
            const rawHtml = $block.html() || "";
            const { mat, nom: nomFromComment, qua: quaFromCommentRaw } = parseCriIntervenantComment(rawHtml);
            // Must run after the label was read: it removes .orateur_nom / .orateur_qualite
            // from the first <p>.
            const qualFromSpans = extractAndRemoveLeadingQualite($, $block);
            const qualite = norm(decodeHtmlEntities(quaFromCommentRaw || "")) ||
                qualFromSpans;
            const canonicalName = dedupeSpeaker(nomFromComment || speakerLabel);
            const role = roleForSpeaker(speakerLabel) ||
                roleForSpeaker(qualite) ||
                roleForSpeaker(quaFromCommentRaw || "");
            // (2.c) Build cleaned speech HTML
            const speechHtml = sanitizeInterventionHtml($, $block);
            // If nothing meaningful remains, skip
            if (!norm(cheerio.load(speechHtml).text() || ""))
                return;
            addPoint({
                code_grammaire: "PAROLE_GENERIQUE",
                roledebat: role,
                orateurs: {
                    orateur: {
                        nom: canonicalName,
                        id: mat || "",
                        qualite: qualite,
                    },
                },
                texte: { _: speechHtml },
            });
        });
        const contenu = {
            quantiemes: {
                journee: metadonnees.dateSeance,
                session: metadonnees.session,
            },
            point: points,
        };
        // Same identifier for both references; computed once.
        const seanceId = htmlFilePath.replace(/^.*?(\d{8}).*$/i, "$1");
        return {
            uid: seanceId,
            seanceRef: seanceId,
            sessionRef: metadonnees.session,
            metadonnees,
            contenu,
        };
    }
    catch (e) {
        console.error("Could not parse compte-rendu with error", e);
        return null;
    }
}
|
|
@@ -2,11 +2,11 @@ import assert from "assert";
|
|
|
2
2
|
import commandLineArgs from "command-line-args";
|
|
3
3
|
import fs from "fs-extra";
|
|
4
4
|
import path from "path";
|
|
5
|
-
import { COMPTES_RENDUS_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER, iterLoadSenatDebats } from "../loaders";
|
|
6
|
-
import { parseCompteRenduFromFile } from "../model/compte_rendu";
|
|
5
|
+
import { COMPTES_RENDUS_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER, iterLoadSenatDebats, } from "../loaders";
|
|
7
6
|
import { getSessionsFromStart } from "../types/sessions";
|
|
8
7
|
import { commonOptions } from "./shared/cli_helpers";
|
|
9
8
|
import { ensureAndClearDir } from "./shared/util";
|
|
9
|
+
import { parseCompteRenduFromFile } from "../model/compte_rendu";
|
|
10
10
|
const optionsDefinitions = [
|
|
11
11
|
...commonOptions,
|
|
12
12
|
{
|
|
@@ -22,6 +22,22 @@ class CompteRenduError extends Error {
|
|
|
22
22
|
super(`An error occurred while retrieving Compte-Rendu ${compteRenduUrl}: ${message}`);
|
|
23
23
|
}
|
|
24
24
|
}
|
|
25
|
+
async function fetchWithRetry(url, retries = 5, backoffMs = 1000) {
|
|
26
|
+
for (let attempt = 0; attempt < retries; attempt++) {
|
|
27
|
+
try {
|
|
28
|
+
return await fetch(url);
|
|
29
|
+
}
|
|
30
|
+
catch (e) {
|
|
31
|
+
if (attempt === retries)
|
|
32
|
+
break;
|
|
33
|
+
console.warn(`Fetch attempt ${attempt + 1} for ${url} failed. Retrying in ${backoffMs}ms...`);
|
|
34
|
+
await new Promise((resolve) => setTimeout(resolve, backoffMs));
|
|
35
|
+
backoffMs *= 2;
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
console.log(`Failed to fetch ${url} after ${retries} attempts.`);
|
|
39
|
+
return null;
|
|
40
|
+
}
|
|
25
41
|
async function retrieveComptesRendus(dataDir, sessions) {
|
|
26
42
|
const comptesRendusRootDir = path.join(dataDir, COMPTES_RENDUS_FOLDER);
|
|
27
43
|
ensureAndClearDir(comptesRendusRootDir);
|
|
@@ -38,16 +54,15 @@ async function retrieveComptesRendus(dataDir, sessions) {
|
|
|
38
54
|
if (options["parseDebats"]) {
|
|
39
55
|
fs.ensureDirSync(transformedComptesRendusSessionDir);
|
|
40
56
|
}
|
|
41
|
-
for (const { item: debat
|
|
42
|
-
if (!debat.url)
|
|
57
|
+
for (const { item: debat } of iterLoadSenatDebats(dataDir, session)) {
|
|
58
|
+
if (!debat.url)
|
|
43
59
|
continue;
|
|
44
|
-
}
|
|
45
60
|
try {
|
|
46
61
|
const debatMonoUrl = `${path.parse(debat.url).dir}/s${debat.id}_mono.html`;
|
|
47
62
|
const compteRenduPath = path.join(originalComptesRendusSessionDir, `${debat.id}.html`);
|
|
48
63
|
await downloadCompteRendu(debatMonoUrl, compteRenduPath);
|
|
49
64
|
if (options["parseDebats"]) {
|
|
50
|
-
await
|
|
65
|
+
await parseAndWriteJSON(transformedComptesRendusSessionDir, compteRenduPath, debat);
|
|
51
66
|
}
|
|
52
67
|
}
|
|
53
68
|
catch (error) {
|
|
@@ -61,7 +76,7 @@ async function downloadCompteRendu(debatUrl, compteRenduPath) {
|
|
|
61
76
|
if (!options["silent"]) {
|
|
62
77
|
console.log(`Downloading Compte-Rendu ${compteRenduUrl}…`);
|
|
63
78
|
}
|
|
64
|
-
const response = await
|
|
79
|
+
const response = await fetchWithRetry(compteRenduUrl);
|
|
65
80
|
if (!response.ok) {
|
|
66
81
|
if (response.status === 404) {
|
|
67
82
|
console.warn(`Compte-Rendu ${compteRenduUrl} not found`);
|
|
@@ -72,21 +87,19 @@ async function downloadCompteRendu(debatUrl, compteRenduPath) {
|
|
|
72
87
|
return;
|
|
73
88
|
}
|
|
74
89
|
const compteRenduContent = await response.arrayBuffer();
|
|
75
|
-
if (!compteRenduContent)
|
|
90
|
+
if (!compteRenduContent)
|
|
76
91
|
return;
|
|
77
|
-
}
|
|
78
92
|
fs.writeFileSync(compteRenduPath, Buffer.from(compteRenduContent));
|
|
79
93
|
}
|
|
80
|
-
async function
|
|
94
|
+
/**
 * Parse one downloaded compte rendu and write the result as pretty-printed
 * JSON into the transformed session directory, under the same base name as
 * the HTML file. Nothing is written when parsing returns nothing.
 *
 * @param transformedComptesRendusSessionDir directory receiving the JSON file
 * @param compteRenduPath path of the downloaded HTML file
 * @param debat the débat being processed (not used by this function)
 */
async function parseAndWriteJSON(transformedComptesRendusSessionDir, compteRenduPath, debat) {
    const verbose = !options["silent"];
    if (verbose) {
        console.log(`Parsing compte-rendu ${compteRenduPath}…`);
    }
    const compteRendu = await parseCompteRenduFromFile(compteRenduPath);
    if (compteRendu) {
        const { name } = path.parse(compteRenduPath);
        const jsonPath = path.join(transformedComptesRendusSessionDir, `${name}.json`);
        fs.writeJSONSync(jsonPath, compteRendu, { spaces: 2 });
    }
}
|
|
91
104
|
async function main() {
|
|
92
105
|
const dataDir = options["dataDir"];
|
|
@@ -1,11 +1,76 @@
|
|
|
1
1
|
/** A parsed compte rendu. Every field is optional. */
export interface CompteRendu {
    /** Identifier of the compte rendu. */
    uid?: string;
    /** Reference of the séance. */
    seanceRef?: string;
    /** Reference of the session. */
    sessionRef?: string;
    /** Header information, including the sommaire. */
    metadonnees?: Metadonnees;
    /** Body of the compte rendu. */
    contenu?: Contenu;
}
|
|
4
|
-
export interface
|
|
5
|
-
|
|
6
|
-
|
|
8
|
+
/** Header information of a compte rendu. */
export interface Metadonnees {
    dateSeance: string;
    dateSeanceJour: string;
    numSeanceJour: string;
    numSeance: string;
    /** Assembly the compte rendu comes from. */
    typeAssemblee: "AN" | "SN";
    legislature: string;
    session: string;
    nomFichierJo: string;
    validite: string;
    etat: string;
    diffusion: string;
    version: string;
    environnement: string;
    /** NOTE(review): presumably an ISO string rather than a `Date` once read back from JSON — confirm. */
    heureGeneration: Date;
    /** Table of contents. */
    sommaire: Sommaire;
}
|
|
8
|
-
export interface
|
|
9
|
-
|
|
10
|
-
|
|
25
|
+
/** Body of a compte rendu. */
export interface Contenu {
    /** Date and session of the séance. */
    quantiemes: {
        journee: string;
        session: string;
    };
    /** Points (titles and interventions) of the séance. */
    point: Point[];
}
|
|
32
|
+
/** One entry of a compte rendu body. */
export interface Point {
    /** Position of the point within the séance, as a string. */
    ordre_absolu_seance: string;
    /** Kind of point. */
    code_grammaire: string;
    /** Text of the point. */
    texte: {
        _: string;
    };
    /** Role of the speaker in the debate. */
    roledebat?: string;
    /** Speaker of the point. */
    orateurs?: {
        orateur: {
            nom: string;
            id: string;
            qualite: string;
        };
    };
    /** Style of the point. */
    code_style?: string;
}
|
|
48
|
+
/** A text node; the text itself is carried by `_`. */
export interface Texte {
    /** The text content. */
    _?: string;
    id_syceron?: string;
    stime?: string;
    sup?: string;
    /** One nested text node or a list of them. */
    lienAdt?: Texte | Texte[];
}
|
|
55
|
+
/** Table of contents of a compte rendu. Each level holds one element or a list of them. */
export interface Sommaire {
    /** Presidency line of the séance. */
    presidentSeance: Texte;
    /** First-level entries. */
    sommaire1: SommaireElement | SommaireElement[];
    /** Second-level entries. */
    sommaire2?: SommaireElement | SommaireElement[];
    /** Third-level entries. */
    sommaire3?: SommaireElement | SommaireElement[];
    /** Extra paragraph(s). */
    para?: Texte | Texte[];
}
|
|
62
|
+
/** One entry of the sommaire; may nest lower-level entries. */
export interface SommaireElement {
    /** Order number of the entry, when there is one. */
    valeur_pts_odj: string | undefined;
    /** Title of the entry. */
    titreStruct: TitreStruct;
    para?: Texte | Array<Texte | string>;
    sommaire2?: SommaireElement | SommaireElement[];
    sommaire3?: SommaireElement | SommaireElement[];
    presidentSeance?: Texte | Texte[];
    type_debat?: string;
}
|
|
71
|
+
/** Title of a sommaire entry. */
export interface TitreStruct {
    /** Identifier of the titled section. */
    id_syceron: string;
    /** Main title. */
    intitule?: string;
    /** Sub-title. */
    sousIntitule?: string;
    type_debat?: string;
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tricoteuses/senat",
|
|
3
|
-
"version": "2.9.
|
|
3
|
+
"version": "2.9.6",
|
|
4
4
|
"description": "Handle French Sénat's open data",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"France",
|
|
@@ -60,6 +60,7 @@
|
|
|
60
60
|
},
|
|
61
61
|
"dependencies": {
|
|
62
62
|
"@biryani/core": "^0.2.1",
|
|
63
|
+
"cheerio": "^1.1.2",
|
|
63
64
|
"command-line-args": "^5.1.1",
|
|
64
65
|
"dotenv": "^8.2.0",
|
|
65
66
|
"fs-extra": "^9.1.0",
|