@tricoteuses/senat 2.22.0 → 2.22.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129) hide show
  1. package/package.json +2 -2
  2. package/lib/config.d.ts +0 -3
  3. package/lib/config.js +0 -16
  4. package/lib/databases.d.ts +0 -2
  5. package/lib/databases.js +0 -26
  6. package/lib/datasets.d.ts +0 -34
  7. package/lib/datasets.js +0 -233
  8. package/lib/git.d.ts +0 -26
  9. package/lib/git.js +0 -167
  10. package/lib/index.d.ts +0 -13
  11. package/lib/index.js +0 -1
  12. package/lib/loaders.d.ts +0 -58
  13. package/lib/loaders.js +0 -286
  14. package/lib/model/agenda.d.ts +0 -6
  15. package/lib/model/agenda.js +0 -148
  16. package/lib/model/ameli.d.ts +0 -51
  17. package/lib/model/ameli.js +0 -149
  18. package/lib/model/commission.d.ts +0 -18
  19. package/lib/model/commission.js +0 -269
  20. package/lib/model/debats.d.ts +0 -67
  21. package/lib/model/debats.js +0 -95
  22. package/lib/model/documents.d.ts +0 -12
  23. package/lib/model/documents.js +0 -151
  24. package/lib/model/dosleg.d.ts +0 -7
  25. package/lib/model/dosleg.js +0 -326
  26. package/lib/model/index.d.ts +0 -7
  27. package/lib/model/index.js +0 -7
  28. package/lib/model/questions.d.ts +0 -45
  29. package/lib/model/questions.js +0 -89
  30. package/lib/model/scrutins.d.ts +0 -13
  31. package/lib/model/scrutins.js +0 -114
  32. package/lib/model/seance.d.ts +0 -3
  33. package/lib/model/seance.js +0 -267
  34. package/lib/model/sens.d.ts +0 -146
  35. package/lib/model/sens.js +0 -454
  36. package/lib/model/texte.d.ts +0 -7
  37. package/lib/model/texte.js +0 -228
  38. package/lib/model/util.d.ts +0 -9
  39. package/lib/model/util.js +0 -38
  40. package/lib/parsers/texte.d.ts +0 -7
  41. package/lib/parsers/texte.js +0 -228
  42. package/lib/raw_types/ameli.d.ts +0 -914
  43. package/lib/raw_types/ameli.js +0 -5
  44. package/lib/raw_types/debats.d.ts +0 -207
  45. package/lib/raw_types/debats.js +0 -5
  46. package/lib/raw_types/dosleg.d.ts +0 -1619
  47. package/lib/raw_types/dosleg.js +0 -5
  48. package/lib/raw_types/questions.d.ts +0 -419
  49. package/lib/raw_types/questions.js +0 -5
  50. package/lib/raw_types/senat.d.ts +0 -11368
  51. package/lib/raw_types/senat.js +0 -5
  52. package/lib/raw_types/sens.d.ts +0 -8248
  53. package/lib/raw_types/sens.js +0 -5
  54. package/lib/raw_types_schemats/ameli.d.ts +0 -539
  55. package/lib/raw_types_schemats/ameli.js +0 -2
  56. package/lib/raw_types_schemats/debats.d.ts +0 -127
  57. package/lib/raw_types_schemats/debats.js +0 -2
  58. package/lib/raw_types_schemats/dosleg.d.ts +0 -977
  59. package/lib/raw_types_schemats/dosleg.js +0 -2
  60. package/lib/raw_types_schemats/questions.d.ts +0 -235
  61. package/lib/raw_types_schemats/questions.js +0 -2
  62. package/lib/raw_types_schemats/sens.d.ts +0 -6915
  63. package/lib/raw_types_schemats/sens.js +0 -2
  64. package/lib/scripts/convert_data.d.ts +0 -1
  65. package/lib/scripts/convert_data.js +0 -354
  66. package/lib/scripts/data-download.d.ts +0 -1
  67. package/lib/scripts/data-download.js +0 -12
  68. package/lib/scripts/datautil.d.ts +0 -8
  69. package/lib/scripts/datautil.js +0 -34
  70. package/lib/scripts/parse_textes.d.ts +0 -1
  71. package/lib/scripts/parse_textes.js +0 -44
  72. package/lib/scripts/retrieve_agenda.d.ts +0 -1
  73. package/lib/scripts/retrieve_agenda.js +0 -132
  74. package/lib/scripts/retrieve_cr_commission.d.ts +0 -1
  75. package/lib/scripts/retrieve_cr_commission.js +0 -364
  76. package/lib/scripts/retrieve_cr_seance.d.ts +0 -6
  77. package/lib/scripts/retrieve_cr_seance.js +0 -347
  78. package/lib/scripts/retrieve_documents.d.ts +0 -3
  79. package/lib/scripts/retrieve_documents.js +0 -219
  80. package/lib/scripts/retrieve_open_data.d.ts +0 -1
  81. package/lib/scripts/retrieve_open_data.js +0 -315
  82. package/lib/scripts/retrieve_senateurs_photos.d.ts +0 -1
  83. package/lib/scripts/retrieve_senateurs_photos.js +0 -147
  84. package/lib/scripts/retrieve_videos.d.ts +0 -1
  85. package/lib/scripts/retrieve_videos.js +0 -461
  86. package/lib/scripts/shared/cli_helpers.d.ts +0 -95
  87. package/lib/scripts/shared/cli_helpers.js +0 -91
  88. package/lib/scripts/shared/util.d.ts +0 -4
  89. package/lib/scripts/shared/util.js +0 -35
  90. package/lib/scripts/test_iter_load.d.ts +0 -1
  91. package/lib/scripts/test_iter_load.js +0 -12
  92. package/lib/strings.d.ts +0 -1
  93. package/lib/strings.js +0 -18
  94. package/lib/types/agenda.d.ts +0 -44
  95. package/lib/types/agenda.js +0 -1
  96. package/lib/types/ameli.d.ts +0 -5
  97. package/lib/types/ameli.js +0 -1
  98. package/lib/types/compte_rendu.d.ts +0 -83
  99. package/lib/types/compte_rendu.js +0 -1
  100. package/lib/types/debats.d.ts +0 -2
  101. package/lib/types/debats.js +0 -1
  102. package/lib/types/dosleg.d.ts +0 -70
  103. package/lib/types/dosleg.js +0 -1
  104. package/lib/types/questions.d.ts +0 -2
  105. package/lib/types/questions.js +0 -1
  106. package/lib/types/sens.d.ts +0 -10
  107. package/lib/types/sens.js +0 -1
  108. package/lib/types/sessions.d.ts +0 -5
  109. package/lib/types/sessions.js +0 -84
  110. package/lib/types/texte.d.ts +0 -74
  111. package/lib/types/texte.js +0 -16
  112. package/lib/utils/cr_spliting.d.ts +0 -28
  113. package/lib/utils/cr_spliting.js +0 -265
  114. package/lib/utils/date.d.ts +0 -10
  115. package/lib/utils/date.js +0 -100
  116. package/lib/utils/nvs-timecode.d.ts +0 -7
  117. package/lib/utils/nvs-timecode.js +0 -79
  118. package/lib/utils/reunion_grouping.d.ts +0 -11
  119. package/lib/utils/reunion_grouping.js +0 -337
  120. package/lib/utils/reunion_odj_building.d.ts +0 -5
  121. package/lib/utils/reunion_odj_building.js +0 -154
  122. package/lib/utils/reunion_parsing.d.ts +0 -23
  123. package/lib/utils/reunion_parsing.js +0 -209
  124. package/lib/utils/scoring.d.ts +0 -14
  125. package/lib/utils/scoring.js +0 -147
  126. package/lib/utils/string_cleaning.d.ts +0 -7
  127. package/lib/utils/string_cleaning.js +0 -57
  128. package/lib/validators/config.d.ts +0 -1
  129. package/lib/validators/config.js +0 -54
@@ -1,347 +0,0 @@
1
- /**
2
- * Needs to be run after the retrieve_agenda.ts script!
3
- * - downloads the ZIP of comptes-rendus des débats (CRI) from data.senat.fr
4
- * - extracts XML files, distributes them by session/year
5
- */
6
- import assert from "assert";
7
- import commandLineArgs from "command-line-args";
8
- import fs, { ensureDirSync } from "fs-extra";
9
- import path from "path";
10
- import StreamZip from "node-stream-zip";
11
- import * as cheerio from "cheerio";
12
- import { AGENDA_FOLDER, COMPTES_RENDUS_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER } from "../loaders";
13
- import { commonOptions } from "./shared/cli_helpers";
14
- import { parseCompteRenduIntervalFromFile, sessionStartYearFromDate } from "../model/seance";
15
- import { extractSommaireBlocks, makeReunionUid } from "../utils/reunion_parsing";
16
- import { getSessionsFromStart } from "../types/sessions";
17
- import { ensureAndClearDir, fetchWithRetry } from "./shared/util";
18
- import { isNoiseBlock, scoreSommaireBlockForEvent } from "../utils/scoring";
19
- import { parseYYYYMMDD } from "../utils/date";
20
// Command-line flags accepted by this script: the options shared by every
// script, plus the switch that turns on the conversion of debates into JSON.
const optionsDefinitions = [
  ...commonOptions,
  {
    name: "parseDebats",
    type: Boolean,
    help: "parse and convert comptes-rendus des débats into JSON",
  },
];
// Parsed once at module load; read by the helpers below.
const options = commandLineArgs(optionsDefinitions);
// URL of the global archive of comptes-rendus (CRI) that this script downloads.
const CRI_ZIP_URL = "https://data.senat.fr/data/debats/cri.zip";
30
/**
 * Error raised when a compte-rendu resource cannot be retrieved.
 *
 * The message embeds both the URL and the underlying reason; the URL is also
 * kept on the instance so callers can inspect it without parsing the message.
 */
class CompteRenduError extends Error {
  /**
   * @param {string} message - Reason of the failure (for instance the HTTP status).
   * @param {string} url - URL that was being retrieved.
   */
  constructor(message, url) {
    super(`An error occurred while retrieving ${url}: ${message}`);
    // Without this, the error is reported with the generic "Error" name.
    this.name = "CompteRenduError";
    this.url = url;
  }
}
35
/**
 * Fetches the global CRI archive and stores it at `zipPath`.
 *
 * A 404 answer is only reported with a warning and nothing is written; any
 * other non-OK answer raises a CompteRenduError carrying the HTTP status.
 *
 * @param {string} zipPath - Destination path of the downloaded archive.
 * @returns {Promise<void>}
 */
async function downloadCriZip(zipPath) {
  const verbose = !options["silent"];
  if (verbose) {
    console.log(`Downloading CRI zip ${CRI_ZIP_URL}…`);
  }

  const response = await fetchWithRetry(CRI_ZIP_URL);
  if (!response.ok && response.status === 404) {
    console.warn(`CRI zip ${CRI_ZIP_URL} not found`);
    return;
  }
  if (!response.ok) {
    throw new CompteRenduError(String(response.status), CRI_ZIP_URL);
  }

  // The whole archive is held in memory before being written to disk.
  const payload = Buffer.from(await response.arrayBuffer());
  await fs.writeFile(zipPath, payload);

  if (verbose) {
    const sizeInMb = (payload.length / (1024 * 1024)).toFixed(1);
    console.log(`[CRI] Downloaded ${sizeInMb} MB → ${zipPath}`);
  }
}
53
/**
 * Extracts every `dYYYYMMDD.xml` entry of the archive into
 * `<originalRoot>/<session>/`, where the session is derived from the date
 * embedded in the file name.
 *
 * Entries whose base name does not match the pattern, or whose date cannot be
 * parsed, are ignored.
 *
 * @param {string} zipPath - Path of the CRI archive.
 * @param {string} originalRoot - Root folder receiving one sub-folder per session.
 * @returns {Promise<number>} Number of XML files extracted.
 */
async function extractAndDistributeXmlBySession(zipPath, originalRoot) {
  const zip = new StreamZip.async({ file: zipPath });
  let count = 0;
  try {
    const entries = await zip.entries();
    for (const entryName of Object.keys(entries)) {
      // ex: d20231005.xml — the case-insensitive pattern also rejects non-XML entries.
      const base = path.basename(entryName);
      const m = base.match(/^d(\d{8})\.xml$/i);
      if (!m) {
        continue;
      }
      const dt = parseYYYYMMDD(m[1]);
      if (!dt) {
        continue;
      }
      const session = sessionStartYearFromDate(dt);
      const destDir = path.join(originalRoot, String(session));
      await fs.ensureDir(destDir);
      await zip.extract(entryName, path.join(destDir, base));
      count++;
    }
  }
  finally {
    // Always release the archive handle, even when an extraction fails.
    await zip.close();
  }
  return count;
}
79
/**
 * Re-links the already converted comptes-rendus of one sitting day to their
 * agenda reunions.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string} transformedSessionDir - Folder holding the converted JSON files of the session.
 * @param {string} yyyymmdd - Sitting day, as found in the XML file name.
 * @param {number|string} session - Session the day belongs to.
 * @returns {Promise<boolean>} True when at least one converted file exists for that day.
 */
async function relinkExistingCrForDay(dataDir, transformedSessionDir, yyyymmdd, session) {
  const files = await fs.readdir(transformedSessionDir);
  const dayFiles = files.filter((fn) => fn.startsWith(`CRSSN${yyyymmdd}E`) && fn.endsWith(".json"));
  for (const fn of dayFiles) {
    const eventId = fn.match(/^CRSSN(\d{8})E(.+)\.json$/)?.[2];
    if (!eventId) {
      continue;
    }
    try {
      const cr = await fs.readJSON(path.join(transformedSessionDir, fn));
      await linkCriEventIntoAgenda(dataDir, yyyymmdd, eventId, cr.uid, cr, session);
    }
    catch (e) {
      console.warn(`[CR] [${session}] Could not relink existing CR into a reunion for ${yyyymmdd} event=${eventId}:`, e);
    }
  }
  return dayFiles.length > 0;
}

/**
 * Downloads the global CRI archive, distributes its XML files by session and,
 * when `options.parseDebats` is set, splits each day into one JSON
 * compte-rendu per agenda event and links it to the matching reunion.
 *
 * Options read here: `keepDir`, `fromSession`, `parseDebats`, `only-recent`.
 *
 * @param {string} dataDir - Root data directory.
 * @param {object} [options] - Script options.
 * @returns {Promise<void>}
 */
export async function retrieveCriXmlDump(dataDir, options = {}) {
  const root = path.join(dataDir, COMPTES_RENDUS_FOLDER);
  ensureDirSync(root);
  const originalRoot = path.join(root, DATA_ORIGINAL_FOLDER);
  const transformedRoot = path.join(root, DATA_TRANSFORMED_FOLDER);
  for (const dir of [originalRoot, transformedRoot]) {
    if (options["keepDir"]) {
      fs.ensureDirSync(dir);
    }
    else {
      ensureAndClearDir(dir);
    }
  }
  const sessions = getSessionsFromStart(options["fromSession"]);

  // 1) Download the global ZIP + distribute its content by session
  const zipPath = path.join(dataDir, "cri.zip");
  console.log("[CRI] Downloading global CRI zip…");
  await downloadCriZip(zipPath);
  if (await fs.pathExists(zipPath)) {
    console.log("[CRI] Extracting + distributing XMLs by session…");
    // Drop the XML files of the requested sessions so that they are replaced by fresh ones.
    for (const session of sessions) {
      const dir = path.join(originalRoot, String(session));
      if (await fs.pathExists(dir)) {
        for (const f of await fs.readdir(dir)) {
          if (/\.xml$/i.test(f)) {
            await fs.remove(path.join(dir, f));
          }
        }
      }
    }
    const n = await extractAndDistributeXmlBySession(zipPath, originalRoot);
    if (n === 0) {
      console.warn("[CRI] No XML extracted. Archive empty or layout changed?");
    }
    else {
      console.log(`[CRI] Distributed ${n} XML file(s) into session folders.`);
    }
  }
  else {
    // The download tolerates a 404: without an archive, keep whatever XML is
    // already on disk instead of purging it and failing on a missing file.
    console.warn(`[CRI] No archive found at ${zipPath} → skipping extraction.`);
  }

  if (!options["parseDebats"]) {
    console.log("[CRI] parseDebats not requested → done.");
    return;
  }

  // 2) Convert each day of each session
  for (const session of sessions) {
    const originalSessionDir = path.join(originalRoot, String(session));
    if (!(await fs.pathExists(originalSessionDir))) {
      continue;
    }
    const xmlFiles = (await fs.readdir(originalSessionDir)).filter((f) => /^d\d{8}\.xml$/i.test(f)).sort();
    const transformedSessionDir = path.join(transformedRoot, String(session));
    await fs.ensureDir(transformedSessionDir);
    const now = Date.now();
    for (const f of xmlFiles) {
      const yyyymmdd = f.slice(1, 9);
      const xmlPath = path.join(originalSessionDir, f);

      // === ONLY-RECENT: an old day that was already converted is only re-linked ===
      if (options["only-recent"]) {
        const cutoff = now - options["only-recent"] * 24 * 3600 * 1000;
        const seanceTs = Date.parse(`${yyyymmdd.slice(0, 4)}-${yyyymmdd.slice(4, 6)}-${yyyymmdd.slice(6, 8)}`);
        if (seanceTs < cutoff && (await relinkExistingCrForDay(dataDir, transformedSessionDir, yyyymmdd, session))) {
          continue;
        }
      }

      // === Load the day's "séance publique" events from the grouped agendas ===
      const dayEvents = await loadAgendaSpEventsForDate(dataDir, yyyymmdd, session);
      if (dayEvents.length === 0) {
        console.warn(`[CRI] [${session}] No agenda SP events found for ${yyyymmdd} → skip split/link`);
        continue;
      }

      // === Read the XML + build the DOM index (element → position in document order) ===
      let $;
      let order;
      let idx;
      try {
        const raw = await fs.readFile(xmlPath, "utf8");
        $ = cheerio.load(raw, { xml: false });
        order = $("body *").toArray();
        idx = new Map(order.map((el, i) => [el, i]));
      }
      catch (e) {
        console.warn(`[CRI] [${session}] Cannot read/parse ${f}:`, e);
        continue;
      }

      // === Extract the summary blocks + match them to the agenda events ===
      const blocks = extractSommaireBlocks($, idx);
      const intervals = buildIntervalsByAgendaEvents($, idx, order, blocks, dayEvents);
      if (!intervals.length) {
        console.warn(`[CRI] [${session}] No confident split intervals for ${yyyymmdd} → skip`);
        continue;
      }

      // === Parse / write / link each per-event segment ===
      for (const iv of intervals) {
        const outName = `CRSSN${yyyymmdd}E${iv.agendaEventId}.json`;
        const outPath = path.join(transformedSessionDir, outName);
        const cr = await parseCompteRenduIntervalFromFile(xmlPath, iv.startIndex, iv.endIndex, iv.agendaEventId);
        if (!cr) {
          console.warn(`[CRI] [${session}] Empty or no points for ${yyyymmdd} event=${iv.agendaEventId} → skip`);
          continue;
        }
        // transformedSessionDir was created once at the start of the session.
        await fs.writeJSON(outPath, cr, { spaces: 2 });
        try {
          await linkCriEventIntoAgenda(dataDir, yyyymmdd, iv.agendaEventId, cr.uid, cr, session);
        }
        catch (e) {
          console.warn(`[CR] [${session}] Could not link CR into agenda for ${yyyymmdd} event=${iv.agendaEventId}:`, e);
        }
      }
    }
  }
}
208
/**
 * Records the UID of a compte-rendu on the reunion file of the matching
 * "SP" agenda event.
 *
 * The reunion file name is derived from the date and the event id through
 * `makeReunionUid`. When that file is missing or unreadable, a warning is
 * logged and nothing is written.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string} yyyymmdd - Sitting day.
 * @param {string} agendaEventId - Id of the agenda event.
 * @param {string} crUid - UID of the compte-rendu to reference.
 * @param {object} cr - Compte-rendu (not read by this function).
 * @param {number|string} session - Session the day belongs to.
 * @returns {Promise<void>}
 */
async function linkCriEventIntoAgenda(dataDir, yyyymmdd, agendaEventId, crUid, cr, session) {
  const reunionsDir = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, session.toString());
  fs.ensureDirSync(reunionsDir);

  const year = yyyymmdd.slice(0, 4);
  const month = yyyymmdd.slice(4, 6);
  const day = yyyymmdd.slice(6, 8);
  const reunionUid = makeReunionUid(`${year}-${month}-${day}`, "SP", agendaEventId, null);
  const reunionPath = path.join(reunionsDir, `${reunionUid}.json`);

  let reunion = null;
  const reunionExists = await fs.pathExists(reunionPath);
  if (reunionExists) {
    try {
      reunion = await fs.readJSON(reunionPath);
    }
    catch (e) {
      console.warn(`[CR] unreadable reunion JSON → ${reunionPath} (${e})`);
    }
  }
  if (!reunion) {
    console.warn(`[CR] Missing reunion file for SP event=${agendaEventId}: ${reunionPath}`);
    return;
  }

  reunion.compteRenduRefUid = crUid;
  await fs.writeJSON(reunionPath, reunion, { spaces: 2 });
  console.log(`[CR] Linked CR ${crUid} → ${path.basename(reunionPath)} (event=${agendaEventId})`);
}
233
/**
 * Splits the document into one index interval per agenda event.
 *
 * Each non-noise summary block is scored against every event of the day. A
 * block becomes a pivot only when its best score reaches 0.65 and beats the
 * runner-up by at least 0.08. Only the earliest pivot of each event is kept,
 * and an interval runs from its pivot to just before the next pivot (or to
 * the last element of `order`).
 *
 * @param {Function} $ - Loaded cheerio document.
 * @param {Map} idx - Element → position in `order`.
 * @param {Array} order - Elements of the body in document order.
 * @param {Array} blocks - Summary blocks (`text`, `targetId`, `startIndex` are read).
 * @param {Array} dayEvents - Agenda events of the day (`id` is read).
 * @returns {Array<{agendaEventId: *, startIndex: number, endIndex: number, score: number}>}
 */
function buildIntervalsByAgendaEvents($, idx, order, blocks, dayEvents) {
  const MIN_SCORE = 0.65;
  const MIN_GAP = 0.08;

  const firstSpeaker = $("div.intervenant").first()[0];
  const firstSpeakerIdx = firstSpeaker ? (idx.get(firstSpeaker) ?? null) : null;

  const candidates = [];
  for (const block of blocks) {
    if (isNoiseBlock(block.text)) {
      continue;
    }

    // Track the best scoring event and the runner-up score.
    let top = null;
    let runnerUp = 0;
    for (const event of dayEvents) {
      const score = scoreSommaireBlockForEvent(block.text, event);
      if (!top || score > top.score) {
        runnerUp = top?.score ?? runnerUp;
        top = { ev: event, score };
      }
      else if (score > runnerUp) {
        runnerUp = score;
      }
    }
    if (!top) {
      continue;
    }

    const targetIndex = resolveTargetIndex($, idx, block.targetId);
    const contentStartIndex = targetIndex ?? block.startIndex;
    // A block with no resolvable target that sits before the first speaker is skipped.
    const precedesFirstSpeaker = firstSpeakerIdx != null && contentStartIndex < firstSpeakerIdx;
    if (precedesFirstSpeaker && targetIndex == null) {
      continue;
    }
    const isConfident = top.score >= MIN_SCORE && top.score - runnerUp >= MIN_GAP;
    if (!isConfident) {
      continue;
    }

    candidates.push({
      agendaEventId: top.ev.id,
      startIndex: contentStartIndex,
      score: top.score,
    });
  }
  if (candidates.length === 0) {
    return [];
  }

  // Deduplicate per event (the first startIndex wins)
  const byStart = (a, b) => a.startIndex - b.startIndex;
  candidates.sort(byStart);
  const firstByEvent = new Map();
  for (const { agendaEventId, startIndex, score } of candidates) {
    if (!firstByEvent.has(agendaEventId)) {
      firstByEvent.set(agendaEventId, { agendaEventId, startIndex, score });
    }
  }
  const ordered = [...firstByEvent.values()].sort(byStart);

  // Build the intervals
  return ordered.map((current, position) => {
    const following = ordered[position + 1];
    return {
      agendaEventId: current.agendaEventId,
      startIndex: current.startIndex,
      endIndex: following ? following.startIndex - 1 : order.length - 1,
      score: current.score,
    };
  });
}
305
/**
 * Loads the "Séance publique" events of one day from the transformed agenda
 * files of a session.
 *
 * Only files named `RUSN<yyyymmdd>IDS….json` are considered, and only the
 * first event of each file is looked at. A file that cannot be read or parsed
 * is reported with a warning and skipped.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string} yyyymmdd - Sitting day.
 * @param {number|string} session - Session the day belongs to.
 * @returns {Promise<Array>} Matching events; empty when the session folder does not exist.
 */
async function loadAgendaSpEventsForDate(dataDir, yyyymmdd, session) {
  const agendasDir = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, session.toString());
  if (!(await fs.pathExists(agendasDir))) {
    return [];
  }
  const files = (await fs.readdir(agendasDir)).filter((fn) => fn.startsWith(`RUSN${yyyymmdd}IDS`) && fn.endsWith(".json"));
  const events = [];
  for (const fn of files) {
    try {
      const g = await fs.readJSON(path.join(agendasDir, fn));
      const e = g?.events?.[0];
      if (e && e.type === "Séance publique") {
        events.push(e);
      }
    }
    catch (err) {
      // Best effort: one broken file must not hide the other events of the
      // day, but it should not go unnoticed either.
      console.warn(`[CRI] [${session}] Cannot read agenda file ${fn}:`, err);
    }
  }
  return events;
}
322
/**
 * Escapes a value so that it can be placed between double quotes in a CSS
 * attribute selector such as `[id="…"]`: backslashes and double quotes are
 * prefixed with a backslash.
 *
 * @param {*} s - Value to escape; non-string values are converted with `String()`.
 * @returns {string} The escaped text.
 */
function cssEscapeIdent(s) {
  // Coerce first: a numeric id would otherwise fail on `.replace`.
  return String(s).replace(/\\/g, "\\\\").replace(/"/g, '\\"');
}
325
/**
 * Finds the position of the element designated by `targetId`.
 *
 * The element is looked up by its `id` attribute first, then by its `name`
 * attribute.
 *
 * @param {Function} $ - Loaded cheerio document.
 * @param {Map} idx - Element → position map.
 * @param {string} [targetId] - Anchor to look for.
 * @returns {number|null} The position, or null when there is no target, no
 *   matching element, or no position recorded for that element.
 */
function resolveTargetIndex($, idx, targetId) {
  if (!targetId) {
    return null;
  }
  const quoted = cssEscapeIdent(targetId);
  const target = $(`[id="${quoted}"]`)[0] || $(`[name="${quoted}"]`)[0];
  if (!target) {
    return null;
  }
  return idx.get(target) ?? null;
}
335
/**
 * Script entry point: checks that a data directory was given, then runs the
 * whole CRI retrieval while timing it.
 */
async function main() {
  const timerLabel = "CRI processing time";
  const dataDir = options["dataDir"];
  assert(dataDir, "Missing argument: data directory");
  console.time(timerLabel);
  await retrieveCriXmlDump(dataDir, options);
  console.timeEnd(timerLabel);
}

// Exit code 0 on success; on failure, print the error and exit with code 1.
main()
  .then(() => process.exit(0))
  .catch((failure) => {
    console.error(failure);
    process.exit(1);
  });
@@ -1,3 +0,0 @@
1
- import { DocumentMetadata } from "../types/texte";
2
- export declare function processTexte(texteMetadata: DocumentMetadata, originalTextesDir: string, transformedTextesDir: string, options: any): Promise<void>;
3
- export declare function processRapport(rapportMetadata: any, originalRapportsDir: string, options: any): Promise<void>;
@@ -1,219 +0,0 @@
1
- import assert from "assert";
2
- import commandLineArgs from "command-line-args";
3
- import fs from "fs-extra";
4
- import { DateTime } from "luxon";
5
- import path from "path";
6
- import { DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER, iterLoadSenatRapportUrls, iterLoadSenatTexteUrls, RAPPORT_FOLDER, TEXTE_FOLDER, } from "../loaders";
7
- import { parseExposeDesMotifs, parseTexte, parseTexteFromFile } from "../parsers/texte";
8
- import { getSessionsFromStart, UNDEFINED_SESSION } from "../types/sessions";
9
- import { commonOptions } from "./shared/cli_helpers";
10
- import { ensureAndClearDir, fetchWithRetry, isOptionEmptyOrHasValue } from "./shared/util";
11
// Command-line flags accepted by this script, on top of the shared ones.
const optionsDefinitions = [
  ...commonOptions,
  {
    name: "formats",
    alias: "F",
    type: String,
    multiple: true,
    help: "formats of documents to retrieve (xml/html/pdf for textes, html/pdf for rapports); leave empty for all",
  },
  {
    name: "types",
    type: String,
    multiple: true,
    help: "types of documents to retrieve (textes/rapports); leave empty for all",
  },
  {
    name: "force",
    type: Boolean,
    help: "force retrieve all documents, even already retrieved ones",
  },
];
// Parsed once at module load; read by the retrieval functions below.
const options = commandLineArgs(optionsDefinitions);
// Shared decoder used to turn downloaded buffers into text before parsing.
const textDecoder = new TextDecoder("utf8");
// Reference instant for the "recent document" checks, fixed at module load.
const today = DateTime.now();
35
/**
 * Tells whether a document is at most `daysThreshold` days older than `today`.
 *
 * @param {string} [documentDate] - ISO date of the document.
 * @param {number} daysThreshold - Maximum age in days.
 * @returns {boolean} False when the date is missing or is not a valid ISO date.
 */
function isDocumentRecent(documentDate, daysThreshold) {
  if (!documentDate) {
    return false;
  }
  const parsed = DateTime.fromISO(documentDate);
  if (!parsed.isValid) {
    return false;
  }
  const ageInDays = today.diff(parsed, "days").days;
  return ageInDays <= daysThreshold;
}
41
/**
 * Decides whether a document has to be (re)downloaded.
 *
 * The answer is yes when `force` is set or when the file is not on disk yet.
 * Otherwise an existing file is refreshed only when `onlyRecent` is defined
 * and the document is recent enough.
 *
 * @param {string} filePath - Local path of the document.
 * @param {string} [docDate] - ISO date of the document.
 * @param {object} options - Download options (`force`, `onlyRecent` are read).
 * @returns {boolean}
 */
function shouldDownload(filePath, docDate, options) {
  const mustFetch = options.force || !fs.existsSync(filePath);
  if (mustFetch) {
    return true;
  }
  return options.onlyRecent !== undefined ? isDocumentRecent(docDate, options.onlyRecent) : false;
}
51
/**
 * Downloads a document and returns its content.
 *
 * Any failure (non-OK HTTP answer, network error, error while reading the
 * body) yields `null` instead of throwing.
 *
 * @param {string} documentUrl - URL of the document.
 * @param {boolean} [verbose] - When truthy, logs the download and the HTTP failures.
 * @returns {Promise<ArrayBuffer|null>} The content, or null on failure.
 */
async function downloadDocument(documentUrl, verbose) {
  if (verbose) {
    console.log(`Downloading document ${documentUrl}…`);
  }
  try {
    const response = await fetchWithRetry(documentUrl);
    if (!response.ok) {
      if (response.status === 404) {
        if (verbose) {
          console.warn(`Document ${documentUrl} not found`);
        }
      }
      else {
        if (verbose) {
          console.error(`An error occurred while retrieving document ${documentUrl}: ${response.status}`);
        }
      }
      return null;
    }
    // Await here: returning the bare promise would let a failure while
    // reading the body escape the catch below.
    return await response.arrayBuffer();
  }
  catch (error) {
    // Anything can be thrown, not only Error instances.
    console.error(error?.message ?? error);
    return null;
  }
}
77
/**
 * Downloads one document to `destPath` unless it is already up to date.
 *
 * @param {string} url - URL of the document.
 * @param {string} destPath - Local destination path.
 * @param {string} [docDate] - ISO date of the document.
 * @param {object} options - Download options (`verbose` plus those read by `shouldDownload`).
 * @returns {Promise<{success: boolean, skipped: boolean, buffer: Buffer|null}>}
 *   `skipped` is true when no download was needed; `buffer` holds the content
 *   only when a download actually happened.
 */
async function processDocument(url, destPath, docDate, options) {
  const needsDownload = shouldDownload(destPath, docDate, options);
  if (!needsDownload) {
    if (options.verbose) {
      console.info(`Already downloaded ${destPath}…`);
    }
    return { success: true, skipped: true, buffer: null };
  }

  const payload = await downloadDocument(url, options.verbose);
  if (payload) {
    const buffer = Buffer.from(payload);
    await fs.outputFile(destPath, buffer);
    return { success: true, skipped: false, buffer };
  }
  return { success: false, skipped: false, buffer: null };
}
91
/**
 * Retrieves every requested format of a texte (and its exposé des motifs) and,
 * when `options.parseDocuments` is set, converts the XML version to JSON.
 *
 * @param {object} texteMetadata - Metadata of the texte (`name`, `session`,
 *   `date`, `url_xml`, `url_html`, `url_pdf`, `url_expose_des_motifs` are read).
 * @param {string} originalTextesDir - Root folder of the downloaded textes.
 * @param {string} transformedTextesDir - Root folder of the converted textes.
 * @param {object} options - Download options (`formats`, `parseDocuments`,
 *   `verbose`, plus those read by `processDocument`).
 * @returns {Promise<void>}
 */
export async function processTexte(texteMetadata, originalTextesDir, transformedTextesDir, options) {
  const texteDir = path.join(originalTextesDir, `${texteMetadata.session ?? UNDEFINED_SESSION}`, texteMetadata.name);

  let exposeDesMotifsContent = null;
  if (texteMetadata.url_expose_des_motifs) {
    const exposePath = path.join(texteDir, `${texteMetadata.name}-expose.html`);
    const res = await processDocument(texteMetadata.url_expose_des_motifs.toString(), exposePath, texteMetadata.date, options);
    if (res.buffer) {
      exposeDesMotifsContent = res.buffer;
    }
    else if (res.skipped && options.parseDocuments) {
      // Nothing was downloaded: reuse the copy already on disk for parsing.
      if (await fs.pathExists(exposePath)) {
        exposeDesMotifsContent = await fs.readFile(exposePath);
      }
    }
  }

  const formats = [
    { type: "xml", url: texteMetadata.url_xml, isParseTarget: true },
    { type: "html", url: texteMetadata.url_html, isParseTarget: false },
    { type: "pdf", url: texteMetadata.url_pdf, isParseTarget: false },
  ];
  for (const format of formats) {
    if (!isOptionEmptyOrHasValue(options.formats, format.type)) {
      continue;
    }
    if (!format.url) {
      continue;
    }
    const destPath = path.join(texteDir, `${texteMetadata.name}.${format.type}`);
    const result = await processDocument(format.url.toString(), destPath, texteMetadata.date, options);

    // Specific logic: Parsing (Only applies to XML)
    if (format.isParseTarget && options.parseDocuments) {
      // Without fresh content, parsing falls back to the file on disk: skip
      // when the download failed and no earlier copy exists.
      if (!result.buffer && !(await fs.pathExists(destPath))) {
        if (options.verbose) {
          console.warn(`Cannot parse texte ${destPath}: file not available`);
        }
        continue;
      }
      await parseDocument(texteMetadata.session, transformedTextesDir, destPath, texteMetadata.name, result.buffer, exposeDesMotifsContent, options);
    }
  }
}
124
/**
 * Retrieves every requested format (HTML, PDF) of a rapport.
 *
 * @param {object} rapportMetadata - Metadata of the rapport (`name`, `session`,
 *   `date`, `url_html`, `url_pdf` are read).
 * @param {string} originalRapportsDir - Root folder of the downloaded rapports.
 * @param {object} options - Download options (`formats` plus those read by `processDocument`).
 * @returns {Promise<void>}
 */
export async function processRapport(rapportMetadata, originalRapportsDir, options) {
  const rapportDir = path.join(originalRapportsDir, `${rapportMetadata.session ?? UNDEFINED_SESSION}`, rapportMetadata.name);
  const formats = [
    { type: "html", url: rapportMetadata.url_html },
    { type: "pdf", url: rapportMetadata.url_pdf },
  ];
  for (const format of formats) {
    if (!isOptionEmptyOrHasValue(options["formats"], format.type)) {
      continue;
    }
    // A rapport may lack one of the formats: skip it instead of failing on
    // `toString()`, as is done for textes.
    if (!format.url) {
      continue;
    }
    const destPath = path.join(rapportDir, `${rapportMetadata.name}.${format.type}`);
    await processDocument(format.url.toString(), destPath, rapportMetadata.date, options);
  }
}
137
/**
 * Retrieves the textes of the given sessions, one after the other.
 *
 * When the `parseDocuments` option is set, the folder of converted textes is
 * emptied first.
 *
 * @param {string} dataDir - Root data directory.
 * @param {Array} sessions - Sessions to process.
 * @returns {Promise<void>}
 */
async function retrieveTextes(dataDir, sessions) {
  const textesRoot = path.join(dataDir, TEXTE_FOLDER);
  const originalTextesDir = path.join(textesRoot, DATA_ORIGINAL_FOLDER);
  const transformedTextesDir = path.join(textesRoot, DATA_TRANSFORMED_FOLDER);
  if (options["parseDocuments"]) {
    ensureAndClearDir(transformedTextesDir);
  }

  // Subset of the command-line options handed to the download helpers.
  const { force, silent, verbose, formats, parseDocuments } = options;
  const dlOptions = {
    force,
    silent,
    verbose,
    onlyRecent: options["only-recent"],
    formats,
    parseDocuments,
  };

  for (const session of sessions) {
    for (const { item: texteMetadata } of iterLoadSenatTexteUrls(dataDir, session)) {
      await processTexte(texteMetadata, originalTextesDir, transformedTextesDir, dlOptions);
    }
  }
}
157
/**
 * Retrieves the rapports of the given sessions, one after the other.
 *
 * @param {string} dataDir - Root data directory.
 * @param {Array} sessions - Sessions to process.
 * @returns {Promise<void>}
 */
async function retrieveRapports(dataDir, sessions) {
  const originalRapportsDir = path.join(dataDir, RAPPORT_FOLDER, DATA_ORIGINAL_FOLDER);

  // Subset of the command-line options handed to the download helpers.
  const { force, silent, verbose, formats } = options;
  const dlOptions = {
    force,
    silent,
    verbose,
    onlyRecent: options["only-recent"],
    formats,
  };

  for (const session of sessions) {
    for (const { item: rapportMetadata } of iterLoadSenatRapportUrls(dataDir, session)) {
      await processRapport(rapportMetadata, originalRapportsDir, dlOptions);
    }
  }
}
172
/**
 * Converts a texte to JSON and writes it under
 * `<transformedTextesDir>/<session>/<texteName>/<texteName>.json`.
 *
 * The texte is parsed from `texteBuffer` when provided, from `textePath`
 * otherwise. When an exposé des motifs is given, its parsed form is attached
 * to the result.
 *
 * @param {number|string} [session] - Session of the texte.
 * @param {string} transformedTextesDir - Root folder of the converted textes.
 * @param {string} textePath - Path of the XML file on disk.
 * @param {string} texteName - Name of the texte.
 * @param {Buffer|null} texteBuffer - Freshly downloaded XML content, if any.
 * @param {Buffer|null} [exposeDesMotifs] - HTML content of the exposé des motifs.
 * @param {object} [options] - `verbose` is read.
 * @returns {Promise<object|null>} The parsed texte, or null when parsing yields nothing.
 */
async function parseDocument(session, transformedTextesDir, textePath, texteName, texteBuffer, exposeDesMotifs = null, options = {}) {
  if (options.verbose) {
    console.log(`Parsing texte ${textePath}…`);
  }

  const parsedTexte = texteBuffer
    ? parseTexte(textDecoder.decode(texteBuffer))
    : await parseTexteFromFile(textePath);
  if (!parsedTexte) {
    return null;
  }

  if (exposeDesMotifs) {
    if (options.verbose) {
      console.log("Parsing exposé des motifs…");
    }
    parsedTexte.exposeDesMotifs = parseExposeDesMotifs(textDecoder.decode(exposeDesMotifs));
  }

  const sessionFolder = `${session ?? UNDEFINED_SESSION}`;
  const outputPath = path.join(transformedTextesDir, sessionFolder, texteName, `${texteName}.json`);
  await fs.outputJSON(outputPath, parsedTexte, { spaces: 2 });
  return parsedTexte;
}
197
/**
 * Script entry point: retrieves the requested document types (textes and/or
 * rapports) for every session starting at the `fromSession` option.
 */
async function main() {
  const dataDir = options["dataDir"];
  assert(dataDir, "Missing argument: data directory");

  const sessions = getSessionsFromStart(options["fromSession"]);
  const requestedTypes = options["types"];
  const timerLabel = "documents processing time";

  console.time(timerLabel);
  if (isOptionEmptyOrHasValue(requestedTypes, "textes")) {
    await retrieveTextes(dataDir, sessions);
  }
  if (isOptionEmptyOrHasValue(requestedTypes, "rapports")) {
    await retrieveRapports(dataDir, sessions);
  }
  // The elapsed time is only reported when not running silently.
  if (!options["silent"]) {
    console.timeEnd(timerLabel);
  }
}
212
// Run the script only when this module is the program entry point, so that
// importing `processTexte` / `processRapport` from elsewhere has no side effect.
// The compiled output is a `.js` file, so both the TypeScript source name and
// its compiled counterparts are accepted; `process.argv[1]` may be undefined.
if (/retrieve_documents\.[cm]?[jt]s$/.test(process.argv[1] ?? "")) {
  main()
    .then(() => process.exit(0))
    .catch((error) => {
      // Fatal errors go to stderr.
      console.error(error);
      process.exit(1);
    });
}
@@ -1 +0,0 @@
1
- export {};