@tricoteuses/senat 2.22.11 → 2.22.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/config.d.ts +21 -0
- package/lib/config.js +27 -0
- package/lib/databases.d.ts +2 -0
- package/lib/databases.js +26 -0
- package/lib/datasets.d.ts +34 -0
- package/lib/datasets.js +233 -0
- package/lib/git.d.ts +26 -0
- package/lib/git.js +167 -0
- package/lib/index.d.ts +13 -0
- package/lib/index.js +1 -0
- package/lib/loaders.d.ts +58 -0
- package/lib/loaders.js +286 -0
- package/lib/model/agenda.d.ts +6 -0
- package/lib/model/agenda.js +148 -0
- package/lib/model/ameli.d.ts +51 -0
- package/lib/model/ameli.js +147 -0
- package/lib/model/commission.d.ts +18 -0
- package/lib/model/commission.js +269 -0
- package/lib/model/debats.d.ts +67 -0
- package/lib/model/debats.js +95 -0
- package/lib/model/documents.d.ts +12 -0
- package/lib/model/documents.js +138 -0
- package/lib/model/dosleg.d.ts +7 -0
- package/lib/model/dosleg.js +326 -0
- package/lib/model/index.d.ts +7 -0
- package/lib/model/index.js +7 -0
- package/lib/model/questions.d.ts +45 -0
- package/lib/model/questions.js +89 -0
- package/lib/model/scrutins.d.ts +13 -0
- package/lib/model/scrutins.js +114 -0
- package/lib/model/seance.d.ts +3 -0
- package/lib/model/seance.js +267 -0
- package/lib/model/sens.d.ts +146 -0
- package/lib/model/sens.js +454 -0
- package/lib/model/texte.d.ts +7 -0
- package/lib/model/texte.js +228 -0
- package/lib/model/util.d.ts +9 -0
- package/lib/model/util.js +38 -0
- package/lib/parsers/texte.d.ts +7 -0
- package/lib/parsers/texte.js +228 -0
- package/lib/raw_types/ameli.d.ts +914 -0
- package/lib/raw_types/ameli.js +5 -0
- package/lib/raw_types/debats.d.ts +207 -0
- package/lib/raw_types/debats.js +5 -0
- package/lib/raw_types/dosleg.d.ts +1619 -0
- package/lib/raw_types/dosleg.js +5 -0
- package/lib/raw_types/questions.d.ts +423 -0
- package/lib/raw_types/questions.js +5 -0
- package/lib/raw_types/senat.d.ts +11372 -0
- package/lib/raw_types/senat.js +5 -0
- package/lib/raw_types/sens.d.ts +8248 -0
- package/lib/raw_types/sens.js +5 -0
- package/lib/raw_types_schemats/ameli.d.ts +539 -0
- package/lib/raw_types_schemats/ameli.js +2 -0
- package/lib/raw_types_schemats/debats.d.ts +127 -0
- package/lib/raw_types_schemats/debats.js +2 -0
- package/lib/raw_types_schemats/dosleg.d.ts +977 -0
- package/lib/raw_types_schemats/dosleg.js +2 -0
- package/lib/raw_types_schemats/questions.d.ts +237 -0
- package/lib/raw_types_schemats/questions.js +2 -0
- package/lib/raw_types_schemats/sens.d.ts +6915 -0
- package/lib/raw_types_schemats/sens.js +2 -0
- package/lib/scripts/convert_data.d.ts +1 -0
- package/lib/scripts/convert_data.js +354 -0
- package/lib/scripts/data-download.d.ts +1 -0
- package/lib/scripts/data-download.js +12 -0
- package/lib/scripts/datautil.d.ts +8 -0
- package/lib/scripts/datautil.js +34 -0
- package/lib/scripts/parse_textes.d.ts +1 -0
- package/lib/scripts/parse_textes.js +44 -0
- package/lib/scripts/retrieve_agenda.d.ts +1 -0
- package/lib/scripts/retrieve_agenda.js +132 -0
- package/lib/scripts/retrieve_cr_commission.d.ts +1 -0
- package/lib/scripts/retrieve_cr_commission.js +364 -0
- package/lib/scripts/retrieve_cr_seance.d.ts +6 -0
- package/lib/scripts/retrieve_cr_seance.js +347 -0
- package/lib/scripts/retrieve_documents.d.ts +3 -0
- package/lib/scripts/retrieve_documents.js +219 -0
- package/lib/scripts/retrieve_open_data.d.ts +1 -0
- package/lib/scripts/retrieve_open_data.js +316 -0
- package/lib/scripts/retrieve_senateurs_photos.d.ts +1 -0
- package/lib/scripts/retrieve_senateurs_photos.js +147 -0
- package/lib/scripts/retrieve_videos.d.ts +1 -0
- package/lib/scripts/retrieve_videos.js +461 -0
- package/lib/scripts/shared/cli_helpers.d.ts +95 -0
- package/lib/scripts/shared/cli_helpers.js +91 -0
- package/lib/scripts/shared/util.d.ts +4 -0
- package/lib/scripts/shared/util.js +35 -0
- package/lib/scripts/test_iter_load.d.ts +1 -0
- package/lib/scripts/test_iter_load.js +12 -0
- package/lib/src/model/sens.d.ts +36 -0
- package/lib/src/model/sens.js +35 -4
- package/lib/src/scripts/retrieve_cr_commission.js +12 -0
- package/lib/src/scripts/retrieve_cr_seance.js +12 -0
- package/lib/src/scripts/retrieve_videos.js +13 -1
- package/lib/src/utils/nvs-timecode.d.ts +17 -0
- package/lib/src/utils/nvs-timecode.js +79 -0
- package/lib/src/utils/weights_scoring_config.d.ts +2 -0
- package/lib/src/utils/weights_scoring_config.js +15 -0
- package/lib/strings.d.ts +1 -0
- package/lib/strings.js +18 -0
- package/lib/types/agenda.d.ts +44 -0
- package/lib/types/agenda.js +1 -0
- package/lib/types/ameli.d.ts +5 -0
- package/lib/types/ameli.js +1 -0
- package/lib/types/compte_rendu.d.ts +83 -0
- package/lib/types/compte_rendu.js +1 -0
- package/lib/types/debats.d.ts +2 -0
- package/lib/types/debats.js +1 -0
- package/lib/types/dosleg.d.ts +70 -0
- package/lib/types/dosleg.js +1 -0
- package/lib/types/questions.d.ts +2 -0
- package/lib/types/questions.js +1 -0
- package/lib/types/sens.d.ts +10 -0
- package/lib/types/sens.js +1 -0
- package/lib/types/sessions.d.ts +5 -0
- package/lib/types/sessions.js +84 -0
- package/lib/types/texte.d.ts +74 -0
- package/lib/types/texte.js +16 -0
- package/lib/utils/cr_spliting.d.ts +28 -0
- package/lib/utils/cr_spliting.js +265 -0
- package/lib/utils/date.d.ts +10 -0
- package/lib/utils/date.js +100 -0
- package/lib/utils/nvs-timecode.d.ts +7 -0
- package/lib/utils/nvs-timecode.js +79 -0
- package/lib/utils/reunion_grouping.d.ts +11 -0
- package/lib/utils/reunion_grouping.js +337 -0
- package/lib/utils/reunion_odj_building.d.ts +5 -0
- package/lib/utils/reunion_odj_building.js +154 -0
- package/lib/utils/reunion_parsing.d.ts +23 -0
- package/lib/utils/reunion_parsing.js +209 -0
- package/lib/utils/scoring.d.ts +14 -0
- package/lib/utils/scoring.js +147 -0
- package/lib/utils/string_cleaning.d.ts +7 -0
- package/lib/utils/string_cleaning.js +57 -0
- package/lib/validators/config.d.ts +9 -0
- package/lib/validators/config.js +10 -0
- package/package.json +1 -1
|
@@ -0,0 +1,347 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Needs to be run after the retrieve_agenda.ts script!
|
|
3
|
+
* - downloads the ZIP of comptes-rendus des débats (CRI) from data.senat.fr
|
|
4
|
+
* - extracts XML files, distributes them by session/year
|
|
5
|
+
*/
|
|
6
|
+
import assert from "assert";
|
|
7
|
+
import commandLineArgs from "command-line-args";
|
|
8
|
+
import fs, { ensureDirSync } from "fs-extra";
|
|
9
|
+
import path from "path";
|
|
10
|
+
import StreamZip from "node-stream-zip";
|
|
11
|
+
import * as cheerio from "cheerio";
|
|
12
|
+
import { AGENDA_FOLDER, COMPTES_RENDUS_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER } from "../loaders";
|
|
13
|
+
import { commonOptions } from "./shared/cli_helpers";
|
|
14
|
+
import { parseCompteRenduIntervalFromFile, sessionStartYearFromDate } from "../model/seance";
|
|
15
|
+
import { extractSommaireBlocks, makeReunionUid } from "../utils/reunion_parsing";
|
|
16
|
+
import { getSessionsFromStart } from "../types/sessions";
|
|
17
|
+
import { ensureAndClearDir, fetchWithRetry } from "./shared/util";
|
|
18
|
+
import { isNoiseBlock, scoreSommaireBlockForEvent } from "../utils/scoring";
|
|
19
|
+
import { parseYYYYMMDD } from "../utils/date";
|
|
20
|
+
// Location of the ZIP of comptes-rendus des débats (CRI) on data.senat.fr.
const CRI_ZIP_URL = "https://data.senat.fr/data/debats/cri.zip";
// Flag specific to this script; it is appended to the options shared by every script.
const parseDebatsOption = {
    name: "parseDebats",
    type: Boolean,
    help: "parse and convert comptes-rendus des débats into JSON",
};
const optionsDefinitions = [...commonOptions, parseDebatsOption];
// Command line parsed once at module load; read by downloadCriZip() and main().
const options = commandLineArgs(optionsDefinitions);
|
|
30
|
+
/**
 * Error raised when a compte-rendu resource could not be retrieved.
 *
 * The message embeds both the URL and the reason; the URL is additionally
 * exposed as the `url` property so callers can inspect it without parsing
 * the message.
 */
class CompteRenduError extends Error {
    /**
     * @param {string} message - Short reason of the failure (e.g. the HTTP status).
     * @param {string} url - URL whose retrieval failed.
     */
    constructor(message, url) {
        super(`An error occurred while retrieving ${url}: ${message}`);
        // Without this, logs and stack traces show the generic "Error" name.
        this.name = "CompteRenduError";
        this.url = url;
    }
}
|
|
35
|
+
/**
 * Fetches the global CRI archive and stores it at `zipPath`.
 *
 * A 404 response is only reported with a warning and nothing is written;
 * any other non-OK status raises a CompteRenduError. Progress messages are
 * printed unless the `silent` command-line option is set.
 *
 * @param {string} zipPath - Destination path of the downloaded archive.
 * @returns {Promise<void>}
 */
async function downloadCriZip(zipPath) {
    const chatty = !options["silent"];
    if (chatty) {
        console.log(`Downloading CRI zip ${CRI_ZIP_URL}…`);
    }
    const response = await fetchWithRetry(CRI_ZIP_URL);
    if (!response.ok && response.status === 404) {
        console.warn(`CRI zip ${CRI_ZIP_URL} not found`);
        return;
    }
    if (!response.ok) {
        throw new CompteRenduError(String(response.status), CRI_ZIP_URL);
    }
    const archive = Buffer.from(await response.arrayBuffer());
    await fs.writeFile(zipPath, archive);
    if (chatty) {
        const sizeInMb = (archive.length / (1024 * 1024)).toFixed(1);
        console.log(`[CRI] Downloaded ${sizeInMb} MB → ${zipPath}`);
    }
}
|
|
53
|
+
/**
 * Extracts every `dYYYYMMDD.xml` entry of the archive into
 * `<originalRoot>/<session>/`, where the session is derived from the date
 * embedded in the file name.
 *
 * Entries whose name does not match, or whose date cannot be parsed, are skipped.
 *
 * @param {string} zipPath - Path of the CRI archive.
 * @param {string} originalRoot - Directory receiving one sub-folder per session.
 * @returns {Promise<number>} Number of XML files extracted.
 */
async function extractAndDistributeXmlBySession(zipPath, originalRoot) {
    const zip = new StreamZip.async({ file: zipPath });
    let count = 0;
    try {
        const entries = await zip.entries();
        for (const entryName of Object.keys(entries)) {
            if (!entryName.toLowerCase().endsWith(".xml")) {
                continue;
            }
            // ex: d20231005.xml
            const base = path.basename(entryName);
            const m = base.match(/^d(\d{8})\.xml$/i);
            if (!m) {
                continue;
            }
            const dt = parseYYYYMMDD(m[1]);
            if (!dt) {
                continue;
            }
            const session = sessionStartYearFromDate(dt);
            const destDir = path.join(originalRoot, String(session));
            await fs.ensureDir(destDir);
            await zip.extract(entryName, path.join(destDir, base));
            count++;
        }
    }
    finally {
        // Always release the archive handle, even when an extraction fails.
        await zip.close();
    }
    return count;
}
|
|
79
|
+
/**
 * Downloads the global CRI archive, distributes its XML files into one folder
 * per session and, when `options.parseDebats` is set, splits each day's XML
 * into one JSON compte rendu per agenda event and links it to its reunion.
 *
 * Options read here:
 * - `keepDir`: keep the existing original/transformed folders instead of clearing them;
 * - `fromSession`: passed to getSessionsFromStart() to select the sessions to process;
 * - `parseDebats`: when falsy, the function stops after the extraction step;
 * - `only-recent`: number of days; older days that already have transformed
 *   files are only re-linked to the agenda, not re-parsed.
 *
 * @param {string} dataDir - Root data directory.
 * @param {object} [options] - See above.
 * @returns {Promise<void>}
 */
export async function retrieveCriXmlDump(dataDir, options = {}) {
    const root = path.join(dataDir, COMPTES_RENDUS_FOLDER);
    ensureDirSync(root);
    const originalRoot = path.join(root, DATA_ORIGINAL_FOLDER);
    const transformedRoot = path.join(root, DATA_TRANSFORMED_FOLDER);
    for (const dir of [originalRoot, transformedRoot]) {
        if (options["keepDir"]) {
            fs.ensureDirSync(dir);
        }
        else {
            ensureAndClearDir(dir);
        }
    }
    const sessions = getSessionsFromStart(options["fromSession"]);
    // 1) Download the global ZIP and distribute its XML files by session
    const zipPath = path.join(dataDir, "cri.zip");
    console.log("[CRI] Downloading global CRI zip…");
    await downloadCriZip(zipPath);
    if (await fs.pathExists(zipPath)) {
        console.log("[CRI] Extracting + distributing XMLs by session…");
        // Remove the XML files of the selected sessions before re-extracting them.
        for (const session of sessions) {
            const dir = path.join(originalRoot, String(session));
            if (await fs.pathExists(dir)) {
                for (const f of await fs.readdir(dir)) {
                    if (/\.xml$/i.test(f)) {
                        await fs.remove(path.join(dir, f));
                    }
                }
            }
        }
        const n = await extractAndDistributeXmlBySession(zipPath, originalRoot);
        if (n === 0) {
            console.warn("[CRI] No XML extracted. Archive empty or layout changed?");
        }
        else {
            console.log(`[CRI] Distributed ${n} XML file(s) into session folders.`);
        }
    }
    else {
        // downloadCriZip() only warns on a 404 and writes nothing: opening the
        // missing archive would crash, so keep whatever XML is already on disk.
        console.warn(`[CRI] Archive ${zipPath} is not available → extraction skipped.`);
    }
    if (!options["parseDebats"]) {
        console.log("[CRI] parseDebats not requested → done.");
        return;
    }
    // 2) Parse, split and link each day's XML
    const onlyRecentDays = options["only-recent"];
    const cutoff = onlyRecentDays ? Date.now() - onlyRecentDays * 24 * 3600 * 1000 : null;
    for (const session of sessions) {
        const originalSessionDir = path.join(originalRoot, String(session));
        if (!(await fs.pathExists(originalSessionDir))) {
            continue;
        }
        const xmlFiles = (await fs.readdir(originalSessionDir)).filter((f) => /^d\d{8}\.xml$/i.test(f)).sort();
        const transformedSessionDir = path.join(transformedRoot, String(session));
        await fs.ensureDir(transformedSessionDir);
        for (const f of xmlFiles) {
            const yyyymmdd = f.slice(1, 9);
            if (cutoff !== null) {
                const seanceTs = Date.parse(`${yyyymmdd.slice(0, 4)}-${yyyymmdd.slice(4, 6)}-${yyyymmdd.slice(6, 8)}`);
                if (seanceTs < cutoff) {
                    // Old day: when transformed files already exist, only refresh the agenda links.
                    const found = await relinkExistingDayComptesRendus(dataDir, transformedSessionDir, yyyymmdd, session);
                    if (found > 0) {
                        continue;
                    }
                }
            }
            await splitAndLinkSeanceXml(dataDir, path.join(originalSessionDir, f), yyyymmdd, session, transformedSessionDir);
        }
    }
}
/**
 * Re-links the comptes rendus already transformed for one day to their agenda reunions.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string} transformedSessionDir - Folder holding the `CRSSN<date>E<event>.json` files.
 * @param {string} yyyymmdd - Day of the séance.
 * @param {number|string} session - Session the day belongs to.
 * @returns {Promise<number>} Number of transformed files found for that day.
 */
async function relinkExistingDayComptesRendus(dataDir, transformedSessionDir, yyyymmdd, session) {
    const files = await fs.readdir(transformedSessionDir);
    const dayFiles = files.filter((fn) => fn.startsWith(`CRSSN${yyyymmdd}E`) && fn.endsWith(".json"));
    for (const fn of dayFiles) {
        const eventId = fn.match(/^CRSSN(\d{8})E(.+)\.json$/)?.[2];
        if (!eventId) {
            continue;
        }
        try {
            const cr = await fs.readJSON(path.join(transformedSessionDir, fn));
            await linkCriEventIntoAgenda(dataDir, yyyymmdd, eventId, cr.uid, cr, session);
        }
        catch (e) {
            console.warn(`[CR] [${session}] Could not relink existing CR into a reunion for ${yyyymmdd} event=${eventId}:`, e);
        }
    }
    return dayFiles.length;
}
/**
 * Splits one day's CRI XML into one compte rendu per agenda event, writes each
 * as `CRSSN<date>E<event>.json` and links it to the matching reunion.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string} xmlPath - Path of the day's XML file.
 * @param {string} yyyymmdd - Day of the séance.
 * @param {number|string} session - Session the day belongs to.
 * @param {string} transformedSessionDir - Existing folder receiving the JSON files.
 * @returns {Promise<void>}
 */
async function splitAndLinkSeanceXml(dataDir, xmlPath, yyyymmdd, session, transformedSessionDir) {
    // === Load the day's "séance publique" events from the grouped agendas ===
    const dayEvents = await loadAgendaSpEventsForDate(dataDir, yyyymmdd, session);
    if (dayEvents.length === 0) {
        console.warn(`[CRI] [${session}] No agenda SP events found for ${yyyymmdd} → skip split/link`);
        return;
    }
    // === Read the XML and index its elements by document order ===
    let $;
    let order;
    let idx;
    try {
        const raw = await fs.readFile(xmlPath, "utf8");
        $ = cheerio.load(raw, { xml: false });
        order = $("body *").toArray();
        idx = new Map(order.map((el, i) => [el, i]));
    }
    catch (e) {
        console.warn(`[CRI] [${session}] Cannot read/parse ${path.basename(xmlPath)}:`, e);
        return;
    }
    // === Extract the sommaire and match its blocks to the agenda events ===
    const blocks = extractSommaireBlocks($, idx);
    const intervals = buildIntervalsByAgendaEvents($, idx, order, blocks, dayEvents);
    if (!intervals.length) {
        console.warn(`[CRI] [${session}] No confident split intervals for ${yyyymmdd} → skip`);
        return;
    }
    // === Parse, write and link each per-event segment ===
    for (const iv of intervals) {
        const outPath = path.join(transformedSessionDir, `CRSSN${yyyymmdd}E${iv.agendaEventId}.json`);
        const cr = await parseCompteRenduIntervalFromFile(xmlPath, iv.startIndex, iv.endIndex, iv.agendaEventId);
        if (!cr) {
            console.warn(`[CRI] [${session}] Empty or no points for ${yyyymmdd} event=${iv.agendaEventId} → skip`);
            continue;
        }
        await fs.writeJSON(outPath, cr, { spaces: 2 });
        try {
            await linkCriEventIntoAgenda(dataDir, yyyymmdd, iv.agendaEventId, cr.uid, cr, session);
        }
        catch (e) {
            console.warn(`[CR] [${session}] Could not link CR into agenda for ${yyyymmdd} event=${iv.agendaEventId}:`, e);
        }
    }
}
|
|
208
|
+
/**
 * Stores the UID of a compte rendu on the reunion JSON of a séance publique
 * ("SP") agenda event, as `compteRenduRefUid`.
 *
 * When the reunion file is missing, unreadable or holds a falsy value, a
 * warning is printed and nothing is written.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string} yyyymmdd - Day of the séance.
 * @param {string} agendaEventId - Agenda event identifier used to build the reunion UID.
 * @param {string} crUid - UID of the compte rendu to reference.
 * @param {object} cr - Compte rendu (not read by this function).
 * @param {number|string} session - Session whose agenda folder is used.
 * @returns {Promise<void>}
 */
async function linkCriEventIntoAgenda(dataDir, yyyymmdd, agendaEventId, crUid, cr, session) {
    const sessionAgendaDir = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, session.toString());
    fs.ensureDirSync(sessionAgendaDir);
    const year = yyyymmdd.slice(0, 4);
    const month = yyyymmdd.slice(4, 6);
    const day = yyyymmdd.slice(6, 8);
    const reunionUid = makeReunionUid(`${year}-${month}-${day}`, "SP", agendaEventId, null);
    const reunionPath = path.join(sessionAgendaDir, `${reunionUid}.json`);
    let reunion = null;
    const reunionExists = await fs.pathExists(reunionPath);
    if (reunionExists) {
        try {
            reunion = await fs.readJSON(reunionPath);
        }
        catch (e) {
            console.warn(`[CR] unreadable reunion JSON → ${reunionPath} (${e})`);
            reunion = null;
        }
    }
    if (!reunion) {
        console.warn(`[CR] Missing reunion file for SP event=${agendaEventId}: ${reunionPath}`);
        return;
    }
    reunion.compteRenduRefUid = crUid;
    await fs.writeJSON(reunionPath, reunion, { spaces: 2 });
    console.log(`[CR] Linked CR ${crUid} → ${path.basename(reunionPath)} (event=${agendaEventId})`);
}
|
|
233
|
+
/**
 * Turns sommaire blocks into contiguous element-index intervals, one per
 * agenda event that could be matched with enough confidence.
 *
 * For every non-noise block the best and second-best event scores are
 * computed; the block becomes a pivot only if the best score reaches
 * MIN_SCORE and beats the runner-up by at least MIN_GAP. The first pivot (in
 * document order) is kept per event, and each interval runs up to the element
 * preceding the next pivot, or to the last element of `order`.
 *
 * @param {Function} $ - Loaded cheerio document.
 * @param {Map<object, number>} idx - Element → position in `order`.
 * @param {Array<object>} order - Elements in document order.
 * @param {Array<{text: string, startIndex: number, targetId?: string}>} blocks - Sommaire blocks.
 * @param {Array<{id: string}>} dayEvents - Agenda events of the day.
 * @returns {Array<{agendaEventId: string, startIndex: number, endIndex: number, score: number}>}
 */
function buildIntervalsByAgendaEvents($, idx, order, blocks, dayEvents) {
    const MIN_SCORE = 0.65;
    const MIN_GAP = 0.08;
    const firstSpeakerEl = $("div.intervenant").first()[0];
    const firstSpeakerIdx = firstSpeakerEl ? (idx.get(firstSpeakerEl) ?? null) : null;
    const pivots = [];
    for (const block of blocks) {
        if (isNoiseBlock(block.text)) {
            continue;
        }
        // Track the best-scoring event and the runner-up score for this block.
        let top = null;
        let runnerUp = 0;
        for (const ev of dayEvents) {
            const score = scoreSommaireBlockForEvent(block.text, ev);
            if (!top || score > top.score) {
                runnerUp = top?.score ?? runnerUp;
                top = { ev, score };
            }
            else if (score > runnerUp) {
                runnerUp = score;
            }
        }
        if (!top) {
            continue;
        }
        const resolved = resolveTargetIndex($, idx, block.targetId);
        const contentStartIndex = resolved ?? block.startIndex;
        // Unresolved blocks located before the first speaker are discarded.
        const beforeFirstSpeaker = firstSpeakerIdx != null && contentStartIndex < firstSpeakerIdx;
        if (beforeFirstSpeaker && resolved == null) {
            continue;
        }
        if (top.score < MIN_SCORE) {
            continue;
        }
        if (top.score - runnerUp < MIN_GAP) {
            continue;
        }
        pivots.push({
            agendaEventId: top.ev.id,
            startIndex: contentStartIndex,
            score: top.score,
        });
    }
    if (pivots.length === 0) {
        return [];
    }
    // Deduplicate per event, keeping the earliest start index.
    pivots.sort((a, b) => a.startIndex - b.startIndex);
    const firstPivotByEvent = new Map();
    for (const pivot of pivots) {
        if (firstPivotByEvent.has(pivot.agendaEventId)) {
            continue;
        }
        firstPivotByEvent.set(pivot.agendaEventId, {
            startIndex: pivot.startIndex,
            score: pivot.score,
        });
    }
    const sorted = Array.from(firstPivotByEvent.entries())
        .map(([agendaEventId, v]) => ({
            agendaEventId,
            startIndex: v.startIndex,
            score: v.score,
        }))
        .sort((a, b) => a.startIndex - b.startIndex);
    // Each interval stops right before the next pivot (or at the last element).
    return sorted.map((cur, i) => {
        const next = sorted[i + 1];
        return {
            agendaEventId: cur.agendaEventId,
            startIndex: cur.startIndex,
            endIndex: next ? next.startIndex - 1 : order.length - 1,
            score: cur.score,
        };
    });
}
|
|
305
|
+
/**
 * Loads the "Séance publique" events of one day from the transformed agenda
 * files of a session.
 *
 * Only files named `RUSN<yyyymmdd>IDS….json` are read, and only the first
 * event of each file is considered. Unreadable files are reported with a
 * warning and skipped.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string} yyyymmdd - Day to load.
 * @param {number|string} session - Session whose agenda folder is scanned.
 * @returns {Promise<Array<object>>} Matching events (empty when the folder does not exist).
 */
async function loadAgendaSpEventsForDate(dataDir, yyyymmdd, session) {
    const agendasDir = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, session.toString());
    if (!(await fs.pathExists(agendasDir))) {
        return [];
    }
    const prefix = `RUSN${yyyymmdd}IDS`;
    const files = (await fs.readdir(agendasDir)).filter((fn) => fn.startsWith(prefix) && fn.endsWith(".json"));
    const events = [];
    for (const fn of files) {
        try {
            const g = await fs.readJSON(path.join(agendasDir, fn));
            const e = g?.events?.[0];
            if (e && e.type === "Séance publique") {
                events.push(e);
            }
        }
        catch (err) {
            // Best effort: keep going, but do not hide a corrupted agenda file.
            console.warn(`[CRI] [${session}] Cannot read agenda file ${fn}:`, err);
        }
    }
    return events;
}
|
|
322
|
+
/**
 * Escapes a value for use inside a double-quoted CSS attribute selector:
 * every backslash and every double quote is prefixed with a backslash.
 *
 * @param {string} s - Raw value.
 * @returns {string} Escaped value.
 */
function cssEscapeIdent(s) {
    return s.replace(/[\\"]/g, "\\$&");
}
|
|
325
|
+
/**
 * Finds the position, in document order, of the element designated by an
 * anchor: the first element whose `id` equals `targetId`, otherwise the first
 * one whose `name` does.
 *
 * @param {Function} $ - Loaded cheerio document.
 * @param {Map<object, number>} idx - Element → position map.
 * @param {string|null|undefined} targetId - Anchor to resolve.
 * @returns {number|null} Position of the element, or null when the anchor is
 *   empty, matches nothing, or matches an element absent from `idx`.
 */
function resolveTargetIndex($, idx, targetId) {
    if (!targetId) {
        return null;
    }
    const quoted = cssEscapeIdent(targetId);
    let target = $(`[id="${quoted}"]`)[0];
    if (!target) {
        // Fall back to a `name` anchor only when no element carries that id.
        target = $(`[name="${quoted}"]`)[0];
    }
    if (!target) {
        return null;
    }
    return idx.get(target) ?? null;
}
|
|
335
|
+
/**
 * Command-line entry point: requires the `dataDir` option, then runs the CRI
 * retrieval with the parsed command-line options and prints the elapsed time.
 */
async function main() {
    const { dataDir } = options;
    assert(dataDir, "Missing argument: data directory");
    const timerLabel = "CRI processing time";
    console.time(timerLabel);
    await retrieveCriXmlDump(dataDir, options);
    console.timeEnd(timerLabel);
}
// NOTE(review): this runs whenever the module is loaded, including when it is
// only imported for retrieveCriXmlDump — confirm that is intended.
main()
    .then(() => {
        process.exit(0);
    })
    .catch((error) => {
        console.error(error);
        process.exit(1);
    });
|
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
import { DocumentMetadata } from "../types/texte";
|
|
2
|
+
/**
 * Retrieves the files of one texte (exposé des motifs, then XML/HTML/PDF
 * depending on `options.formats`) under `originalTextesDir`, and parses the
 * XML into JSON under `transformedTextesDir` when `options.parseDocuments` is set.
 */
export declare function processTexte(texteMetadata: DocumentMetadata, originalTextesDir: string, transformedTextesDir: string, options: any): Promise<void>;
|
|
3
|
+
/**
 * Retrieves the files of one rapport (HTML/PDF depending on
 * `options.formats`) under `originalRapportsDir`.
 */
export declare function processRapport(rapportMetadata: any, originalRapportsDir: string, options: any): Promise<void>;
|
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
import assert from "assert";
|
|
2
|
+
import commandLineArgs from "command-line-args";
|
|
3
|
+
import fs from "fs-extra";
|
|
4
|
+
import { DateTime } from "luxon";
|
|
5
|
+
import path from "path";
|
|
6
|
+
import { DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER, iterLoadSenatRapportUrls, iterLoadSenatTexteUrls, RAPPORT_FOLDER, TEXTE_FOLDER, } from "../loaders";
|
|
7
|
+
import { parseExposeDesMotifs, parseTexte, parseTexteFromFile } from "../parsers/texte";
|
|
8
|
+
import { getSessionsFromStart, UNDEFINED_SESSION } from "../types/sessions";
|
|
9
|
+
import { commonOptions } from "./shared/cli_helpers";
|
|
10
|
+
import { ensureAndClearDir, fetchWithRetry, isOptionEmptyOrHasValue } from "./shared/util";
|
|
11
|
+
// Flags specific to this script; they are appended to the options shared by every script.
const formatsOption = {
    name: "formats",
    alias: "F",
    type: String,
    multiple: true,
    help: "formats of documents to retrieve (xml/html/pdf for textes, html/pdf for rapports); leave empty for all",
};
const typesOption = {
    name: "types",
    type: String,
    multiple: true,
    help: "types of documents to retrieve (textes/rapports); leave empty for all",
};
const forceOption = {
    name: "force",
    type: Boolean,
    help: "force retrieve all documents, even already retrieved ones",
};
const optionsDefinitions = [...commonOptions, formatsOption, typesOption, forceOption];
// Command line parsed once at module load.
const options = commandLineArgs(optionsDefinitions);
// Shared decoder used to turn downloaded buffers into text before parsing.
const textDecoder = new TextDecoder("utf8");
// Reference instant for the "only recent" checks, captured at module load.
const today = DateTime.now();
|
|
35
|
+
/**
 * Tells whether a document date is at most `daysThreshold` days before the
 * module-level `today`.
 *
 * @param {string|null|undefined} documentDate - ISO date of the document.
 * @param {number} daysThreshold - Maximum age in days.
 * @returns {boolean} False when the date is missing or not a valid ISO date.
 */
function isDocumentRecent(documentDate, daysThreshold) {
    if (!documentDate) {
        return false;
    }
    const parsed = DateTime.fromISO(documentDate);
    if (!parsed.isValid) {
        return false;
    }
    const ageInDays = today.diff(parsed, "days").days;
    return ageInDays <= daysThreshold;
}
|
|
41
|
+
/**
 * Decides whether a document has to be (re)downloaded.
 *
 * It is downloaded when `options.force` is set or when the file does not
 * exist yet; an existing file is downloaded again only when
 * `options.onlyRecent` is defined and the document is recent enough.
 *
 * @param {string} filePath - Destination of the document.
 * @param {string|null|undefined} docDate - ISO date of the document.
 * @param {{force?: boolean, onlyRecent?: number}} options - Download options.
 * @returns {boolean}
 */
function shouldDownload(filePath, docDate, options) {
    if (options.force || !fs.existsSync(filePath)) {
        return true;
    }
    return options.onlyRecent !== undefined && isDocumentRecent(docDate, options.onlyRecent);
}
|
|
51
|
+
/**
 * Downloads a document and returns its content.
 *
 * Every failure (non-OK status, network error, error while reading the body)
 * results in `null`; HTTP failures are only logged when `verbose` is set,
 * thrown errors are always logged.
 *
 * @param {string} documentUrl - URL of the document.
 * @param {boolean} [verbose] - Print progress and HTTP failures.
 * @returns {Promise<ArrayBuffer|null>} Document content, or null on failure.
 */
async function downloadDocument(documentUrl, verbose) {
    if (verbose) {
        console.log(`Downloading document ${documentUrl}…`);
    }
    try {
        const response = await fetchWithRetry(documentUrl);
        if (!response.ok) {
            if (verbose) {
                if (response.status === 404) {
                    console.warn(`Document ${documentUrl} not found`);
                }
                else {
                    console.error(`An error occurred while retrieving document ${documentUrl}: ${response.status}`);
                }
            }
            return null;
        }
        // `await` is required here: returning the bare promise would let a
        // body-read failure escape this try/catch and reject the caller.
        return await response.arrayBuffer();
    }
    catch (error) {
        console.error(error?.message ?? error);
        return null;
    }
}
|
|
77
|
+
/**
 * Downloads one document to `destPath` unless shouldDownload() says it can be skipped.
 *
 * @param {string} url - URL of the document.
 * @param {string} destPath - Where the document is written.
 * @param {string|null|undefined} docDate - ISO date of the document.
 * @param {object} options - Download options (`verbose` plus those read by shouldDownload()).
 * @returns {Promise<{success: boolean, skipped: boolean, buffer: Buffer|null}>}
 *   `skipped` when nothing had to be downloaded, `success: false` when the
 *   download failed, otherwise the downloaded content in `buffer`.
 */
async function processDocument(url, destPath, docDate, options) {
    const outcome = (success, skipped, buffer) => ({ success, skipped, buffer });
    const needsDownload = shouldDownload(destPath, docDate, options);
    if (!needsDownload) {
        if (options.verbose) {
            console.info(`Already downloaded ${destPath}…`);
        }
        return outcome(true, true, null);
    }
    const payload = await downloadDocument(url, options.verbose);
    if (!payload) {
        return outcome(false, false, null);
    }
    const buffer = Buffer.from(payload);
    await fs.outputFile(destPath, buffer);
    return outcome(true, false, buffer);
}
|
|
91
|
+
/**
 * Retrieves the files of one texte and optionally parses it.
 *
 * The exposé des motifs (when the metadata has a URL for it) and the
 * XML/HTML/PDF versions allowed by `options.formats` are stored under
 * `<originalTextesDir>/<session>/<name>/`. When `options.parseDocuments` is
 * set, the XML version is parsed, from the fresh download or from the file
 * already on disk, and written under `transformedTextesDir`.
 *
 * @param {object} texteMetadata - Metadata of the texte (`session`, `name`, `date`, `url_*`).
 * @param {string} originalTextesDir - Root folder of the downloaded textes.
 * @param {string} transformedTextesDir - Root folder of the parsed textes.
 * @param {object} options - `formats`, `parseDocuments`, `verbose`, plus the download options.
 * @returns {Promise<void>}
 */
export async function processTexte(texteMetadata, originalTextesDir, transformedTextesDir, options) {
    const texteDir = path.join(originalTextesDir, `${texteMetadata.session ?? UNDEFINED_SESSION}`, texteMetadata.name);
    let exposeDesMotifsContent = null;
    if (texteMetadata.url_expose_des_motifs) {
        const exposePath = path.join(texteDir, `${texteMetadata.name}-expose.html`);
        const res = await processDocument(texteMetadata.url_expose_des_motifs.toString(), exposePath, texteMetadata.date, options);
        if (res.buffer) {
            exposeDesMotifsContent = res.buffer;
        }
        else if (res.skipped && options.parseDocuments) {
            // Not downloaded again: reuse the copy already on disk for parsing.
            if (await fs.pathExists(exposePath)) {
                exposeDesMotifsContent = await fs.readFile(exposePath);
            }
        }
    }
    const formats = [
        { type: "xml", url: texteMetadata.url_xml, isParseTarget: true },
        { type: "html", url: texteMetadata.url_html, isParseTarget: false },
        { type: "pdf", url: texteMetadata.url_pdf, isParseTarget: false },
    ];
    for (const format of formats) {
        if (!isOptionEmptyOrHasValue(options.formats, format.type)) {
            continue;
        }
        if (!format.url) {
            continue;
        }
        const destPath = path.join(texteDir, `${texteMetadata.name}.${format.type}`);
        const result = await processDocument(format.url.toString(), destPath, texteMetadata.date, options);
        // Parsing only applies to the XML version.
        if (!format.isParseTarget || !options.parseDocuments) {
            continue;
        }
        // Without a fresh buffer the parser reads destPath: skip when the
        // download failed and no earlier copy exists, as done for the exposé.
        if (!result.buffer && !(await fs.pathExists(destPath))) {
            if (options.verbose) {
                console.warn(`Cannot parse texte ${texteMetadata.name}: ${destPath} is not available`);
            }
            continue;
        }
        await parseDocument(texteMetadata.session, transformedTextesDir, destPath, texteMetadata.name, result.buffer, exposeDesMotifsContent, options);
    }
}
|
|
124
|
+
/**
 * Retrieves the HTML/PDF versions of one rapport allowed by `options.formats`
 * and stores them under `<originalRapportsDir>/<session>/<name>/`.
 *
 * Versions for which the metadata has no URL are skipped.
 *
 * @param {object} rapportMetadata - Metadata of the rapport (`session`, `name`, `date`, `url_html`, `url_pdf`).
 * @param {string} originalRapportsDir - Root folder of the downloaded rapports.
 * @param {object} options - `formats` plus the download options.
 * @returns {Promise<void>}
 */
export async function processRapport(rapportMetadata, originalRapportsDir, options) {
    const rapportDir = path.join(originalRapportsDir, `${rapportMetadata.session ?? UNDEFINED_SESSION}`, rapportMetadata.name);
    const formats = [
        { type: "html", url: rapportMetadata.url_html },
        { type: "pdf", url: rapportMetadata.url_pdf },
    ];
    for (const format of formats) {
        if (!isOptionEmptyOrHasValue(options["formats"], format.type)) {
            continue;
        }
        // Same guard as processTexte(): a missing URL must not abort the whole run.
        if (!format.url) {
            continue;
        }
        const destPath = path.join(rapportDir, `${rapportMetadata.name}.${format.type}`);
        await processDocument(format.url.toString(), destPath, rapportMetadata.date, options);
    }
}
|
|
137
|
+
/**
 * Retrieves (and optionally parses) every texte listed for the given sessions.
 *
 * When the `parseDocuments` command-line option is set, the transformed
 * folder is cleared before processing.
 *
 * @param {string} dataDir - Root data directory.
 * @param {Iterable<number|string>} sessions - Sessions to process.
 * @returns {Promise<void>}
 */
async function retrieveTextes(dataDir, sessions) {
    const originalTextesDir = path.join(dataDir, TEXTE_FOLDER, DATA_ORIGINAL_FOLDER);
    const transformedTextesDir = path.join(dataDir, TEXTE_FOLDER, DATA_TRANSFORMED_FOLDER);
    const { force, silent, verbose, formats, parseDocuments } = options;
    if (parseDocuments) {
        ensureAndClearDir(transformedTextesDir);
    }
    // Subset of the command-line options handed to processTexte().
    const dlOptions = {
        force,
        silent,
        verbose,
        onlyRecent: options["only-recent"],
        formats,
        parseDocuments,
    };
    for (const session of sessions) {
        const texteUrls = iterLoadSenatTexteUrls(dataDir, session);
        for (const { item } of texteUrls) {
            await processTexte(item, originalTextesDir, transformedTextesDir, dlOptions);
        }
    }
}
|
|
157
|
+
/**
 * Retrieves every rapport listed for the given sessions.
 *
 * @param {string} dataDir - Root data directory.
 * @param {Iterable<number|string>} sessions - Sessions to process.
 * @returns {Promise<void>}
 */
async function retrieveRapports(dataDir, sessions) {
    const originalRapportsDir = path.join(dataDir, RAPPORT_FOLDER, DATA_ORIGINAL_FOLDER);
    const { force, silent, verbose, formats } = options;
    // Subset of the command-line options handed to processRapport().
    const dlOptions = {
        force,
        silent,
        verbose,
        onlyRecent: options["only-recent"],
        formats,
    };
    for (const session of sessions) {
        const rapportUrls = iterLoadSenatRapportUrls(dataDir, session);
        for (const { item } of rapportUrls) {
            await processRapport(item, originalRapportsDir, dlOptions);
        }
    }
}
|
|
172
|
+
/**
 * Parses a texte, from `texteBuffer` when provided or from `textePath`
 * otherwise, attaches the parsed exposé des motifs when one is given, and
 * writes the result to `<transformedTextesDir>/<session>/<texteName>/<texteName>.json`.
 *
 * @param {number|string|null|undefined} session - Session of the texte.
 * @param {string} transformedTextesDir - Root folder of the parsed textes.
 * @param {string} textePath - Path of the XML file, used when no buffer is given.
 * @param {string} texteName - Name of the texte.
 * @param {Buffer|null} texteBuffer - Freshly downloaded XML, if any.
 * @param {Buffer|null} [exposeDesMotifs] - HTML of the exposé des motifs, if any.
 * @param {{verbose?: boolean}} [options] - Logging options.
 * @returns {Promise<object|null>} The parsed texte, or null when parsing produced nothing.
 */
async function parseDocument(session, transformedTextesDir, textePath, texteName, texteBuffer, exposeDesMotifs = null, options = {}) {
    const { verbose } = options;
    if (verbose) {
        console.log(`Parsing texte ${textePath}…`);
    }
    const parsedTexte = texteBuffer
        ? parseTexte(textDecoder.decode(texteBuffer))
        : await parseTexteFromFile(textePath);
    if (!parsedTexte) {
        return null;
    }
    if (exposeDesMotifs) {
        if (verbose) {
            console.log("Parsing exposé des motifs…");
        }
        parsedTexte.exposeDesMotifs = parseExposeDesMotifs(textDecoder.decode(exposeDesMotifs));
    }
    const transformedTexteDir = path.join(transformedTextesDir, `${session ?? UNDEFINED_SESSION}`, texteName);
    const jsonPath = path.join(transformedTexteDir, `${texteName}.json`);
    await fs.outputJSON(jsonPath, parsedTexte, { spaces: 2 });
    return parsedTexte;
}
|
|
197
|
+
/**
 * Command-line entry point: requires the `dataDir` option, then retrieves the
 * textes and/or rapports selected by the `types` option for the sessions
 * starting at `fromSession`. The elapsed time is printed unless `silent` is set.
 */
async function main() {
    const dataDir = options["dataDir"];
    assert(dataDir, "Missing argument: data directory");
    const sessions = getSessionsFromStart(options["fromSession"]);
    console.time("documents processing time");
    if (isOptionEmptyOrHasValue(options["types"], "textes")) {
        await retrieveTextes(dataDir, sessions);
    }
    if (isOptionEmptyOrHasValue(options["types"], "rapports")) {
        await retrieveRapports(dataDir, sessions);
    }
    if (!options["silent"]) {
        console.timeEnd("documents processing time");
    }
}
// Run only when this file is the executed script, not when it is imported for
// processTexte()/processRapport(). Both the TypeScript source and the compiled
// JavaScript file names are accepted, and a missing argv[1] is tolerated.
if (/retrieve_documents\.(ts|js)$/.test(process.argv[1] ?? "")) {
    main()
        .then(() => process.exit(0))
        .catch((error) => {
            console.error(error);
            process.exit(1);
        });
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|