@tricoteuses/senat 2.20.17 → 2.20.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +58 -19
- package/lib/git.d.ts +26 -0
- package/lib/git.js +167 -0
- package/lib/index.d.ts +1 -1
- package/lib/loaders.d.ts +3 -2
- package/lib/model/commission.d.ts +2 -2
- package/lib/model/commission.js +5 -4
- package/lib/model/seance.d.ts +2 -8
- package/lib/model/seance.js +28 -113
- package/lib/model/util.d.ts +0 -4
- package/lib/model/util.js +0 -38
- package/lib/scripts/convert_data.js +25 -1
- package/lib/scripts/retrieve_agenda.js +7 -18
- package/lib/scripts/retrieve_cr_commission.js +1 -10
- package/lib/scripts/retrieve_cr_seance.d.ts +1 -1
- package/lib/scripts/retrieve_cr_seance.js +183 -127
- package/lib/scripts/retrieve_videos.d.ts +1 -1
- package/lib/scripts/retrieve_videos.js +46 -92
- package/lib/scripts/shared/cli_helpers.d.ts +25 -3
- package/lib/scripts/shared/cli_helpers.js +28 -0
- package/lib/types/agenda.d.ts +5 -6
- package/lib/utils/cr_spliting.d.ts +2 -10
- package/lib/utils/cr_spliting.js +2 -119
- package/lib/utils/date.d.ts +10 -0
- package/lib/utils/date.js +100 -0
- package/lib/utils/reunion_odj_building.d.ts +2 -2
- package/lib/utils/reunion_odj_building.js +8 -12
- package/lib/utils/reunion_parsing.d.ts +23 -0
- package/lib/utils/reunion_parsing.js +209 -0
- package/lib/utils/scoring.d.ts +14 -0
- package/lib/utils/scoring.js +147 -0
- package/lib/utils/string_cleaning.d.ts +7 -0
- package/lib/utils/string_cleaning.js +57 -0
- package/package.json +1 -1
|
@@ -3,6 +3,7 @@ import commandLineArgs from "command-line-args";
|
|
|
3
3
|
import fs from "fs-extra";
|
|
4
4
|
import path from "path";
|
|
5
5
|
import pLimit from "p-limit";
|
|
6
|
+
import * as git from "../git";
|
|
6
7
|
import { datasets, EnabledDatasets, getEnabledDatasets } from "../datasets";
|
|
7
8
|
import { DATA_ORIGINAL_FOLDER, DOCUMENT_METADATA_FILE, DOSLEG_DOSSIERS_FOLDER, SCRUTINS_FOLDER, RAPPORT_FOLDER, SENS_CIRCONSCRIPTIONS_FOLDER, SENS_ORGANISMES_FOLDER, SENS_SENATEURS_FOLDER, TEXTE_FOLDER, } from "../loaders";
|
|
8
9
|
import { findAllAmendements, findAllCirconscriptions, findAllDebats, findAllDossiers, findAllScrutins, findAllOrganismes, findAllQuestions, findAllSens, findSenatRapportUrls, findSenatTexteUrls, } from "../model";
|
|
@@ -17,14 +18,26 @@ const SENAT_TEXTE_XML_BASE_URL = "https://www.senat.fr/akomantoso/";
|
|
|
17
18
|
const SENAT_TEXTE_BASE_URL = "https://www.senat.fr/leg/";
|
|
18
19
|
const SENAT_EXPOSE_DES_MOTIFS_BASE_URL = "https://www.senat.fr/leg/exposes-des-motifs/";
|
|
19
20
|
const SENAT_RAPPORT_BASE_URL = "https://www.senat.fr/rap/";
|
|
21
|
+
/**
 * Optionally commits and pushes a dataset directory, then folds the git
 * result into the running exit code.
 *
 * @param {string} datasetDir - Directory passed to `git.commitAndPush`.
 * @param {object} options - CLI options; `options.commit` enables the commit
 *   and `options.remote` is forwarded to `git.commitAndPush`.
 * @param {number} exitCode - Exit code accumulated so far.
 * @returns {number} The exit code to carry forward.
 */
function commitGit(datasetDir, options, exitCode) {
    if (!options.commit) {
        return exitCode;
    }
    const gitCode = git.commitAndPush(datasetDir, "Nouvelle moisson", options.remote);
    // NOTE(review): 10 presumably means "nothing to commit" — confirm against ../git.
    // A running code of 10 is replaced by any git result other than 10.
    const replacesTen = exitCode === 10 && gitCode !== 10;
    // A running code of 0 is replaced only by git results other than 0 and 10.
    const replacesZero = exitCode === 0 && gitCode !== 0 && gitCode !== 10;
    return replacesTen || replacesZero ? gitCode : exitCode;
}
|
|
20
30
|
async function convertData() {
|
|
21
31
|
const dataDir = options["dataDir"];
|
|
22
32
|
assert(dataDir, "Missing argument: data directory");
|
|
23
33
|
const enabledDatasets = getEnabledDatasets(options["categories"]);
|
|
24
34
|
console.time("data transformation time");
|
|
35
|
+
let exitCode = 0;
|
|
25
36
|
if (enabledDatasets & EnabledDatasets.Ameli) {
|
|
26
37
|
try {
|
|
27
38
|
await convertDatasetAmeli(dataDir, options);
|
|
39
|
+
const ameliDir = path.join(dataDir, datasets.ameli.database);
|
|
40
|
+
exitCode = commitGit(ameliDir, options, exitCode);
|
|
28
41
|
}
|
|
29
42
|
catch (error) {
|
|
30
43
|
console.error(`Error converting Ameli dataset:`, error);
|
|
@@ -33,6 +46,8 @@ async function convertData() {
|
|
|
33
46
|
if (enabledDatasets & EnabledDatasets.Debats) {
|
|
34
47
|
try {
|
|
35
48
|
await convertDatasetDebats(dataDir, options);
|
|
49
|
+
const debatsDir = path.join(dataDir, datasets.debats.database);
|
|
50
|
+
exitCode = commitGit(debatsDir, options, exitCode);
|
|
36
51
|
}
|
|
37
52
|
catch (error) {
|
|
38
53
|
console.error(`Error converting Debats dataset:`, error);
|
|
@@ -41,12 +56,16 @@ async function convertData() {
|
|
|
41
56
|
if (enabledDatasets & EnabledDatasets.DosLeg) {
|
|
42
57
|
try {
|
|
43
58
|
await convertDatasetDosLeg(dataDir, options);
|
|
59
|
+
const doslegDir = path.join(dataDir, datasets.dosleg.database);
|
|
60
|
+
exitCode = commitGit(doslegDir, options, exitCode);
|
|
44
61
|
}
|
|
45
62
|
catch (error) {
|
|
46
63
|
console.error(`Error converting DosLeg dataset:`, error);
|
|
47
64
|
}
|
|
48
65
|
try {
|
|
49
66
|
await convertDatasetScrutins(dataDir, options);
|
|
67
|
+
const scrutinsDir = path.join(dataDir, SCRUTINS_FOLDER);
|
|
68
|
+
exitCode = commitGit(scrutinsDir, options, exitCode);
|
|
50
69
|
}
|
|
51
70
|
catch (error) {
|
|
52
71
|
console.error(`Error converting Scrutins dataset:`, error);
|
|
@@ -55,6 +74,8 @@ async function convertData() {
|
|
|
55
74
|
if (enabledDatasets & EnabledDatasets.Questions) {
|
|
56
75
|
try {
|
|
57
76
|
await convertDatasetQuestions(dataDir);
|
|
77
|
+
const questionsDir = path.join(dataDir, datasets.questions.database);
|
|
78
|
+
exitCode = commitGit(questionsDir, options, exitCode);
|
|
58
79
|
}
|
|
59
80
|
catch (error) {
|
|
60
81
|
console.error(`Error converting Questions dataset:`, error);
|
|
@@ -63,6 +84,8 @@ async function convertData() {
|
|
|
63
84
|
if (enabledDatasets & EnabledDatasets.Sens) {
|
|
64
85
|
try {
|
|
65
86
|
await convertDatasetSens(dataDir);
|
|
87
|
+
const sensDir = path.join(dataDir, datasets.sens.database);
|
|
88
|
+
exitCode = commitGit(sensDir, options, exitCode);
|
|
66
89
|
}
|
|
67
90
|
catch (error) {
|
|
68
91
|
console.error(`Error converting Sens dataset:`, error);
|
|
@@ -71,6 +94,7 @@ async function convertData() {
|
|
|
71
94
|
if (!options["silent"]) {
|
|
72
95
|
console.timeEnd("data transformation time");
|
|
73
96
|
}
|
|
97
|
+
return exitCode;
|
|
74
98
|
}
|
|
75
99
|
async function convertDatasetAmeli(dataDir, options) {
|
|
76
100
|
const dataset = datasets.ameli;
|
|
@@ -284,7 +308,7 @@ async function convertDatasetSens(dataDir) {
|
|
|
284
308
|
}
|
|
285
309
|
}
|
|
286
310
|
convertData()
|
|
287
|
-
.then(() => process.exit(0))
|
|
311
|
+
.then((exitCode) => process.exit(exitCode || 0))
|
|
288
312
|
.catch((error) => {
|
|
289
313
|
console.log(error);
|
|
290
314
|
process.exit(1);
|
|
@@ -9,7 +9,7 @@ import { getSessionsFromStart } from "../types/sessions";
|
|
|
9
9
|
import { ID_DATE_FORMAT } from "./datautil";
|
|
10
10
|
import { commonOptions } from "./shared/cli_helpers";
|
|
11
11
|
import { fetchWithRetry } from "./shared/util";
|
|
12
|
-
import {
|
|
12
|
+
import { buildReunionsByBucket } from "../utils/reunion_parsing";
|
|
13
13
|
import { buildSenatDossierIndex } from "../utils/reunion_odj_building";
|
|
14
14
|
const optionsDefinitions = [
|
|
15
15
|
...commonOptions,
|
|
@@ -103,25 +103,14 @@ async function parseAgenda(transformedAgendaSessionDir, agendaFileName, agendaPa
|
|
|
103
103
|
return;
|
|
104
104
|
const flatPath = path.join(transformedAgendaSessionDir, `${agendaFileName}.json`);
|
|
105
105
|
fs.writeJSONSync(flatPath, parsedAgendaEvents, { spaces: 2 });
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
// b) (reco) trier pour stabilité, comme pour les NON-SP
|
|
111
|
-
const PARIS = "Europe/Paris";
|
|
112
|
-
spGroups.sort((a, b) => {
|
|
113
|
-
const da = DateTime.fromISO(`${a.date}T${a.startTime || "00:00:00.000+02:00"}`, { zone: PARIS }).toMillis();
|
|
114
|
-
const db = DateTime.fromISO(`${b.date}T${b.startTime || "00:00:00.000+02:00"}`, { zone: PARIS }).toMillis();
|
|
115
|
-
// en cas d’égalité, ordre par slot pour stabilité
|
|
116
|
-
return da - db || (a.slot || "UNKNOWN").localeCompare(b.slot || "UNKNOWN");
|
|
117
|
-
});
|
|
118
|
-
if (spGroups.length > 0) {
|
|
119
|
-
writeGroupsAsFiles(transformedAgendaSessionDir, spGroups);
|
|
106
|
+
const byBucket = buildReunionsByBucket(parsedAgendaEvents, dossierBySenatUrl);
|
|
107
|
+
// SP
|
|
108
|
+
if (byBucket.IDS.length > 0) {
|
|
109
|
+
writeGroupsAsFiles(transformedAgendaSessionDir, byBucket.IDS);
|
|
120
110
|
}
|
|
121
|
-
//
|
|
122
|
-
const groupedBySuffix = groupNonSPByTypeOrganeHour(parsedAgendaEvents, dossierBySenatUrl);
|
|
111
|
+
// NON-SP
|
|
123
112
|
for (const suffix of ["IDC", "IDM", "IDO", "IDI"]) {
|
|
124
|
-
const groups =
|
|
113
|
+
const groups = byBucket[suffix];
|
|
125
114
|
if (groups.length > 0) {
|
|
126
115
|
writeGroupsAsFiles(transformedAgendaSessionDir, groups);
|
|
127
116
|
}
|
|
@@ -10,7 +10,7 @@ import { commonOptions } from "./shared/cli_helpers";
|
|
|
10
10
|
import { sessionStartYearFromDate } from "../model/seance";
|
|
11
11
|
import { getSessionsFromStart } from "../types/sessions";
|
|
12
12
|
import { ensureAndClearDir, fetchWithRetry } from "./shared/util";
|
|
13
|
-
import { jaccardTokenSim } from "../
|
|
13
|
+
import { jaccard, jaccardTokenSim } from "../utils/scoring";
|
|
14
14
|
class CommissionCRDownloadError extends Error {
|
|
15
15
|
constructor(message, url) {
|
|
16
16
|
super(`An error occurred while retrieving Commission CR ${url}: ${message}`);
|
|
@@ -138,15 +138,6 @@ function toTokens(s) {
|
|
|
138
138
|
.split(/\s+/)
|
|
139
139
|
.filter((t) => t.length >= 3 && !["commission", "des", "de", "du", "d", "la", "le", "les", "et"].includes(t)));
|
|
140
140
|
}
|
|
141
|
-
/**
 * Jaccard similarity of two sets: size of the intersection divided by the
 * size of the union.
 *
 * @param {Set<*>} a - First set.
 * @param {Set<*>} b - Second set.
 * @returns {number} 0 when either set has no size, otherwise a ratio in (0, 1].
 */
function jaccard(a, b) {
    if (!(a.size && b.size)) {
        return 0;
    }
    const shared = [...a].filter((token) => b.has(token)).length;
    return shared / (a.size + b.size - shared);
}
|
|
150
141
|
function reunionOrganeCandidates(h) {
|
|
151
142
|
const any = h;
|
|
152
143
|
const out = [any.organeSlug, any.organeKey, any.organe, h.titre].filter(Boolean);
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Needs to be
|
|
2
|
+
* Needs to be run after the retrieve_agenda.ts script!
|
|
3
3
|
* - downloads the ZIP of comptes-rendus des débats (CRI) from data.senat.fr
|
|
4
4
|
* - extracts XML files, distributes them by session/year
|
|
5
5
|
*/
|
|
@@ -11,11 +11,12 @@ import StreamZip from "node-stream-zip";
|
|
|
11
11
|
import * as cheerio from "cheerio";
|
|
12
12
|
import { AGENDA_FOLDER, COMPTES_RENDUS_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER } from "../loaders";
|
|
13
13
|
import { commonOptions } from "./shared/cli_helpers";
|
|
14
|
-
import {
|
|
15
|
-
import {
|
|
14
|
+
import { parseCompteRenduIntervalFromFile, sessionStartYearFromDate } from "../model/seance";
|
|
15
|
+
import { extractSommaireBlocks, makeReunionUid } from "../utils/reunion_parsing";
|
|
16
16
|
import { getSessionsFromStart } from "../types/sessions";
|
|
17
|
-
import { fetchWithRetry } from "./shared/util";
|
|
18
|
-
import {
|
|
17
|
+
import { ensureAndClearDir, fetchWithRetry } from "./shared/util";
|
|
18
|
+
import { isNoiseBlock, scoreSommaireBlockForEvent } from "../utils/scoring";
|
|
19
|
+
import { parseYYYYMMDD } from "../utils/date";
|
|
19
20
|
const optionsDefinitions = [
|
|
20
21
|
...commonOptions,
|
|
21
22
|
{
|
|
@@ -26,49 +27,11 @@ const optionsDefinitions = [
|
|
|
26
27
|
];
|
|
27
28
|
const options = commandLineArgs(optionsDefinitions);
|
|
28
29
|
const CRI_ZIP_URL = "https://data.senat.fr/data/debats/cri.zip";
|
|
29
|
-
const SLOT_ORDER = ["MATIN", "APRES-MIDI", "SOIR"];
|
|
30
30
|
/**
 * Error raised when retrieving a compte-rendu resource fails.
 *
 * The message keeps the original wording; `name` and `url` are exposed so
 * callers can identify the error type and the failing URL without parsing
 * the message.
 */
class CompteRenduError extends Error {
    /**
     * @param {string} message - Description of the underlying failure.
     * @param {string} url - URL that was being retrieved.
     */
    constructor(message, url) {
        super(`An error occurred while retrieving ${url}: ${message}`);
        this.name = "CompteRenduError";
        this.url = url;
    }
}
|
|
35
|
-
/**
 * Returns the earliest slot, following the order of `SLOT_ORDER`, that is
 * present in `slots`.
 *
 * @param {string[]} slots - Slots available for the day.
 * @returns {string|null} The first matching slot, or null when none matches.
 */
function pickFirstSlotOfDay(slots) {
    return SLOT_ORDER.find((slot) => slots.includes(slot)) ?? null;
}
|
|
41
|
-
/**
 * Lists the slots (MATIN / APRES-MIDI / SOIR) for which a grouped agenda file
 * named `RUSN<yyyymmdd>IDS-<slot>.json` exists in the transformed agenda
 * directory of a session.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string} yyyymmdd - Day to look up, as used in the file names.
 * @param {number|string} session - Session identifier (sub-directory name).
 * @returns {{filePath: string, slots: string[]}|null} The session directory
 *   and the slots found, or null when the directory is missing, cannot be
 *   listed, or holds no matching file.
 */
function loadAgendaSPSlotsForDate(dataDir, yyyymmdd, session) {
    const dirPath = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, session.toString());
    if (!fs.existsSync(dirPath)) {
        console.warn(`[AGENDA] Directory not found for session ${session} → ${dirPath}`);
        return null;
    }
    const pattern = new RegExp(`^RUSN${yyyymmdd}IDS-(MATIN|APRES-MIDI|SOIR)\\.json$`);
    let files;
    try {
        files = fs.readdirSync(dirPath);
    }
    catch (error) {
        // Still best-effort (null result), but the failure is no longer hidden.
        console.warn(`[AGENDA] Cannot list directory ${dirPath}:`, error);
        return null;
    }
    // The capture group only ever yields one of the three allowed slots, so a
    // single match per file name is enough.
    const found = new Set();
    for (const name of files) {
        const slot = pattern.exec(name)?.[1];
        if (slot) {
            found.add(slot);
        }
    }
    if (found.size === 0) {
        return null;
    }
    return { filePath: dirPath, slots: Array.from(found) };
}
|
|
72
35
|
async function downloadCriZip(zipPath) {
|
|
73
36
|
if (!options["silent"])
|
|
74
37
|
console.log(`Downloading CRI zip ${CRI_ZIP_URL}…`);
|
|
@@ -117,10 +80,19 @@ export async function retrieveCriXmlDump(dataDir, options = {}) {
|
|
|
117
80
|
const root = path.join(dataDir, COMPTES_RENDUS_FOLDER);
|
|
118
81
|
ensureDirSync(root);
|
|
119
82
|
const originalRoot = path.join(root, DATA_ORIGINAL_FOLDER);
|
|
120
|
-
|
|
83
|
+
if (!options["keepDir"]) {
|
|
84
|
+
ensureAndClearDir(originalRoot);
|
|
85
|
+
}
|
|
86
|
+
else {
|
|
87
|
+
fs.ensureDirSync(originalRoot);
|
|
88
|
+
}
|
|
121
89
|
const transformedRoot = path.join(root, DATA_TRANSFORMED_FOLDER);
|
|
122
|
-
if (options["
|
|
90
|
+
if (!options["keepDir"]) {
|
|
91
|
+
ensureAndClearDir(transformedRoot);
|
|
92
|
+
}
|
|
93
|
+
else {
|
|
123
94
|
fs.ensureDirSync(transformedRoot);
|
|
95
|
+
}
|
|
124
96
|
const sessions = getSessionsFromStart(options["fromSession"]);
|
|
125
97
|
// 1) Download ZIP global + distribut by session
|
|
126
98
|
const zipPath = path.join(dataDir, "cri.zip");
|
|
@@ -158,77 +130,208 @@ export async function retrieveCriXmlDump(dataDir, options = {}) {
|
|
|
158
130
|
for (const f of xmlFiles) {
|
|
159
131
|
const yyyymmdd = f.slice(1, 9);
|
|
160
132
|
const xmlPath = path.join(originalSessionDir, f);
|
|
133
|
+
// === ONLY-RECENT
|
|
161
134
|
if (options["only-recent"]) {
|
|
162
135
|
const cutoff = now - options["only-recent"] * 24 * 3600 * 1000;
|
|
163
|
-
const seanceTs = Date.parse(yyyymmdd.slice(0, 4)
|
|
136
|
+
const seanceTs = Date.parse(`${yyyymmdd.slice(0, 4)}-${yyyymmdd.slice(4, 6)}-${yyyymmdd.slice(6, 8)}`);
|
|
164
137
|
if (seanceTs < cutoff) {
|
|
165
|
-
// Check if some file exists sarting with CRSSN{yyyymmdd} in transformed dir
|
|
166
138
|
const files = await fs.readdir(transformedSessionDir);
|
|
167
|
-
const dayFiles = files.filter((fn) => fn.startsWith(`CRSSN${yyyymmdd}
|
|
139
|
+
const dayFiles = files.filter((fn) => fn.startsWith(`CRSSN${yyyymmdd}E`) && fn.endsWith(".json"));
|
|
168
140
|
if (dayFiles.length > 0) {
|
|
169
|
-
// Link existing files to agendas
|
|
170
141
|
for (const fn of dayFiles) {
|
|
171
|
-
const match = fn.match(/^CRSSN(\d{8})
|
|
172
|
-
const
|
|
142
|
+
const match = fn.match(/^CRSSN(\d{8})E(.+)\.json$/);
|
|
143
|
+
const eventId = match?.[2];
|
|
144
|
+
if (!eventId)
|
|
145
|
+
continue;
|
|
173
146
|
const crPath = path.join(transformedSessionDir, fn);
|
|
174
147
|
try {
|
|
175
148
|
const cr = await fs.readJSON(crPath);
|
|
176
|
-
await
|
|
149
|
+
await linkCriEventIntoAgenda(dataDir, yyyymmdd, eventId, cr.uid, cr, session);
|
|
177
150
|
}
|
|
178
151
|
catch (e) {
|
|
179
|
-
console.warn(`[
|
|
152
|
+
console.warn(`[CR] [${session}] Could not relink existing CR into a reunion for ${yyyymmdd} event=${eventId}:`, e);
|
|
180
153
|
}
|
|
181
154
|
}
|
|
182
155
|
continue;
|
|
183
156
|
}
|
|
184
157
|
}
|
|
185
158
|
}
|
|
186
|
-
//
|
|
187
|
-
const
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
159
|
+
// === Charger les events SP du jour depuis les agendas groupés ===
|
|
160
|
+
const dayEvents = await loadAgendaSpEventsForDate(dataDir, yyyymmdd, session);
|
|
161
|
+
if (dayEvents.length === 0) {
|
|
162
|
+
console.warn(`[CRI] [${session}] No agenda SP events found for ${yyyymmdd} → skip split/link`);
|
|
163
|
+
continue;
|
|
164
|
+
}
|
|
165
|
+
// === Lire XML + construire index DOM ===
|
|
166
|
+
let raw;
|
|
167
|
+
let $;
|
|
168
|
+
let order;
|
|
169
|
+
let idx;
|
|
191
170
|
try {
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
const intervals = computeIntervalsBySlot($, idx, firstSlotOfDay ?? undefined);
|
|
197
|
-
const uniq = new Set();
|
|
198
|
-
for (const iv of intervals)
|
|
199
|
-
if (iv.slot && iv.slot !== "UNKNOWN")
|
|
200
|
-
uniq.add(iv.slot);
|
|
201
|
-
slotsInCri = Array.from(uniq);
|
|
171
|
+
raw = await fs.readFile(xmlPath, "utf8");
|
|
172
|
+
$ = cheerio.load(raw, { xml: false });
|
|
173
|
+
order = $("body *").toArray();
|
|
174
|
+
idx = new Map(order.map((el, i) => [el, i]));
|
|
202
175
|
}
|
|
203
176
|
catch (e) {
|
|
204
177
|
console.warn(`[CRI] [${session}] Cannot read/parse ${f}:`, e);
|
|
205
178
|
continue;
|
|
206
179
|
}
|
|
207
|
-
|
|
208
|
-
|
|
180
|
+
// === Extraire sommaire + matcher vers events agenda ===
|
|
181
|
+
const blocks = extractSommaireBlocks($, idx);
|
|
182
|
+
const intervals = buildIntervalsByAgendaEvents($, idx, order, blocks, dayEvents);
|
|
183
|
+
if (!intervals.length) {
|
|
184
|
+
console.warn(`[CRI] [${session}] No confident split intervals for ${yyyymmdd} → skip`);
|
|
185
|
+
continue;
|
|
209
186
|
}
|
|
210
|
-
//
|
|
211
|
-
for (const
|
|
212
|
-
const outName = `CRSSN${yyyymmdd}
|
|
213
|
-
const
|
|
187
|
+
// === Parser / écrire / linker chaque segment par event ===
|
|
188
|
+
for (const iv of intervals) {
|
|
189
|
+
const outName = `CRSSN${yyyymmdd}E${iv.agendaEventId}.json`;
|
|
190
|
+
const outPath = path.join(transformedSessionDir, outName);
|
|
191
|
+
const cr = await parseCompteRenduIntervalFromFile(xmlPath, iv.startIndex, iv.endIndex, iv.agendaEventId);
|
|
214
192
|
if (!cr) {
|
|
215
|
-
console.warn(`[CRI] [${session}] Empty or no points for ${yyyymmdd}
|
|
193
|
+
console.warn(`[CRI] [${session}] Empty or no points for ${yyyymmdd} event=${iv.agendaEventId} → skip`);
|
|
216
194
|
continue;
|
|
217
195
|
}
|
|
218
|
-
|
|
219
|
-
await fs.ensureDir(outDir);
|
|
220
|
-
const outPath = path.join(outDir, outName);
|
|
196
|
+
await fs.ensureDir(transformedSessionDir);
|
|
221
197
|
await fs.writeJSON(outPath, cr, { spaces: 2 });
|
|
222
198
|
try {
|
|
223
|
-
await
|
|
199
|
+
await linkCriEventIntoAgenda(dataDir, yyyymmdd, iv.agendaEventId, cr.uid, cr, session);
|
|
224
200
|
}
|
|
225
201
|
catch (e) {
|
|
226
|
-
console.warn(`[
|
|
202
|
+
console.warn(`[CR] [${session}] Could not link CR into agenda for ${yyyymmdd} event=${iv.agendaEventId}:`, e);
|
|
227
203
|
}
|
|
228
204
|
}
|
|
229
205
|
}
|
|
230
206
|
}
|
|
231
207
|
}
|
|
208
|
+
/**
 * Records the UID of a compte-rendu on the reunion (agenda) JSON file of the
 * matching "SP" event.
 *
 * The reunion file name is derived with `makeReunionUid(dateISO, "SP",
 * agendaEventId, null)`. When that file is missing, unreadable or does not
 * contain a JSON object, a warning is logged and nothing is written.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string} yyyymmdd - Day of the sitting (8 digits).
 * @param {string|number} agendaEventId - Identifier of the agenda event.
 * @param {string} crUid - UID of the compte-rendu to reference.
 * @param {object} cr - Parsed compte-rendu; currently unused, kept for
 *   signature compatibility with existing callers.
 * @param {number|string} session - Session identifier (sub-directory name).
 * @returns {Promise<void>}
 */
async function linkCriEventIntoAgenda(dataDir, yyyymmdd, agendaEventId, crUid, cr, session) {
    const agendaDir = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, session.toString());
    fs.ensureDirSync(agendaDir);
    const dateISO = `${yyyymmdd.slice(0, 4)}-${yyyymmdd.slice(4, 6)}-${yyyymmdd.slice(6, 8)}`;
    const agendaUid = makeReunionUid(dateISO, "SP", agendaEventId, null);
    const agendaPath = path.join(agendaDir, `${agendaUid}.json`);
    if (!(await fs.pathExists(agendaPath))) {
        console.warn(`[CR] Missing reunion file for SP event=${agendaEventId}: ${agendaPath}`);
        return;
    }
    let agenda;
    try {
        agenda = await fs.readJSON(agendaPath);
    }
    catch (e) {
        // The file exists but cannot be parsed: report that, not "missing".
        console.warn(`[CR] unreadable reunion JSON → ${agendaPath} (${e})`);
        return;
    }
    // Only a plain JSON object can carry the reference; an array would drop
    // the property on serialization and a primitive cannot hold it at all.
    if (agenda === null || typeof agenda !== "object" || Array.isArray(agenda)) {
        console.warn(`[CR] Unexpected reunion JSON content for SP event=${agendaEventId}: ${agendaPath}`);
        return;
    }
    agenda.compteRenduRefUid = crUid;
    await fs.writeJSON(agendaPath, agenda, { spaces: 2 });
    console.log(`[CR] Linked CR ${crUid} → ${path.basename(agendaPath)} (event=${agendaEventId})`);
}
|
|
233
|
+
/**
 * Splits a compte-rendu DOM into index intervals, one per agenda event, by
 * matching sommaire blocks to the events of the day.
 *
 * For every non-noise block the best-scoring event is kept as a pivot when
 * its score reaches MIN_SCORE and beats the runner-up by at least MIN_GAP.
 * Each event keeps its earliest pivot, and an interval runs from that pivot
 * to just before the next one (or to the last element of `order`).
 *
 * @param {Function} $ - Cheerio-like query function for the document.
 * @param {Map<*, number>} idx - Map from DOM element to its position in `order`.
 * @param {Array<*>} order - Flat list of elements; only its length is used here.
 * @param {Iterable<{text: string, targetId: *, startIndex: number}>} blocks - Sommaire blocks.
 * @param {Array<{id: *}>} dayEvents - Agenda events of the day.
 * @returns {Array<{agendaEventId: *, startIndex: number, endIndex: number, score: number}>}
 *   Intervals sorted by start index; empty when no block matches confidently.
 */
function buildIntervalsByAgendaEvents($, idx, order, blocks, dayEvents) {
    const MIN_SCORE = 0.65;
    const MIN_GAP = 0.08;
    const byStartIndex = (left, right) => left.startIndex - right.startIndex;
    const leadSpeaker = $("div.intervenant").first()[0];
    const leadSpeakerIdx = leadSpeaker ? (idx.get(leadSpeaker) ?? null) : null;
    // Scores every event against a block text, tracking the winner and the
    // second-highest score.
    const rankEvents = (text) => {
        let top = null;
        let runnerUp = 0;
        for (const ev of dayEvents) {
            const score = scoreSommaireBlockForEvent(text, ev);
            if (top === null || score > top.score) {
                if (top !== null) {
                    runnerUp = top.score ?? runnerUp;
                }
                top = { ev, score };
            }
            else if (score > runnerUp) {
                runnerUp = score;
            }
        }
        return { top, runnerUp };
    };
    const candidates = [];
    for (const block of blocks) {
        if (isNoiseBlock(block.text)) {
            continue;
        }
        const { top, runnerUp } = rankEvents(block.text);
        if (top === null) {
            continue;
        }
        const resolved = resolveTargetIndex($, idx, block.targetId);
        const contentStartIndex = resolved ?? block.startIndex;
        // An unresolved block located before the first speaker is dropped.
        const precedesDebate = leadSpeakerIdx != null && resolved == null && contentStartIndex < leadSpeakerIdx;
        const confident = !(top.score < MIN_SCORE) && !(top.score - runnerUp < MIN_GAP);
        if (precedesDebate || !confident) {
            continue;
        }
        candidates.push({
            agendaEventId: top.ev.id,
            startIndex: contentStartIndex,
            score: top.score,
        });
    }
    if (candidates.length === 0) {
        return [];
    }
    // Deduplicate per event, keeping the earliest start index.
    candidates.sort(byStartIndex);
    const firstByEvent = new Map();
    for (const { agendaEventId, startIndex, score } of candidates) {
        if (firstByEvent.has(agendaEventId)) {
            continue;
        }
        firstByEvent.set(agendaEventId, { startIndex, score });
    }
    const ordered = [...firstByEvent].map(([agendaEventId, { startIndex, score }]) => ({
        agendaEventId,
        startIndex,
        score,
    }));
    ordered.sort(byStartIndex);
    // Build the intervals: each one ends right before the next pivot.
    return ordered.map((current, position) => {
        const following = ordered[position + 1];
        return {
            agendaEventId: current.agendaEventId,
            startIndex: current.startIndex,
            endIndex: following ? following.startIndex - 1 : order.length - 1,
            score: current.score,
        };
    });
}
|
|
305
|
+
/**
 * Loads the "Séance publique" events of a given day from the transformed
 * agenda files of a session.
 *
 * Files named `RUSN<yyyymmdd>IDS*.json` are read; only the first entry of
 * each file's `events` array is considered, and it is kept when its `type`
 * is "Séance publique". Files that cannot be read or parsed are skipped with
 * a warning instead of being silently ignored.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string} yyyymmdd - Day to look up, as used in the file names.
 * @param {number|string} session - Session identifier (sub-directory name).
 * @returns {Promise<object[]>} The matching events; empty when the session
 *   directory does not exist or nothing matches.
 */
async function loadAgendaSpEventsForDate(dataDir, yyyymmdd, session) {
    const agendasDir = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, session.toString());
    if (!(await fs.pathExists(agendasDir))) {
        return [];
    }
    const prefix = `RUSN${yyyymmdd}IDS`;
    const files = (await fs.readdir(agendasDir)).filter((fn) => fn.startsWith(prefix) && fn.endsWith(".json"));
    const events = [];
    for (const fn of files) {
        const filePath = path.join(agendasDir, fn);
        try {
            const reunion = await fs.readJSON(filePath);
            const firstEvent = reunion?.events?.[0];
            if (firstEvent && firstEvent.type === "Séance publique") {
                events.push(firstEvent);
            }
        }
        catch (e) {
            // Best-effort: one bad file must not abort the whole day, but it
            // should be visible in the logs.
            console.warn(`[CR] [${session}] Skipping unreadable agenda file ${filePath}:`, e);
        }
    }
    return events;
}
|
|
322
|
+
/**
 * Escapes backslashes and double quotes so the value can be embedded in a
 * double-quoted CSS attribute selector.
 *
 * @param {string} s - Raw value.
 * @returns {string} The value with every `\` and `"` prefixed by a backslash.
 */
function cssEscapeIdent(s) {
    return s.replace(/[\\"]/g, (ch) => `\\${ch}`);
}
|
|
325
|
+
/**
 * Finds the position of the element referenced by an anchor target.
 *
 * The element is looked up by `id` first, then by `name`, and its position
 * is read from `idx`.
 *
 * @param {Function} $ - Cheerio-like query function for the document.
 * @param {Map<*, number>} idx - Map from DOM element to its position.
 * @param {string|null|undefined} targetId - Anchor identifier to resolve.
 * @returns {number|null} The element position, or null when the target is
 *   empty, not found, or absent from `idx`.
 */
function resolveTargetIndex($, idx, targetId) {
    if (!targetId) {
        return null;
    }
    const quoted = cssEscapeIdent(targetId);
    const byId = $(`[id="${quoted}"]`)[0];
    const node = byId || $(`[name="${quoted}"]`)[0];
    return node ? (idx.get(node) ?? null) : null;
}
|
|
232
335
|
async function main() {
|
|
233
336
|
const dataDir = options["dataDir"];
|
|
234
337
|
assert(dataDir, "Missing argument: data directory");
|
|
@@ -242,50 +345,3 @@ main()
|
|
|
242
345
|
console.error(error);
|
|
243
346
|
process.exit(1);
|
|
244
347
|
});
|
|
245
|
-
/**
 * Attaches a compte-rendu UID to the grouped agenda file of a day and slot
 * (`RUSN<yyyymmdd>IDS-<slot>.json`), creating that file when no usable group
 * can be read from disk.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string} yyyymmdd - Day of the sitting (8 digits).
 * @param {string} slot - Slot label used in the file name.
 * @param {string} crUid - UID of the compte-rendu to reference.
 * @param {object} cr - Parsed compte-rendu; its `metadonnees.sommaire` feeds
 *   the title/object of a newly created group.
 * @param {number|string} session - Session identifier (sub-directory name).
 * @returns {Promise<void>}
 */
async function linkCriSlotIntoAgendaGrouped(dataDir, yyyymmdd, slot, crUid, cr, session) {
    const groupedDir = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, session.toString());
    fs.ensureDirSync(groupedDir);
    const groupedPath = path.join(groupedDir, `RUSN${yyyymmdd}IDS-${slot}.json`);
    // Reads the group already on disk, if any; an unreadable file counts as absent.
    const readExistingGroup = () => {
        if (!fs.existsSync(groupedPath)) {
            return null;
        }
        try {
            const parsed = JSON.parse(fs.readFileSync(groupedPath, "utf8"));
            if (!Array.isArray(parsed)) {
                return parsed;
            }
            // Prefer the entry for this slot, otherwise fall back to the first one.
            return parsed.find((g) => g?.slot === slot) ?? parsed[0] ?? null;
        }
        catch (e) {
            console.warn(`[AGENDA] unreadable grouped JSON → ${groupedPath} (${e}) → recreating`);
            return null;
        }
    };
    const existing = readExistingGroup();
    const dateISO = [yyyymmdd.slice(0, 4), yyyymmdd.slice(4, 6), yyyymmdd.slice(6, 8)].join("-");
    const derived = deriveTitreObjetFromSommaire(cr?.metadonnees?.sommaire, slot);
    let group;
    if (existing) {
        existing.compteRenduRefUid = crUid;
        group = existing;
    }
    else {
        // Create the group when it is missing.
        group = {
            uid: makeGroupUid(dateISO, slot),
            chambre: "SN",
            date: dateISO,
            slot,
            type: "Séance publique",
            startTime: null,
            endTime: null,
            captationVideo: false,
            titre: derived.titre,
            objet: derived.objet || "",
            events: [],
            compteRenduRefUid: crUid,
        };
    }
    await fs.writeJSON(groupedPath, group, { spaces: 2 });
    console.log(`[AGENDA] Linked CR ${crUid} → ${path.basename(groupedPath)} [${slot}]`);
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
export
|
|
1
|
+
export {};
|