@tricoteuses/senat 2.22.11 → 2.22.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/config.d.ts +21 -0
- package/lib/config.js +27 -0
- package/lib/databases.d.ts +2 -0
- package/lib/databases.js +26 -0
- package/lib/datasets.d.ts +34 -0
- package/lib/datasets.js +233 -0
- package/lib/git.d.ts +26 -0
- package/lib/git.js +167 -0
- package/lib/index.d.ts +13 -0
- package/lib/index.js +1 -0
- package/lib/loaders.d.ts +58 -0
- package/lib/loaders.js +286 -0
- package/lib/model/agenda.d.ts +6 -0
- package/lib/model/agenda.js +148 -0
- package/lib/model/ameli.d.ts +51 -0
- package/lib/model/ameli.js +147 -0
- package/lib/model/commission.d.ts +18 -0
- package/lib/model/commission.js +269 -0
- package/lib/model/debats.d.ts +67 -0
- package/lib/model/debats.js +95 -0
- package/lib/model/documents.d.ts +12 -0
- package/lib/model/documents.js +138 -0
- package/lib/model/dosleg.d.ts +7 -0
- package/lib/model/dosleg.js +326 -0
- package/lib/model/index.d.ts +7 -0
- package/lib/model/index.js +7 -0
- package/lib/model/questions.d.ts +45 -0
- package/lib/model/questions.js +89 -0
- package/lib/model/scrutins.d.ts +13 -0
- package/lib/model/scrutins.js +114 -0
- package/lib/model/seance.d.ts +3 -0
- package/lib/model/seance.js +267 -0
- package/lib/model/sens.d.ts +146 -0
- package/lib/model/sens.js +454 -0
- package/lib/model/texte.d.ts +7 -0
- package/lib/model/texte.js +228 -0
- package/lib/model/util.d.ts +9 -0
- package/lib/model/util.js +38 -0
- package/lib/parsers/texte.d.ts +7 -0
- package/lib/parsers/texte.js +228 -0
- package/lib/raw_types/ameli.d.ts +914 -0
- package/lib/raw_types/ameli.js +5 -0
- package/lib/raw_types/debats.d.ts +207 -0
- package/lib/raw_types/debats.js +5 -0
- package/lib/raw_types/dosleg.d.ts +1619 -0
- package/lib/raw_types/dosleg.js +5 -0
- package/lib/raw_types/questions.d.ts +423 -0
- package/lib/raw_types/questions.js +5 -0
- package/lib/raw_types/senat.d.ts +11372 -0
- package/lib/raw_types/senat.js +5 -0
- package/lib/raw_types/sens.d.ts +8248 -0
- package/lib/raw_types/sens.js +5 -0
- package/lib/raw_types_schemats/ameli.d.ts +539 -0
- package/lib/raw_types_schemats/ameli.js +2 -0
- package/lib/raw_types_schemats/debats.d.ts +127 -0
- package/lib/raw_types_schemats/debats.js +2 -0
- package/lib/raw_types_schemats/dosleg.d.ts +977 -0
- package/lib/raw_types_schemats/dosleg.js +2 -0
- package/lib/raw_types_schemats/questions.d.ts +237 -0
- package/lib/raw_types_schemats/questions.js +2 -0
- package/lib/raw_types_schemats/sens.d.ts +6915 -0
- package/lib/raw_types_schemats/sens.js +2 -0
- package/lib/scripts/convert_data.d.ts +1 -0
- package/lib/scripts/convert_data.js +354 -0
- package/lib/scripts/data-download.d.ts +1 -0
- package/lib/scripts/data-download.js +12 -0
- package/lib/scripts/datautil.d.ts +8 -0
- package/lib/scripts/datautil.js +34 -0
- package/lib/scripts/parse_textes.d.ts +1 -0
- package/lib/scripts/parse_textes.js +44 -0
- package/lib/scripts/retrieve_agenda.d.ts +1 -0
- package/lib/scripts/retrieve_agenda.js +132 -0
- package/lib/scripts/retrieve_cr_commission.d.ts +1 -0
- package/lib/scripts/retrieve_cr_commission.js +364 -0
- package/lib/scripts/retrieve_cr_seance.d.ts +6 -0
- package/lib/scripts/retrieve_cr_seance.js +347 -0
- package/lib/scripts/retrieve_documents.d.ts +3 -0
- package/lib/scripts/retrieve_documents.js +219 -0
- package/lib/scripts/retrieve_open_data.d.ts +1 -0
- package/lib/scripts/retrieve_open_data.js +316 -0
- package/lib/scripts/retrieve_senateurs_photos.d.ts +1 -0
- package/lib/scripts/retrieve_senateurs_photos.js +147 -0
- package/lib/scripts/retrieve_videos.d.ts +1 -0
- package/lib/scripts/retrieve_videos.js +461 -0
- package/lib/scripts/shared/cli_helpers.d.ts +95 -0
- package/lib/scripts/shared/cli_helpers.js +91 -0
- package/lib/scripts/shared/util.d.ts +4 -0
- package/lib/scripts/shared/util.js +35 -0
- package/lib/scripts/test_iter_load.d.ts +1 -0
- package/lib/scripts/test_iter_load.js +12 -0
- package/lib/src/model/sens.d.ts +36 -0
- package/lib/src/model/sens.js +35 -4
- package/lib/src/scripts/retrieve_cr_commission.js +12 -0
- package/lib/src/scripts/retrieve_cr_seance.js +12 -0
- package/lib/src/scripts/retrieve_videos.js +13 -1
- package/lib/src/utils/nvs-timecode.d.ts +17 -0
- package/lib/src/utils/nvs-timecode.js +79 -0
- package/lib/src/utils/weights_scoring_config.d.ts +2 -0
- package/lib/src/utils/weights_scoring_config.js +15 -0
- package/lib/strings.d.ts +1 -0
- package/lib/strings.js +18 -0
- package/lib/types/agenda.d.ts +44 -0
- package/lib/types/agenda.js +1 -0
- package/lib/types/ameli.d.ts +5 -0
- package/lib/types/ameli.js +1 -0
- package/lib/types/compte_rendu.d.ts +83 -0
- package/lib/types/compte_rendu.js +1 -0
- package/lib/types/debats.d.ts +2 -0
- package/lib/types/debats.js +1 -0
- package/lib/types/dosleg.d.ts +70 -0
- package/lib/types/dosleg.js +1 -0
- package/lib/types/questions.d.ts +2 -0
- package/lib/types/questions.js +1 -0
- package/lib/types/sens.d.ts +10 -0
- package/lib/types/sens.js +1 -0
- package/lib/types/sessions.d.ts +5 -0
- package/lib/types/sessions.js +84 -0
- package/lib/types/texte.d.ts +74 -0
- package/lib/types/texte.js +16 -0
- package/lib/utils/cr_spliting.d.ts +28 -0
- package/lib/utils/cr_spliting.js +265 -0
- package/lib/utils/date.d.ts +10 -0
- package/lib/utils/date.js +100 -0
- package/lib/utils/nvs-timecode.d.ts +7 -0
- package/lib/utils/nvs-timecode.js +79 -0
- package/lib/utils/reunion_grouping.d.ts +11 -0
- package/lib/utils/reunion_grouping.js +337 -0
- package/lib/utils/reunion_odj_building.d.ts +5 -0
- package/lib/utils/reunion_odj_building.js +154 -0
- package/lib/utils/reunion_parsing.d.ts +23 -0
- package/lib/utils/reunion_parsing.js +209 -0
- package/lib/utils/scoring.d.ts +14 -0
- package/lib/utils/scoring.js +147 -0
- package/lib/utils/string_cleaning.d.ts +7 -0
- package/lib/utils/string_cleaning.js +57 -0
- package/lib/validators/config.d.ts +9 -0
- package/lib/validators/config.js +10 -0
- package/package.json +1 -1
|
@@ -0,0 +1,364 @@
|
|
|
1
|
+
import fs, { ensureDir } from "fs-extra";
|
|
2
|
+
import assert from "assert";
|
|
3
|
+
import path from "path";
|
|
4
|
+
import * as cheerio from "cheerio";
|
|
5
|
+
import { COMMISSION_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER } from "../loaders";
|
|
6
|
+
import { loadAgendaForDate, parseCommissionMetadataFromHtml, linkCRtoCommissionGroup } from "../utils/cr_spliting";
|
|
7
|
+
import { cleanTitle, extractDayH3Sections, parseCommissionCRSectionFromDom } from "../model/commission";
|
|
8
|
+
import commandLineArgs from "command-line-args";
|
|
9
|
+
import { commonOptions } from "./shared/cli_helpers";
|
|
10
|
+
import { sessionStartYearFromDate } from "../model/seance";
|
|
11
|
+
import { getSessionsFromStart } from "../types/sessions";
|
|
12
|
+
import { ensureAndClearDir, fetchWithRetry } from "./shared/util";
|
|
13
|
+
import { jaccard, jaccardTokenSim } from "../utils/scoring";
|
|
14
|
+
/**
 * Error raised when a commission compte-rendu page cannot be downloaded.
 */
class CommissionCRDownloadError extends Error {
    /**
     * @param message Short description of the failure (tryDownload passes the HTTP status).
     * @param url Address of the page that could not be retrieved; embedded in the final message.
     */
    constructor(message, url) {
        super(`An error occurred while retrieving Commission CR ${url}: ${message}`);
        // Without an explicit name, logs and stack traces label this as a generic "Error".
        this.name = "CommissionCRDownloadError";
    }
}
|
|
19
|
+
// Command-line options accepted by this script: the shared options plus the
// download tuning knobs read by retrieveCommissionCRs.
const optionsDefinitions = [
    ...commonOptions,
    // Number of download workers running in parallel.
    {
        name: "concurrency",
        type: Number,
        defaultValue: 6,
        help: "Max parallel downloads",
    },
    // Pause observed by each worker after every job, in milliseconds.
    {
        name: "politenessMs",
        type: Number,
        defaultValue: 150,
        help: "Delay per worker (ms)",
    },
    // NOTE(review): this flag is declared but not read by the code in this file.
    { help: "parse and convert comptes-rendus des débats into JSON", name: "parseDebats", type: Boolean },
];
// Parsed once at module load; main() and retrieveCommissionCRs() read from it.
const options = commandLineArgs(optionsDefinitions);
|
|
30
|
+
// Hub pages that list the weekly compte-rendu pages, keyed by commission label.
// Every commission has two hub pages: the current one and its "_archives" counterpart,
// in that order. The labels are also used as directory names by the download phase.
const COMMISSION_HUBS = Object.fromEntries([
    ["Commission des affaires étrangères", "affaires-etrangeres"],
    ["Commission des affaires économiques", "economie"],
    ["Commission de l'amenagement du territoire et du développement durable", "developpement-durable"],
    ["Commission de la culture", "culture"],
    ["Commission des finances", "finances"],
    ["Commission des lois", "lois"],
    ["Commission des affaires sociales", "affaires-sociales"],
    ["Commission des affaires européennes", "affaires-europeennes"],
].map(([label, slug]) => [
    label,
    [
        `https://www.senat.fr/compte-rendu-commissions/${slug}.html`,
        `https://www.senat.fr/compte-rendu-commissions/${slug}_archives.html`,
    ],
]));
|
|
64
|
+
/**
 * Collects the weekly compte-rendu links found on one hub page.
 *
 * @param hubUrl Address of the hub page; also used as the base for relative links.
 * @returns Absolute URLs, de-duplicated, in order of first appearance.
 *          An empty list when the hub response is not OK.
 */
async function harvestWeeklyLinksFromHub(hubUrl) {
    const response = await fetchWithRetry(hubUrl);
    if (!response.ok) {
        return [];
    }
    const $ = cheerio.load(await response.text());
    // Weekly pages look like /compte-rendu-commissions/<8 digits>/<slug>.html
    const weeklyPagePattern = /\/compte-rendu-commissions\/(\d{8})\/([a-z0-9\-]+)\.html$/i;
    const uniqueLinks = new Set();
    $("a[href]").each((_, anchor) => {
        const href = ($(anchor).attr("href") || "").trim();
        if (!weeklyPagePattern.test(href)) {
            return;
        }
        const absolute = href.startsWith("http") ? href : new URL(href, hubUrl).toString();
        uniqueLinks.add(absolute);
    });
    return [...uniqueLinks];
}
|
|
81
|
+
/**
 * Walks every hub page of every commission and lists the weekly pages to fetch.
 *
 * @param fromSession Pages whose session year is lower than this value are dropped.
 * @returns `{ url, yyyymmdd, commissionKey }` entries sorted by date string.
 *          A hub that fails is logged as a warning and skipped.
 */
async function discoverCommissionWeeklyPages(fromSession) {
    const weeklyPagePattern = /\/compte-rendu-commissions\/(\d{8})\/([a-z0-9\-]+)\.html$/i;
    const pages = [];
    for (const commissionKey of Object.keys(COMMISSION_HUBS)) {
        for (const hubUrl of COMMISSION_HUBS[commissionKey]) {
            try {
                for (const url of await harvestWeeklyLinksFromHub(hubUrl)) {
                    const found = url.match(weeklyPagePattern);
                    if (!found) {
                        continue;
                    }
                    const yyyymmdd = found[1];
                    const year = Number(yyyymmdd.slice(0, 4));
                    const month = Number(yyyymmdd.slice(4, 6));
                    // Months before October are counted in the session of the previous year.
                    const session = month >= 10 ? year : year - 1;
                    if (session < fromSession) {
                        continue;
                    }
                    pages.push({ url, yyyymmdd, commissionKey });
                }
            }
            catch (e) {
                console.warn(`[COM-CR][hub-fail] ${hubUrl} → ${e?.message ?? e}`);
            }
        }
    }
    pages.sort((left, right) => left.yyyymmdd.localeCompare(right.yyyymmdd));
    return pages;
}
|
|
107
|
+
/**
 * Turns a strict "HH:MM" string into "HHMM".
 *
 * @param hhmm Time text, possibly empty or missing.
 * @returns The four digits, or null when the input is falsy or not exactly two digits, a colon, two digits.
 */
function toHourShort(hhmm) {
    const parts = hhmm ? hhmm.match(/^(\d{2}):(\d{2})$/) : null;
    if (!parts) {
        return null;
    }
    const [, hours, minutes] = parts;
    return `${hours}${minutes}`;
}
|
|
113
|
+
/**
 * Converts a colon-separated time string into minutes since midnight.
 *
 * @param hhmm Time text such as "09:30"; only the first two fields are used.
 * @returns hours * 60 + minutes, where a field that is missing or does not parse counts as 0.
 */
function timeToMinutes(hhmm) {
    const [hourText, minuteText] = hhmm.split(":");
    const hours = parseInt(hourText, 10) || 0;
    const minutes = parseInt(minuteText, 10) || 0;
    return hours * 60 + minutes;
}
|
|
117
|
+
/**
 * Fetches one page and returns its raw bytes.
 *
 * @param url Address to download (redirects are followed).
 * @returns A Buffer with the response body, or null when the server answers 404.
 * @throws CommissionCRDownloadError for any other non-OK status.
 */
async function tryDownload(url) {
    const response = await fetch(url, { redirect: "follow" });
    if (response.ok) {
        return Buffer.from(await response.arrayBuffer());
    }
    if (response.status === 404) {
        return null;
    }
    throw new CommissionCRDownloadError(String(response.status), url);
}
|
|
126
|
+
/**
 * Normalizes an organe label for comparison.
 *
 * Lower-cases the text, strips combining accents, spells "&" as "et", replaces every
 * character other than a-z, 0-9, whitespace and "-" with a space, then collapses
 * whitespace runs and trims.
 *
 * @param s Label to normalize.
 * @returns The normalized label.
 */
function normOrgane(s) {
    const unaccented = s.toLowerCase().normalize("NFD").replace(/[\u0300-\u036f]/g, "");
    const plain = unaccented.replace(/&/g, " et ").replace(/[^a-z0-9\s-]/g, " ");
    return plain.replace(/\s+/g, " ").trim();
}
|
|
136
|
+
/**
 * Splits a label into its significant words.
 *
 * @param s Label to tokenize; it is normalized with normOrgane first.
 * @returns A Set of the words that are at least three characters long and not stop words.
 */
function toTokens(s) {
    const stopWords = ["commission", "des", "de", "du", "d", "la", "le", "les", "et"];
    const tokens = new Set();
    for (const word of normOrgane(s).split(/\s+/)) {
        if (word.length < 3 || stopWords.includes(word)) {
            continue;
        }
        tokens.add(word);
    }
    return tokens;
}
|
|
141
|
+
/**
 * Lists the normalized labels under which an agenda event may name its organe.
 *
 * @param h Agenda event; `organeSlug`, `organeKey`, `organe` and `titre` are read, in that order.
 * @returns The normalized, de-duplicated non-empty values.
 */
function reunionOrganeCandidates(h) {
    const labels = new Set();
    for (const raw of [h.organeSlug, h.organeKey, h.organe, h.titre]) {
        if (raw) {
            labels.add(normOrgane(raw));
        }
    }
    return [...labels];
}
|
|
146
|
+
/**
 * Scores how well an agenda event's organe matches a commission key.
 *
 * @param h Agenda event whose organe labels are compared.
 * @param commissionKey Commission label; hyphens are treated as spaces before tokenizing.
 * @returns The highest Jaccard score over the event's candidate labels, 0 when there are none.
 */
function organeSimilarity(h, commissionKey) {
    const wantedTokens = toTokens(commissionKey.replace(/-/g, " "));
    return reunionOrganeCandidates(h)
        .map((label) => toTokens(label))
        .reduce((top, labelTokens) => Math.max(top, jaccard(wantedTokens, labelTokens)), 0);
}
|
|
154
|
+
/**
 * Scores how close an agenda event's start time is to a reference time.
 *
 * @param h Agenda event; only `startTime` is read.
 * @param openHHMM Reference time as text; a falsy value yields 0.
 * @param maxDeltaMin Size of the tolerance window in minutes.
 * @returns A value from 0 to 1: 1 for identical times, decreasing linearly to 0 at the
 *          edge of the window, and 0 beyond it or when either time is missing.
 */
function timeProximityScore(h, openHHMM, maxDeltaMin) {
    if (!openHHMM) {
        return 0;
    }
    const hhmm = h.startTime ?? null;
    if (!hhmm) {
        return 0;
    }
    const d = Math.abs(timeToMinutes(hhmm) - timeToMinutes(openHHMM));
    if (d > maxDeltaMin) {
        return 0;
    }
    // Degenerate windows would turn the division below into NaN (0/0 or x/NaN),
    // which would then corrupt the weighted total and the sort that uses it.
    if (maxDeltaMin === 0) {
        return 1; // only reachable with d === 0, i.e. an exact match
    }
    if (Number.isNaN(maxDeltaMin)) {
        return 0;
    }
    return 1 - d / maxDeltaMin; // 0..1 (1 = same time)
}
|
|
165
|
+
/**
 * Scores how well a CR section title matches an agenda event.
 *
 * @param reunion Agenda event; `titre` and `objet` are compared (missing values count as "").
 * @param sectionTitle Title of the CR section.
 * @returns The better of the two token similarities, or 0 when the section title is
 *          missing or blank.
 */
function titleSimilarity(reunion, sectionTitle) {
    // A null/undefined title is treated like a blank one instead of throwing on .trim().
    const wanted = sectionTitle ?? "";
    if (!wanted.trim()) {
        return 0;
    }
    const sTit = jaccardTokenSim(reunion.titre ?? "", wanted);
    const sObj = jaccardTokenSim(reunion.objet ?? "", wanted);
    return Math.max(sTit, sObj);
}
|
|
174
|
+
/**
 * Downloads the weekly commission compte-rendu pages, then converts every day section
 * they contain into a JSON file and links it to an agenda group.
 *
 * Phase 1 (download): discovers the weekly pages from the hub pages and saves each one
 * under `<dataDir>/<COMMISSION_FOLDER>/<DATA_ORIGINAL_FOLDER>/<session>/<commissionKey>/`.
 * Files already on disk are skipped; a 404 is logged and counted; any other failure is
 * logged and the run continues.
 *
 * Phase 2 (transform): for every saved HTML file, parses its metadata, matches each
 * section of each day against the agenda events of that day, writes `<uid>.json` under
 * the transformed folder and calls linkCRtoCommissionGroup.
 *
 * @param options Parsed command-line options. Keys read here: `dataDir`, `fromSession`,
 *        `concurrency` (default 6), `politenessMs` (default 150), `keepDir`, `silent`.
 * @returns A promise that resolves once both phases are finished.
 */
async function retrieveCommissionCRs(options = {}) {
    // Matching parameters for pairing a CR section with an agenda event.
    const MAX_TIME_DELTA_MIN = 120; // time window passed to timeProximityScore
    const ORGANE_GATE = 0.55; // minimum organe similarity for a candidate to be kept
    const TITLE_GATE = 0.2; // minimum title similarity for a candidate to be kept
    const W_ORG = 0.4;
    const W_TIM = 0.4;
    const W_TIT = 0.2;
    const dataDir = options["dataDir"];
    const fromSession = Number(options["fromSession"]);
    // A non-numeric --concurrency parses to NaN, which would create zero workers and
    // silently download nothing; fall back to the default instead.
    const requestedConcurrency = Number(options["concurrency"] ?? 6);
    const concurrency = Number.isFinite(requestedConcurrency) ? Math.max(1, Math.floor(requestedConcurrency)) : 6;
    const politenessMs = Number(options["politenessMs"] ?? 150);
    const commissionsRootDir = path.join(dataDir, COMMISSION_FOLDER);
    const originalRoot = path.join(commissionsRootDir, DATA_ORIGINAL_FOLDER);
    const transformedRoot = path.join(commissionsRootDir, DATA_TRANSFORMED_FOLDER);
    // ---- Phase 1: download ----
    // ensureDir returns a promise; awaiting it surfaces failures here instead of as an
    // unhandled rejection. Awaiting ensureAndClearDir is harmless if that helper is synchronous.
    if (!options["keepDir"]) {
        await ensureAndClearDir(originalRoot);
    }
    else {
        await ensureDir(originalRoot);
    }
    const discovered = await discoverCommissionWeeklyPages(fromSession);
    console.log(`[COM-CR][discover] ${discovered.length} links (>= session ${fromSession})`);
    // The same weekly page can be listed by both hub pages of a commission. Keep one job
    // per output file so two workers never download and write the same path concurrently.
    const seenOutPaths = new Set();
    const jobs = [];
    for (const { url, yyyymmdd, commissionKey } of discovered) {
        const d = new Date(Number(yyyymmdd.slice(0, 4)), Number(yyyymmdd.slice(4, 6)) - 1, Number(yyyymmdd.slice(6, 8)));
        const session = sessionStartYearFromDate(d);
        const dir = path.join(originalRoot, String(session), commissionKey);
        fs.ensureDirSync(dir);
        const slug = url.replace(/^.*\/(\d{8})\/([^\/]+)\.html$/i, "$2");
        const outPath = path.join(dir, `${yyyymmdd}.${slug}.html`);
        if (seenOutPaths.has(outPath)) {
            continue;
        }
        seenOutPaths.add(outPath);
        jobs.push({ url, outPath, yyyymmdd, commissionKey });
    }
    console.log(`[COM-CR] Downloading ${jobs.length} links → ${path.relative(process.cwd(), originalRoot)}`);
    let completed = 0, saved = 0, skipped = 0, notFound = 0;
    // Each worker pulls jobs from the shared queue until it is empty.
    const workers = Array.from({ length: concurrency }, async () => {
        while (true) {
            const job = jobs.shift();
            if (!job)
                break;
            const { url, outPath, yyyymmdd } = job;
            try {
                if (await fs.pathExists(outPath)) {
                    skipped++;
                }
                else {
                    const buf = await tryDownload(url);
                    if (!buf) {
                        notFound++;
                        console.warn(`[COM-CR][404] ${url} → week=${yyyymmdd}`);
                    }
                    else {
                        await fs.writeFile(outPath, buf);
                        saved++;
                    }
                }
            }
            catch (e) {
                // Best effort: one failed page must not abort the whole run.
                console.error(`[COM-CR][err] ${url} → ${e?.message || e}`);
            }
            finally {
                completed++;
                if (politenessMs > 0)
                    await new Promise((r) => setTimeout(r, politenessMs));
            }
        }
    });
    await Promise.all(workers);
    console.log(`[COM-CR] done: saved=${saved} | skipped=${skipped} | 404=${notFound} | total=${completed}`);
    // ---- Phase 2: transform ----
    const sessions = getSessionsFromStart(options["fromSession"]);
    if (options["keepDir"])
        await ensureDir(transformedRoot);
    else
        await ensureAndClearDir(transformedRoot);
    for (const session of sessions) {
        const originalSessionDir = path.join(originalRoot, String(session));
        if (!(await fs.pathExists(originalSessionDir)))
            continue;
        // Sub-directories are named after the commission keys used by the download phase,
        // e.g. "Commission des finances".
        const commissionDirs = (await fs.readdir(originalSessionDir, { withFileTypes: true }))
            .filter((d) => d.isDirectory())
            .map((d) => d.name);
        for (const commissionKey of commissionDirs) {
            const commissionDir = path.join(originalSessionDir, commissionKey);
            const htmlFiles = (await fs.readdir(commissionDir)).filter((f) => /\.html?$/i.test(f)).sort();
            let totalFiles = 0;
            let linkedFiles = 0;
            for (const f of htmlFiles) {
                const htmlPath = path.join(commissionDir, f);
                let meta;
                let raw = "";
                try {
                    raw = await fs.readFile(htmlPath, "utf8");
                    meta = parseCommissionMetadataFromHtml(raw, f);
                }
                catch (e) {
                    console.warn(`[COM-CR][PRE][${session}] Cannot read/parse ${f}:`, e);
                    continue;
                }
                if (!meta?.days?.length)
                    continue;
                const $ = cheerio.load(raw, { xmlMode: false });
                for (const day of meta.days) {
                    const yyyymmdd = day.date.replace(/-/g, "");
                    const dt = new Date(Number(day.date.slice(0, 4)), Number(day.date.slice(5, 7)) - 1, Number(day.date.slice(8, 10)));
                    // A file stored under one session may contain days belonging to another.
                    const daySession = sessionStartYearFromDate(dt);
                    const hits = await loadAgendaForDate(dataDir, yyyymmdd, daySession);
                    console.log(`[COM-CR][TRANSFORM] ${f} → ${hits.length} agenda events on ${day.date} :`);
                    const sections = extractDayH3Sections($, day.date);
                    if (sections.length === 0) {
                        console.warn(`[COM-CR][TRANSFORM] no sections found for ${f} on ${day.date}, skipping.`);
                        continue;
                    }
                    for (let sIdx = 0; sIdx < sections.length; sIdx++) {
                        const sec = sections[sIdx];
                        let best = null;
                        let reason = "fallback-none";
                        if (hits.length) {
                            // Score every agenda event, drop those below the gates, keep the best total.
                            const scored = hits
                                .map((h) => {
                                const sOrg = organeSimilarity(h, commissionKey); // 0..1
                                const sTim = timeProximityScore(h, sec.time ?? day.openTime ?? null, MAX_TIME_DELTA_MIN); // 0..1
                                const sTit = titleSimilarity(h, sec.title); // 0..1
                                const total = W_ORG * sOrg + W_TIM * sTim + W_TIT * sTit;
                                return { h, sOrg, sTim, sTit, total };
                            })
                                .filter((x) => x.sOrg >= ORGANE_GATE && x.sTit >= TITLE_GATE)
                                .sort((a, b) => b.total - a.total);
                            if (scored[0]) {
                                best = scored[0].h;
                                // Record which component scored highest for the chosen event (diagnostics only).
                                reason =
                                    scored[0].sTit >= Math.max(scored[0].sOrg, scored[0].sTim)
                                        ? "title"
                                        : scored[0].sOrg >= scored[0].sTim
                                            ? "organe"
                                            : "time";
                            }
                        }
                        const hourShort = toHourShort(day.openTime) ?? "NA";
                        const cr = parseCommissionCRSectionFromDom($, htmlPath, {
                            dateISO: day.date,
                            hourShort,
                            organe: commissionKey,
                            section: sec,
                            matched: best ?? undefined,
                        });
                        if (!cr) {
                            console.warn(`[COM-CR][TRANSFORM] parse failed for section#${sIdx} ${path.basename(htmlPath)} → ${best ? best.uid : "NO-GROUP"} (${commissionKey})`);
                            continue;
                        }
                        const fileUid = cr.uid;
                        const transformedSessionDir = path.join(transformedRoot, String(daySession));
                        fs.ensureDirSync(transformedSessionDir);
                        const outPath = path.join(transformedSessionDir, `${fileUid}.json`);
                        await fs.writeJSON(outPath, cr, { spaces: 2 });
                        const titreGuess = cleanTitle(sections[sIdx].title) || "Commission du " + day.date;
                        const up = await linkCRtoCommissionGroup({
                            dataDir,
                            dateISO: day.date,
                            organeDetected: commissionKey,
                            hourShort,
                            crUid: fileUid,
                            titreGuess,
                            groupUid: best ? best.uid : undefined,
                        });
                        totalFiles++;
                        if (up.created || up.updated)
                            linkedFiles++;
                        else {
                            console.warn(`[COM-CR][AGENDA][WARN] CR ${fileUid} (section#${sIdx}) not linked (reason=${reason})`);
                        }
                    }
                }
            }
            if (!options["silent"]) {
                console.log(`[COM-CR][SESSION ${session}][${commissionKey}] Processed ${totalFiles} CR files, linked to agenda: ${linkedFiles}`);
            }
        }
    }
}
|
|
352
|
+
/**
 * Script entry point: checks that a data directory was given, then runs the
 * retrieval and reports the elapsed time on the console.
 */
async function main() {
    assert(options["dataDir"], "Missing argument: data directory");
    const timerLabel = "CRI processing time";
    console.time(timerLabel);
    await retrieveCommissionCRs(options);
    console.timeEnd(timerLabel);
}
// Exit with status 0 on success; on failure, log the error and exit with status 1.
main()
    .then(() => {
        process.exit(0);
    })
    .catch((failure) => {
        console.error(failure);
        process.exit(1);
    });
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Needs to be run after the retrieve_agenda.ts script!
|
|
3
|
+
* - downloads the ZIP of comptes-rendus des débats (CRI) from data.senat.fr
|
|
4
|
+
* - extracts XML files, distributes them by session/year
|
|
5
|
+
*/
|
|
6
|
+
export declare function retrieveCriXmlDump(dataDir: string, options?: Record<string, any>): Promise<void>;
|