@tricoteuses/senat 2.13.1 → 2.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/loaders.d.ts +1 -3
- package/lib/loaders.js +0 -12
- package/lib/model/compte_rendu.d.ts +9 -0
- package/lib/model/compte_rendu.js +325 -0
- package/lib/model/dosleg.d.ts +1 -0
- package/lib/model/dosleg.js +78 -13
- package/lib/raw_types/db.d.ts +11389 -0
- package/lib/raw_types/db.js +5 -0
- package/lib/scripts/convert_data.js +5 -22
- package/lib/scripts/retrieve_comptes_rendus.d.ts +6 -0
- package/lib/scripts/retrieve_comptes_rendus.js +274 -0
- package/lib/scripts/retrieve_videos.d.ts +7 -1
- package/lib/scripts/retrieve_videos.js +95 -80
- package/lib/utils/cr_spliting.js +1 -0
- package/package.json +1 -1
|
@@ -5,7 +5,7 @@ import path from "path";
|
|
|
5
5
|
import { datasets, EnabledDatasets, getEnabledDatasets } from "../datasets";
|
|
6
6
|
import { DATA_ORIGINAL_FOLDER, DOCUMENT_METADATA_FILE, DOSLEG_DOSSIERS_FOLDER, SCRUTINS_FOLDER, RAPPORT_FOLDER, SENS_CIRCONSCRIPTIONS_FOLDER, SENS_ORGANISMES_FOLDER, SENS_SENATEURS_FOLDER, TEXTE_FOLDER, } from "../loaders";
|
|
7
7
|
import { findAllAmendements, findAllCirconscriptions, findAllDebats, findAllDossiers, findAllScrutins, findAllOrganismes, findAllQuestions, findAllSens, findAuteurs, findSenatRapportUrls, findSenatTexteUrls, } from "../model";
|
|
8
|
-
import {
|
|
8
|
+
import { createActesLegislatifs } from "../model/dosleg";
|
|
9
9
|
import { UNDEFINED_SESSION } from "../types/sessions";
|
|
10
10
|
import { getSessionFromDate, getSessionFromSignet } from "./datautil";
|
|
11
11
|
import { commonOptions } from "./shared/cli_helpers";
|
|
@@ -121,28 +121,11 @@ async function convertDatasetDosLeg(dataDir, options) {
|
|
|
121
121
|
}
|
|
122
122
|
loiReorganizedDir = path.join(dossiersReorganizedDir, String(session));
|
|
123
123
|
fs.ensureDirSync(loiReorganizedDir);
|
|
124
|
-
// Ajout
|
|
125
|
-
const
|
|
126
|
-
|
|
127
|
-
const codeParent = getCodeActeLecture(loi["code_nature_dossier"], lecture["type_lecture"], lectureAss["assemblee"]);
|
|
128
|
-
const textesWithCodeActe = (lectureAss["textes"] || []).map((texte) => ({
|
|
129
|
-
...texte,
|
|
130
|
-
code_acte: getCodeActeTexte(codeParent, texte["origine"])
|
|
131
|
-
}));
|
|
132
|
-
return {
|
|
133
|
-
...lectureAss,
|
|
134
|
-
code_acte: codeParent,
|
|
135
|
-
textes: textesWithCodeActe
|
|
136
|
-
};
|
|
137
|
-
});
|
|
138
|
-
return {
|
|
139
|
-
...lecture,
|
|
140
|
-
lectures_assemblee: lecturesAssemblee
|
|
141
|
-
};
|
|
142
|
-
});
|
|
143
|
-
const loiWithCodeActe = { ...loi, lectures: lecturesWithCodeActe };
|
|
124
|
+
// Ajout des actes législatifs au dossier
|
|
125
|
+
const actesLegislatifs = createActesLegislatifs(loi);
|
|
126
|
+
const loiWithActes = { ...loi, actes_legislatifs: actesLegislatifs };
|
|
144
127
|
const scrutinFileName = `${loi["signet"]}.json`;
|
|
145
|
-
fs.writeJSONSync(path.join(loiReorganizedDir, scrutinFileName),
|
|
128
|
+
fs.writeJSONSync(path.join(loiReorganizedDir, scrutinFileName), loiWithActes, {
|
|
146
129
|
spaces: 2,
|
|
147
130
|
});
|
|
148
131
|
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Needs to be run after retrieve_agenda.ts !
|
|
3
|
+
* - downloads the ZIP of comptes-rendus des débats (CRI) from data.senat.fr
|
|
4
|
+
* - extracts XML files, distributes them by session/year
|
|
5
|
+
*/
|
|
6
|
+
export declare function retrieveCriXmlDump(dataDir: string, options?: Record<string, any>): Promise<void>;
|
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Needs to be run after retrieve_agenda.ts !
|
|
3
|
+
* - downloads the ZIP of comptes-rendus des débats (CRI) from data.senat.fr
|
|
4
|
+
* - extracts XML files, distributes them by session/year
|
|
5
|
+
*/
|
|
6
|
+
import assert from "assert";
|
|
7
|
+
import commandLineArgs from "command-line-args";
|
|
8
|
+
import fs from "fs-extra";
|
|
9
|
+
import path from "path";
|
|
10
|
+
import StreamZip from "node-stream-zip";
|
|
11
|
+
import * as cheerio from "cheerio";
|
|
12
|
+
import { AGENDA_FOLDER, COMPTES_RENDUS_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER, } from "../loaders";
|
|
13
|
+
import { commonOptions } from "./shared/cli_helpers";
|
|
14
|
+
import { deriveTitreObjetFromSommaire, parseCompteRenduSlotFromFile, parseYYYYMMDD, sessionStartYearFromDate } from "../model/compte_rendu";
|
|
15
|
+
import { makeGroupUid } from "../utils/reunion_grouping";
|
|
16
|
+
import { getSessionsFromStart } from "../types/sessions";
|
|
17
|
+
import { ensureAndClearDir, fetchWithRetry } from "./shared/util";
|
|
18
|
+
import { computeIntervalsBySlot } from "../utils/cr_spliting";
|
|
19
|
+
const optionsDefinitions = [
|
|
20
|
+
...commonOptions,
|
|
21
|
+
{
|
|
22
|
+
help: "parse and convert comptes-rendus des débats into JSON",
|
|
23
|
+
name: "parseDebats",
|
|
24
|
+
type: Boolean,
|
|
25
|
+
}
|
|
26
|
+
];
|
|
27
|
+
const options = commandLineArgs(optionsDefinitions);
|
|
28
|
+
const CRI_ZIP_URL = "https://data.senat.fr/data/debats/cri.zip";
|
|
29
|
+
const SLOT_ORDER = ["MATIN", "APRES-MIDI", "SOIR"];
|
|
30
|
+
class CompteRenduError extends Error {
|
|
31
|
+
constructor(message, url) {
|
|
32
|
+
super(`An error occurred while retrieving ${url}: ${message}`);
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
function pickFirstSlotOfDay(slots) {
|
|
36
|
+
for (const s of SLOT_ORDER)
|
|
37
|
+
if (slots.includes(s))
|
|
38
|
+
return s;
|
|
39
|
+
return null;
|
|
40
|
+
}
|
|
41
|
+
function loadAgendaSPSlotsForDate(dataDir, yyyymmdd, session) {
|
|
42
|
+
const dirPath = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, session.toString());
|
|
43
|
+
if (!fs.existsSync(dirPath)) {
|
|
44
|
+
console.warn(`[AGENDA] Directory not found for session ${session} → ${dirPath}`);
|
|
45
|
+
return null;
|
|
46
|
+
}
|
|
47
|
+
const pattern = new RegExp(`^RUSN${yyyymmdd}IDS-(MATIN|APRES-MIDI|SOIR)\\.json$`);
|
|
48
|
+
const ALLOWED_SLOTS = new Set(["MATIN", "APRES-MIDI", "SOIR"]);
|
|
49
|
+
try {
|
|
50
|
+
const files = fs.readdirSync(dirPath);
|
|
51
|
+
const matched = files.filter((f) => pattern.test(f));
|
|
52
|
+
if (matched.length === 0) {
|
|
53
|
+
return null;
|
|
54
|
+
}
|
|
55
|
+
const found = new Set();
|
|
56
|
+
for (const name of matched) {
|
|
57
|
+
const m = name.match(pattern);
|
|
58
|
+
const raw = (m?.[1] ?? "");
|
|
59
|
+
if (ALLOWED_SLOTS.has(raw))
|
|
60
|
+
found.add(raw);
|
|
61
|
+
}
|
|
62
|
+
const slots = Array.from(found);
|
|
63
|
+
if (slots.length === 0) {
|
|
64
|
+
return null;
|
|
65
|
+
}
|
|
66
|
+
return { filePath: dirPath, slots };
|
|
67
|
+
}
|
|
68
|
+
catch {
|
|
69
|
+
return null;
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
async function downloadCriZip(zipPath) {
|
|
73
|
+
if (!options["silent"])
|
|
74
|
+
console.log(`Downloading CRI zip ${CRI_ZIP_URL}…`);
|
|
75
|
+
const response = await fetchWithRetry(CRI_ZIP_URL);
|
|
76
|
+
if (!response.ok) {
|
|
77
|
+
if (response.status === 404) {
|
|
78
|
+
console.warn(`CRI zip ${CRI_ZIP_URL} not found`);
|
|
79
|
+
return;
|
|
80
|
+
}
|
|
81
|
+
throw new CompteRenduError(String(response.status), CRI_ZIP_URL);
|
|
82
|
+
}
|
|
83
|
+
const buf = Buffer.from(await response.arrayBuffer());
|
|
84
|
+
await fs.writeFile(zipPath, buf);
|
|
85
|
+
if (!options["silent"]) {
|
|
86
|
+
const mb = (buf.length / (1024 * 1024)).toFixed(1);
|
|
87
|
+
console.log(`[CRI] Downloaded ${mb} MB → ${zipPath}`);
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
async function extractAndDistributeXmlBySession(zipPath, originalRoot) {
|
|
91
|
+
const zip = new StreamZip.async({ file: zipPath });
|
|
92
|
+
const entries = await zip.entries();
|
|
93
|
+
let count = 0;
|
|
94
|
+
for (const entryName of Object.keys(entries)) {
|
|
95
|
+
if (!entryName.toLowerCase().endsWith(".xml"))
|
|
96
|
+
continue;
|
|
97
|
+
// ex: d20231005.xml
|
|
98
|
+
const base = path.basename(entryName);
|
|
99
|
+
const m = base.match(/^d(\d{8})\.xml$/i);
|
|
100
|
+
if (!m)
|
|
101
|
+
continue;
|
|
102
|
+
const yyyymmdd = m[1];
|
|
103
|
+
const dt = parseYYYYMMDD(yyyymmdd);
|
|
104
|
+
if (!dt)
|
|
105
|
+
continue;
|
|
106
|
+
const session = sessionStartYearFromDate(dt);
|
|
107
|
+
const destDir = path.join(originalRoot, String(session));
|
|
108
|
+
await fs.ensureDir(destDir);
|
|
109
|
+
const outPath = path.join(destDir, base);
|
|
110
|
+
await zip.extract(entryName, outPath);
|
|
111
|
+
count++;
|
|
112
|
+
}
|
|
113
|
+
await zip.close();
|
|
114
|
+
return count;
|
|
115
|
+
}
|
|
116
|
+
export async function retrieveCriXmlDump(dataDir, options = {}) {
|
|
117
|
+
const root = path.join(dataDir, COMPTES_RENDUS_FOLDER);
|
|
118
|
+
ensureAndClearDir(root);
|
|
119
|
+
const originalRoot = path.join(root, DATA_ORIGINAL_FOLDER);
|
|
120
|
+
fs.ensureDirSync(originalRoot);
|
|
121
|
+
const transformedRoot = path.join(root, DATA_TRANSFORMED_FOLDER);
|
|
122
|
+
if (options["parseDebats"])
|
|
123
|
+
fs.ensureDirSync(transformedRoot);
|
|
124
|
+
const sessions = getSessionsFromStart(options["fromSession"]);
|
|
125
|
+
// 1) Download ZIP global + distribut by session
|
|
126
|
+
const zipPath = path.join(dataDir, "cri.zip");
|
|
127
|
+
console.log("[CRI] Downloading global CRI zip…");
|
|
128
|
+
await downloadCriZip(zipPath);
|
|
129
|
+
console.log("[CRI] Extracting + distributing XMLs by session…");
|
|
130
|
+
for (const session of sessions) {
|
|
131
|
+
const dir = path.join(originalRoot, String(session));
|
|
132
|
+
if (await fs.pathExists(dir)) {
|
|
133
|
+
for (const f of await fs.readdir(dir))
|
|
134
|
+
if (/\.xml$/i.test(f))
|
|
135
|
+
await fs.remove(path.join(dir, f));
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
const n = await extractAndDistributeXmlBySession(zipPath, originalRoot);
|
|
139
|
+
if (n === 0) {
|
|
140
|
+
console.warn("[CRI] No XML extracted. Archive empty or layout changed?");
|
|
141
|
+
}
|
|
142
|
+
else {
|
|
143
|
+
console.log(`[CRI] Distributed ${n} XML file(s) into session folders.`);
|
|
144
|
+
}
|
|
145
|
+
if (!options["parseDebats"]) {
|
|
146
|
+
console.log("[CRI] parseDebats not requested → done.");
|
|
147
|
+
return;
|
|
148
|
+
}
|
|
149
|
+
for (const session of sessions) {
|
|
150
|
+
const originalSessionDir = path.join(originalRoot, String(session));
|
|
151
|
+
if (!(await fs.pathExists(originalSessionDir))) {
|
|
152
|
+
continue;
|
|
153
|
+
}
|
|
154
|
+
const xmlFiles = (await fs.readdir(originalSessionDir))
|
|
155
|
+
.filter((f) => /^d\d{8}\.xml$/i.test(f))
|
|
156
|
+
.sort();
|
|
157
|
+
const transformedSessionDir = path.join(transformedRoot, String(session));
|
|
158
|
+
if (options["parseDebats"])
|
|
159
|
+
await fs.ensureDir(transformedSessionDir);
|
|
160
|
+
for (const f of xmlFiles) {
|
|
161
|
+
const yyyymmdd = f.slice(1, 9);
|
|
162
|
+
const xmlPath = path.join(originalSessionDir, f);
|
|
163
|
+
// 1) Deduce slot(s) from agenda if it exsits
|
|
164
|
+
const agendaInfo = loadAgendaSPSlotsForDate(dataDir, yyyymmdd, session);
|
|
165
|
+
const firstSlotOfDay = pickFirstSlotOfDay(agendaInfo?.slots ?? []);
|
|
166
|
+
// 2) Detect slots from CRI content
|
|
167
|
+
let slotsInCri = [];
|
|
168
|
+
try {
|
|
169
|
+
const raw = await fs.readFile(xmlPath, "utf8");
|
|
170
|
+
const $ = cheerio.load(raw, { xml: false });
|
|
171
|
+
const order = $("body *").toArray();
|
|
172
|
+
const idx = new Map(order.map((el, i) => [el, i]));
|
|
173
|
+
const intervals = computeIntervalsBySlot($, idx, firstSlotOfDay ?? undefined);
|
|
174
|
+
const uniq = new Set();
|
|
175
|
+
for (const iv of intervals)
|
|
176
|
+
if (iv.slot && iv.slot !== "UNKNOWN")
|
|
177
|
+
uniq.add(iv.slot);
|
|
178
|
+
slotsInCri = Array.from(uniq);
|
|
179
|
+
}
|
|
180
|
+
catch (e) {
|
|
181
|
+
console.warn(`[CRI] [${session}] Cannot read/parse ${f}:`, e);
|
|
182
|
+
continue;
|
|
183
|
+
}
|
|
184
|
+
if (slotsInCri.length === 0) {
|
|
185
|
+
slotsInCri = [firstSlotOfDay ?? "MATIN"];
|
|
186
|
+
}
|
|
187
|
+
// 3) Parse & write each slot
|
|
188
|
+
for (const slot of slotsInCri) {
|
|
189
|
+
const outName = `CRSSN${yyyymmdd}-${slot}.json`;
|
|
190
|
+
const cr = await parseCompteRenduSlotFromFile(xmlPath, slot, firstSlotOfDay ?? slot);
|
|
191
|
+
if (!cr) {
|
|
192
|
+
console.warn(`[CRI] [${session}] Empty or no points for ${yyyymmdd} (${slot}) → skip`);
|
|
193
|
+
continue;
|
|
194
|
+
}
|
|
195
|
+
const outDir = transformedSessionDir;
|
|
196
|
+
await fs.ensureDir(outDir);
|
|
197
|
+
const outPath = path.join(outDir, outName);
|
|
198
|
+
await fs.writeJSON(outPath, cr, { spaces: 2 });
|
|
199
|
+
try {
|
|
200
|
+
await linkCriSlotIntoAgendaGrouped(dataDir, yyyymmdd, slot, cr.uid, cr, session);
|
|
201
|
+
}
|
|
202
|
+
catch (e) {
|
|
203
|
+
console.warn(`[AGENDA] [${session}] Could not link CR into grouped for ${yyyymmdd} ${slot}:`, e);
|
|
204
|
+
}
|
|
205
|
+
}
|
|
206
|
+
}
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
async function main() {
|
|
210
|
+
const dataDir = options["dataDir"];
|
|
211
|
+
assert(dataDir, "Missing argument: data directory");
|
|
212
|
+
console.time("CRI processing time");
|
|
213
|
+
await retrieveCriXmlDump(dataDir, options);
|
|
214
|
+
if (!options["silent"]) {
|
|
215
|
+
console.timeEnd("CRI processing time");
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
main()
|
|
219
|
+
.then(() => process.exit(0))
|
|
220
|
+
.catch((error) => {
|
|
221
|
+
console.error(error);
|
|
222
|
+
process.exit(1);
|
|
223
|
+
});
|
|
224
|
+
async function linkCriSlotIntoAgendaGrouped(dataDir, yyyymmdd, slot, crUid, cr, session) {
|
|
225
|
+
const groupedDir = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, session.toString());
|
|
226
|
+
fs.ensureDirSync(groupedDir);
|
|
227
|
+
const groupedPath = path.join(groupedDir, 'RUSN' + yyyymmdd + 'IDS-' + slot + '.json');
|
|
228
|
+
let groups = [];
|
|
229
|
+
if (fs.existsSync(groupedPath)) {
|
|
230
|
+
try {
|
|
231
|
+
groups = JSON.parse(fs.readFileSync(groupedPath, "utf8"));
|
|
232
|
+
if (!Array.isArray(groups))
|
|
233
|
+
groups = [];
|
|
234
|
+
}
|
|
235
|
+
catch (e) {
|
|
236
|
+
console.warn(`[AGENDA] unreadable grouped JSON → ${groupedPath} (${e}) → recreating`);
|
|
237
|
+
groups = [];
|
|
238
|
+
}
|
|
239
|
+
}
|
|
240
|
+
// find existing group with same slot
|
|
241
|
+
const sameSlot = groups.filter(g => g?.slot === slot);
|
|
242
|
+
let target = null;
|
|
243
|
+
if (sameSlot.length > 1) {
|
|
244
|
+
console.warn(`[AGENDA] multiple groups for ${yyyymmdd} ${slot} in ${groupedPath} → linking the first`);
|
|
245
|
+
}
|
|
246
|
+
target = sameSlot[0] ?? null;
|
|
247
|
+
const dateISO = `${yyyymmdd.slice(0, 4)}-${yyyymmdd.slice(4, 6)}-${yyyymmdd.slice(6, 8)}`;
|
|
248
|
+
const sommaire = cr?.metadonnees?.sommaire;
|
|
249
|
+
const { titre: dTitre, objet: dObjet } = deriveTitreObjetFromSommaire(sommaire, slot);
|
|
250
|
+
if (!target) {
|
|
251
|
+
const newGroup = {
|
|
252
|
+
uid: makeGroupUid(dateISO, slot),
|
|
253
|
+
chambre: "SN",
|
|
254
|
+
date: dateISO,
|
|
255
|
+
slot,
|
|
256
|
+
type: "Séance publique",
|
|
257
|
+
startTime: null,
|
|
258
|
+
endTime: null,
|
|
259
|
+
captationVideo: false,
|
|
260
|
+
titre: dTitre,
|
|
261
|
+
objet: dObjet || "",
|
|
262
|
+
events: [],
|
|
263
|
+
compteRenduRefUid: crUid,
|
|
264
|
+
};
|
|
265
|
+
groups.push(newGroup);
|
|
266
|
+
}
|
|
267
|
+
else {
|
|
268
|
+
target.compteRenduRefUid = crUid;
|
|
269
|
+
}
|
|
270
|
+
await fs.writeJSON(groupedPath, groups, { spaces: 2 });
|
|
271
|
+
if (!options["silent"]) {
|
|
272
|
+
console.log(`[AGENDA] Linked CR ${crUid} → ${path.basename(groupedPath)} [${slot}]`);
|
|
273
|
+
}
|
|
274
|
+
}
|
|
@@ -1 +1,7 @@
|
|
|
1
|
-
|
|
1
|
+
import { GroupedReunion } from "../types/agenda";
|
|
2
|
+
export declare function buildSenatVodMasterM3u8FromNvs(nvsText: string, finalText: string): string | null;
|
|
3
|
+
export declare function score(agenda: GroupedReunion, agendaTs: number | null, videoTitle?: string, videoEpoch?: number): number;
|
|
4
|
+
/**
|
|
5
|
+
* Build search strategies for senat's videos
|
|
6
|
+
*/
|
|
7
|
+
export declare function buildSearchStrategies(agenda: GroupedReunion): Array<Record<string, string>>;
|
|
@@ -4,12 +4,11 @@ import commandLineArgs from "command-line-args";
|
|
|
4
4
|
import fs from "fs-extra";
|
|
5
5
|
import fsp from "fs/promises";
|
|
6
6
|
import path from "path";
|
|
7
|
-
import { AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER,
|
|
7
|
+
import { AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, iterLoadSenatAgendasGrouped } from "../loaders";
|
|
8
8
|
import { getSessionsFromStart } from "../types/sessions";
|
|
9
9
|
import { commonOptions } from "./shared/cli_helpers";
|
|
10
|
-
import { formatYYYYMMDD, makeReunionUid } from "../utils/reunion_grouping";
|
|
11
10
|
// ===================== Constants =====================
|
|
12
|
-
const MATCH_THRESHOLD = 0.
|
|
11
|
+
const MATCH_THRESHOLD = 0.6;
|
|
13
12
|
const MAX_CANDIDATES = 15;
|
|
14
13
|
const MAX_PAGES = 3;
|
|
15
14
|
const STATS = { total: 0, accepted: 0 };
|
|
@@ -18,9 +17,7 @@ const SENAT_VIDEOS_SEARCH_AJAX = "https://videos.senat.fr/senat_videos_search.ph
|
|
|
18
17
|
const SENAT_DATAS_ROOT = "https://videos.senat.fr/Datas/senat";
|
|
19
18
|
const SENAT_VOD_HOST = "https://vodsenat.akamaized.net";
|
|
20
19
|
// ===================== CLI =====================
|
|
21
|
-
const optionsDefinitions = [
|
|
22
|
-
...commonOptions,
|
|
23
|
-
];
|
|
20
|
+
const optionsDefinitions = [...commonOptions];
|
|
24
21
|
const options = commandLineArgs(optionsDefinitions);
|
|
25
22
|
// ===================== Utils =====================
|
|
26
23
|
function normalize(s) {
|
|
@@ -32,7 +29,9 @@ function normalize(s) {
|
|
|
32
29
|
.replace(/\s+/g, " ")
|
|
33
30
|
.trim();
|
|
34
31
|
}
|
|
35
|
-
function tokens(s) {
|
|
32
|
+
function tokens(s) {
|
|
33
|
+
return normalize(s).split(" ").filter(Boolean);
|
|
34
|
+
}
|
|
36
35
|
function dice(a, b) {
|
|
37
36
|
const A = new Set(tokens(a)), B = new Set(tokens(b));
|
|
38
37
|
if (!A.size || !B.size)
|
|
@@ -46,7 +45,7 @@ function dice(a, b) {
|
|
|
46
45
|
// Heuristic for Europe/Paris DST: +02:00 ≈ April→October, +01:00 otherwise.
|
|
47
46
|
function parisOffsetForDate(dateYYYYMMDD) {
|
|
48
47
|
const m = Number(dateYYYYMMDD.split("-")[1] || "1");
|
|
49
|
-
return
|
|
48
|
+
return m >= 4 && m <= 10 ? "+02:00" : "+01:00";
|
|
50
49
|
}
|
|
51
50
|
function epochToParisDateTime(epochSec) {
|
|
52
51
|
if (!Number.isFinite(epochSec))
|
|
@@ -54,7 +53,7 @@ function epochToParisDateTime(epochSec) {
|
|
|
54
53
|
const dUtc = new Date(epochSec * 1000);
|
|
55
54
|
// Offset heuristic (same logique que parisOffsetForDate)
|
|
56
55
|
const m = dUtc.getUTCMonth() + 1; // 1..12
|
|
57
|
-
const offsetHours =
|
|
56
|
+
const offsetHours = m >= 4 && m <= 10 ? 2 : 1;
|
|
58
57
|
const offsetStr = offsetHours === 2 ? "+02:00" : "+01:00";
|
|
59
58
|
// Applique l'offset pour obtenir la date/heure locales Paris
|
|
60
59
|
const localMs = dUtc.getTime() + offsetHours * 3600 * 1000;
|
|
@@ -149,7 +148,7 @@ function extractCandidatesFromSearchHtml(html) {
|
|
|
149
148
|
out.push({ id, hash, pageUrl, title: t?.[1] });
|
|
150
149
|
}
|
|
151
150
|
const seen = new Set();
|
|
152
|
-
return out.filter(c => {
|
|
151
|
+
return out.filter((c) => {
|
|
153
152
|
const k = `${c.id}_${c.hash}`;
|
|
154
153
|
if (seen.has(k))
|
|
155
154
|
return false;
|
|
@@ -162,46 +161,68 @@ function parseDataNvs(nvs) {
|
|
|
162
161
|
const title = nvs.match(/<metadata\s+name="title"\s+value="([^"]+)"/i)?.[1];
|
|
163
162
|
return { epoch: epoch ? Number(epoch) : undefined, title };
|
|
164
163
|
}
|
|
165
|
-
|
|
166
|
-
|
|
164
|
+
// nvsText = contenu texte de data.nvs (utf-8)
|
|
165
|
+
// finalText = contenu texte de finalplayer.nvs (utf-8)
|
|
166
|
+
export function buildSenatVodMasterM3u8FromNvs(nvsText, finalText) {
|
|
167
|
+
// 1) Base Akamai depuis data.nvs (mp4 "serverfiles://senat/YYYY/MM/encoderX_YYYYMMDDHHMMSS_1.mp4")
|
|
168
|
+
const baseMatch = nvsText.match(/serverfiles:\/\/senat\/(\d{4})\/(\d{2})\/(encoder\d)_(\d{14})/i);
|
|
169
|
+
if (!baseMatch)
|
|
167
170
|
return null;
|
|
168
|
-
|
|
169
|
-
const
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
171
|
+
const [, yyyy, mm, encoder, stamp] = baseMatch;
|
|
172
|
+
const base = `https://vodsenat.akamaized.net/senat/${yyyy}/${mm}/${encoder}_${stamp}`;
|
|
173
|
+
// 2) start/end depuis finalplayer.nvs
|
|
174
|
+
let start = null, end = null;
|
|
175
|
+
const playerAttr = finalText.match(/player[^>]*\bstarttime="(\d+)"[^>]*\bendtime="(\d+)"/i);
|
|
176
|
+
if (playerAttr) {
|
|
177
|
+
start = parseInt(playerAttr[1], 10);
|
|
178
|
+
end = parseInt(playerAttr[2], 10);
|
|
179
|
+
}
|
|
180
|
+
else {
|
|
181
|
+
// fallback: prendre le plus petit timecode des <synchro timecode="...">
|
|
182
|
+
const tc = Array.from(finalText.matchAll(/timecode="(\d+)"/g)).map((m) => parseInt(m[1], 10));
|
|
183
|
+
if (tc.length)
|
|
184
|
+
start = Math.min(...tc);
|
|
185
|
+
}
|
|
186
|
+
// 3) si pas d'end, on peut déduire via "duree" (en secondes) de data.nvs
|
|
187
|
+
if (end == null) {
|
|
188
|
+
const durMeta = nvsText.match(/<metadata[^>]*\bname="duree"[^>]*\bvalue="(\d+)"[^>]*>/i);
|
|
189
|
+
if (durMeta && start != null) {
|
|
190
|
+
const durMs = parseInt(durMeta[1], 10) * 1000; // sec → ms
|
|
191
|
+
end = start + durMs;
|
|
192
|
+
}
|
|
177
193
|
}
|
|
178
|
-
//
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
194
|
+
// 4) Construction de l’URL
|
|
195
|
+
// - si on a start & end → utiliser ps/pd (robuste et conforme à ce que sert le Sénat)
|
|
196
|
+
// - sinon fallback sans suffixe (souvent valide aussi)
|
|
197
|
+
if (start != null && end != null && end > start) {
|
|
198
|
+
const pd = end - start;
|
|
199
|
+
return `${base}_ps${start}_pd${pd}.smil/master.m3u8`;
|
|
183
200
|
}
|
|
184
|
-
|
|
185
|
-
const mAny = xml.match(/https?:\/\/[^"'<>]+\.m3u8/i);
|
|
186
|
-
return mAny ? mAny[0] : null;
|
|
201
|
+
return `${base}.smil/master.m3u8`;
|
|
187
202
|
}
|
|
188
|
-
function score(agenda, agendaTs, videoTitle, videoEpoch) {
|
|
203
|
+
export function score(agenda, agendaTs, videoTitle, videoEpoch) {
|
|
189
204
|
const titleScore = dice(agenda.titre || "", videoTitle || "");
|
|
190
205
|
let timeScore = 0;
|
|
191
206
|
if (agendaTs && videoEpoch) {
|
|
207
|
+
// second
|
|
192
208
|
const deltaMin = Math.abs(videoEpoch - agendaTs) / 60;
|
|
193
|
-
|
|
209
|
+
// delta : 180min
|
|
210
|
+
timeScore = Math.max(0, 1 - deltaMin / 180);
|
|
194
211
|
}
|
|
195
212
|
let orgBonus = 0;
|
|
196
213
|
if (agenda.organe && videoTitle) {
|
|
197
214
|
const o = normalize(agenda.organe);
|
|
198
215
|
const t = normalize(videoTitle);
|
|
199
|
-
|
|
216
|
+
const first = o.split(" ").filter(Boolean)[0];
|
|
217
|
+
if (first && t.includes(first))
|
|
200
218
|
orgBonus = 0.15;
|
|
201
219
|
}
|
|
202
|
-
return 0.3 * titleScore + 0.7 * timeScore + orgBonus;
|
|
220
|
+
return 0.3 * titleScore + 0.7 * timeScore + orgBonus; // Can be adjusted
|
|
203
221
|
}
|
|
204
|
-
|
|
222
|
+
/**
|
|
223
|
+
* Build search strategies for senat's videos
|
|
224
|
+
*/
|
|
225
|
+
export function buildSearchStrategies(agenda) {
|
|
205
226
|
const fr = agenda.date ? toFRDate(agenda.date) : undefined;
|
|
206
227
|
const kw = simplifyTitleForKeywords(agenda.titre || "");
|
|
207
228
|
const commission = agenda.organe || undefined;
|
|
@@ -239,21 +260,23 @@ async function fetchAllSearchPages(args, baseDir, strategyIndex, maxPages = MAX_
|
|
|
239
260
|
}
|
|
240
261
|
return pages;
|
|
241
262
|
}
|
|
242
|
-
async function
|
|
263
|
+
async function processGroupedReunion(agenda, session, dataDir) {
|
|
243
264
|
if (!agenda)
|
|
244
265
|
return;
|
|
266
|
+
// 1) Garde-fous
|
|
245
267
|
if (!agenda.captationVideo) {
|
|
246
268
|
if (!options["silent"])
|
|
247
|
-
console.log(`[skip] ${agenda.
|
|
269
|
+
console.log(`[skip] ${agenda.uid} captationVideo=false`);
|
|
248
270
|
return;
|
|
249
271
|
}
|
|
250
272
|
if (!agenda.date || !agenda.startTime) {
|
|
251
273
|
if (!options["silent"])
|
|
252
|
-
console.log(`[skip] ${agenda.
|
|
274
|
+
console.log(`[skip] ${agenda.uid} date/hour missing`);
|
|
253
275
|
return;
|
|
254
276
|
}
|
|
255
277
|
STATS.total++;
|
|
256
|
-
|
|
278
|
+
// 2) Dossier de sortie (utilise directement l'UID)
|
|
279
|
+
const reunionUid = agenda.uid;
|
|
257
280
|
const baseDir = path.join(dataDir, VIDEOS_ROOT_FOLDER, String(session), reunionUid);
|
|
258
281
|
await fs.ensureDir(baseDir);
|
|
259
282
|
const agendaTs = toTargetEpoch(agenda.date, agenda.startTime);
|
|
@@ -276,8 +299,9 @@ async function processAgenda(agenda, session, dataDir) {
|
|
|
276
299
|
}
|
|
277
300
|
}
|
|
278
301
|
if (usedStrategy === -1 || !candidates.length) {
|
|
279
|
-
if (!options["silent"])
|
|
280
|
-
console.log(`[miss] ${agenda.
|
|
302
|
+
if (!options["silent"]) {
|
|
303
|
+
console.log(`[miss] ${agenda.uid} no candidates (triedStrategies=${strategies.length})`);
|
|
304
|
+
}
|
|
281
305
|
return;
|
|
282
306
|
}
|
|
283
307
|
// ==== 2) Enrich via data.nvs + scoring; pick best ====
|
|
@@ -295,14 +319,14 @@ async function processAgenda(agenda, session, dataDir) {
|
|
|
295
319
|
}
|
|
296
320
|
if (!best) {
|
|
297
321
|
if (!options["silent"])
|
|
298
|
-
console.log(`[miss] ${agenda.
|
|
322
|
+
console.log(`[miss] ${agenda.uid} candidates without data.nvs`);
|
|
299
323
|
return;
|
|
300
324
|
}
|
|
301
325
|
const accepted = best.score >= MATCH_THRESHOLD;
|
|
302
326
|
if (accepted)
|
|
303
327
|
STATS.accepted++;
|
|
304
328
|
if (!options["silent"]) {
|
|
305
|
-
console.log(`[pick] ${agenda.
|
|
329
|
+
console.log(`[pick] ${agenda.uid} best id=${best.id} hash=${best.hash} score=${best.score.toFixed(2)} accepted=${accepted} (strategy=${usedStrategy})`);
|
|
306
330
|
}
|
|
307
331
|
// ==== 3) Write metadata + NVS of the best candidate (always) ====
|
|
308
332
|
const bestDt = best?.epoch ? epochToParisDateTime(best.epoch) : null;
|
|
@@ -317,7 +341,7 @@ async function processAgenda(agenda, session, dataDir) {
|
|
|
317
341
|
startTime: agenda.startTime,
|
|
318
342
|
titre: agenda.titre,
|
|
319
343
|
organe: agenda.organe ?? undefined,
|
|
320
|
-
|
|
344
|
+
uid: agenda.uid,
|
|
321
345
|
},
|
|
322
346
|
best: {
|
|
323
347
|
id: best.id,
|
|
@@ -340,37 +364,30 @@ async function processAgenda(agenda, session, dataDir) {
|
|
|
340
364
|
if (finalTxt)
|
|
341
365
|
await fsp.writeFile(path.join(baseDir, "finalplayer.nvs"), finalTxt, "utf-8");
|
|
342
366
|
let master = null;
|
|
343
|
-
if (dataTxt)
|
|
344
|
-
master = buildSenatVodMasterM3u8FromNvs(dataTxt);
|
|
367
|
+
if (dataTxt && finalTxt)
|
|
368
|
+
master = buildSenatVodMasterM3u8FromNvs(dataTxt, finalTxt);
|
|
345
369
|
// ==== 4) Update agenda file (only if accepted + m3u8) ====
|
|
346
370
|
if (accepted && master) {
|
|
347
|
-
const agendaJsonPath = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, String(session), `${
|
|
371
|
+
const agendaJsonPath = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, String(session), `${agenda.uid}.json`);
|
|
348
372
|
if (await fs.pathExists(agendaJsonPath)) {
|
|
349
373
|
const raw = await fsp.readFile(agendaJsonPath, "utf-8");
|
|
350
|
-
let
|
|
374
|
+
let obj;
|
|
351
375
|
try {
|
|
352
|
-
|
|
376
|
+
obj = JSON.parse(raw);
|
|
353
377
|
}
|
|
354
378
|
catch (e) {
|
|
355
379
|
console.warn(`[warn] invalid JSON in ${agendaJsonPath}:`, e?.message);
|
|
356
|
-
|
|
380
|
+
obj = null;
|
|
357
381
|
}
|
|
358
|
-
if (Array.isArray(
|
|
359
|
-
const
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
else {
|
|
364
|
-
// add/update urlVideo on the matching item
|
|
365
|
-
items[idx] = { ...items[idx], urlVideo: master };
|
|
366
|
-
await writeIfChanged(agendaJsonPath, JSON.stringify(items, null, 2));
|
|
367
|
-
if (!options["silent"]) {
|
|
368
|
-
console.log(`[write] ${agenda.id} urlVideo ← ${master}`);
|
|
369
|
-
}
|
|
382
|
+
if (obj && typeof obj === "object" && !Array.isArray(obj)) {
|
|
383
|
+
const next = { ...obj, urlVideo: master };
|
|
384
|
+
await writeIfChanged(agendaJsonPath, JSON.stringify(next, null, 2));
|
|
385
|
+
if (!options["silent"]) {
|
|
386
|
+
console.log(`[write] ${agenda.uid} urlVideo ← ${master}`);
|
|
370
387
|
}
|
|
371
388
|
}
|
|
372
389
|
else {
|
|
373
|
-
console.warn(`[warn] expected an
|
|
390
|
+
console.warn(`[warn] expected an object in ${agendaJsonPath}, got ${Array.isArray(obj) ? "array" : typeof obj}`);
|
|
374
391
|
}
|
|
375
392
|
}
|
|
376
393
|
else {
|
|
@@ -379,15 +396,14 @@ async function processAgenda(agenda, session, dataDir) {
|
|
|
379
396
|
}
|
|
380
397
|
}
|
|
381
398
|
async function processAll(dataDir, sessions) {
|
|
399
|
+
console.log("Process all Agendas and fetch video's url");
|
|
382
400
|
for (const session of sessions) {
|
|
383
|
-
for (const { item:
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
console.error(`[error] ${agenda.id}:`, e?.message || e);
|
|
390
|
-
}
|
|
401
|
+
for (const { item: agenda } of iterLoadSenatAgendasGrouped(dataDir, session)) {
|
|
402
|
+
try {
|
|
403
|
+
await processGroupedReunion(agenda, session, dataDir);
|
|
404
|
+
}
|
|
405
|
+
catch (e) {
|
|
406
|
+
console.error(`[error] ${agenda?.uid ?? "unknown-uid"}:`, e?.message || e);
|
|
391
407
|
}
|
|
392
408
|
}
|
|
393
409
|
}
|
|
@@ -396,17 +412,16 @@ async function main() {
|
|
|
396
412
|
const dataDir = options["dataDir"];
|
|
397
413
|
assert(dataDir, "Missing argument: data directory");
|
|
398
414
|
const sessions = getSessionsFromStart(options["fromSession"]);
|
|
399
|
-
|
|
400
|
-
console.time("senat-agendas→videos start processing time");
|
|
415
|
+
console.time("senat-agendas→videos start processing time");
|
|
401
416
|
await processAll(dataDir, sessions);
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
const ratio = total ? (accepted / total * 100).toFixed(1) : "0.0";
|
|
407
|
-
console.log(`[summary] accepted=${accepted} / total=${total} (${ratio}%)`);
|
|
408
|
-
}
|
|
417
|
+
console.timeEnd("senat-agendas→videos processing time");
|
|
418
|
+
const { total, accepted } = STATS;
|
|
419
|
+
const ratio = total ? ((accepted / total) * 100).toFixed(1) : "0.0";
|
|
420
|
+
console.log(`[summary] accepted=${accepted} / total=${total} (${ratio}%)`);
|
|
409
421
|
}
|
|
410
422
|
main()
|
|
411
423
|
.then(() => process.exit(0))
|
|
412
|
-
.catch((err) => {
|
|
424
|
+
.catch((err) => {
|
|
425
|
+
console.error(err);
|
|
426
|
+
process.exit(1);
|
|
427
|
+
});
|