@tricoteuses/senat 2.22.11 → 2.22.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/config.d.ts +21 -0
- package/lib/config.js +27 -0
- package/lib/databases.d.ts +2 -0
- package/lib/databases.js +26 -0
- package/lib/datasets.d.ts +34 -0
- package/lib/datasets.js +233 -0
- package/lib/git.d.ts +26 -0
- package/lib/git.js +167 -0
- package/lib/index.d.ts +13 -0
- package/lib/index.js +1 -0
- package/lib/loaders.d.ts +58 -0
- package/lib/loaders.js +286 -0
- package/lib/model/agenda.d.ts +6 -0
- package/lib/model/agenda.js +148 -0
- package/lib/model/ameli.d.ts +51 -0
- package/lib/model/ameli.js +147 -0
- package/lib/model/commission.d.ts +18 -0
- package/lib/model/commission.js +269 -0
- package/lib/model/debats.d.ts +67 -0
- package/lib/model/debats.js +95 -0
- package/lib/model/documents.d.ts +12 -0
- package/lib/model/documents.js +138 -0
- package/lib/model/dosleg.d.ts +7 -0
- package/lib/model/dosleg.js +326 -0
- package/lib/model/index.d.ts +7 -0
- package/lib/model/index.js +7 -0
- package/lib/model/questions.d.ts +45 -0
- package/lib/model/questions.js +89 -0
- package/lib/model/scrutins.d.ts +13 -0
- package/lib/model/scrutins.js +114 -0
- package/lib/model/seance.d.ts +3 -0
- package/lib/model/seance.js +267 -0
- package/lib/model/sens.d.ts +146 -0
- package/lib/model/sens.js +454 -0
- package/lib/model/texte.d.ts +7 -0
- package/lib/model/texte.js +228 -0
- package/lib/model/util.d.ts +9 -0
- package/lib/model/util.js +38 -0
- package/lib/parsers/texte.d.ts +7 -0
- package/lib/parsers/texte.js +228 -0
- package/lib/raw_types/ameli.d.ts +914 -0
- package/lib/raw_types/ameli.js +5 -0
- package/lib/raw_types/debats.d.ts +207 -0
- package/lib/raw_types/debats.js +5 -0
- package/lib/raw_types/dosleg.d.ts +1619 -0
- package/lib/raw_types/dosleg.js +5 -0
- package/lib/raw_types/questions.d.ts +423 -0
- package/lib/raw_types/questions.js +5 -0
- package/lib/raw_types/senat.d.ts +11372 -0
- package/lib/raw_types/senat.js +5 -0
- package/lib/raw_types/sens.d.ts +8248 -0
- package/lib/raw_types/sens.js +5 -0
- package/lib/raw_types_schemats/ameli.d.ts +539 -0
- package/lib/raw_types_schemats/ameli.js +2 -0
- package/lib/raw_types_schemats/debats.d.ts +127 -0
- package/lib/raw_types_schemats/debats.js +2 -0
- package/lib/raw_types_schemats/dosleg.d.ts +977 -0
- package/lib/raw_types_schemats/dosleg.js +2 -0
- package/lib/raw_types_schemats/questions.d.ts +237 -0
- package/lib/raw_types_schemats/questions.js +2 -0
- package/lib/raw_types_schemats/sens.d.ts +6915 -0
- package/lib/raw_types_schemats/sens.js +2 -0
- package/lib/scripts/convert_data.d.ts +1 -0
- package/lib/scripts/convert_data.js +354 -0
- package/lib/scripts/data-download.d.ts +1 -0
- package/lib/scripts/data-download.js +12 -0
- package/lib/scripts/datautil.d.ts +8 -0
- package/lib/scripts/datautil.js +34 -0
- package/lib/scripts/parse_textes.d.ts +1 -0
- package/lib/scripts/parse_textes.js +44 -0
- package/lib/scripts/retrieve_agenda.d.ts +1 -0
- package/lib/scripts/retrieve_agenda.js +132 -0
- package/lib/scripts/retrieve_cr_commission.d.ts +1 -0
- package/lib/scripts/retrieve_cr_commission.js +364 -0
- package/lib/scripts/retrieve_cr_seance.d.ts +6 -0
- package/lib/scripts/retrieve_cr_seance.js +347 -0
- package/lib/scripts/retrieve_documents.d.ts +3 -0
- package/lib/scripts/retrieve_documents.js +219 -0
- package/lib/scripts/retrieve_open_data.d.ts +1 -0
- package/lib/scripts/retrieve_open_data.js +316 -0
- package/lib/scripts/retrieve_senateurs_photos.d.ts +1 -0
- package/lib/scripts/retrieve_senateurs_photos.js +147 -0
- package/lib/scripts/retrieve_videos.d.ts +1 -0
- package/lib/scripts/retrieve_videos.js +461 -0
- package/lib/scripts/shared/cli_helpers.d.ts +95 -0
- package/lib/scripts/shared/cli_helpers.js +91 -0
- package/lib/scripts/shared/util.d.ts +4 -0
- package/lib/scripts/shared/util.js +35 -0
- package/lib/scripts/test_iter_load.d.ts +1 -0
- package/lib/scripts/test_iter_load.js +12 -0
- package/lib/src/model/sens.d.ts +36 -0
- package/lib/src/model/sens.js +35 -4
- package/lib/src/scripts/retrieve_cr_commission.js +12 -0
- package/lib/src/scripts/retrieve_cr_seance.js +12 -0
- package/lib/src/scripts/retrieve_videos.js +13 -1
- package/lib/src/utils/nvs-timecode.d.ts +17 -0
- package/lib/src/utils/nvs-timecode.js +79 -0
- package/lib/src/utils/weights_scoring_config.d.ts +2 -0
- package/lib/src/utils/weights_scoring_config.js +15 -0
- package/lib/strings.d.ts +1 -0
- package/lib/strings.js +18 -0
- package/lib/types/agenda.d.ts +44 -0
- package/lib/types/agenda.js +1 -0
- package/lib/types/ameli.d.ts +5 -0
- package/lib/types/ameli.js +1 -0
- package/lib/types/compte_rendu.d.ts +83 -0
- package/lib/types/compte_rendu.js +1 -0
- package/lib/types/debats.d.ts +2 -0
- package/lib/types/debats.js +1 -0
- package/lib/types/dosleg.d.ts +70 -0
- package/lib/types/dosleg.js +1 -0
- package/lib/types/questions.d.ts +2 -0
- package/lib/types/questions.js +1 -0
- package/lib/types/sens.d.ts +10 -0
- package/lib/types/sens.js +1 -0
- package/lib/types/sessions.d.ts +5 -0
- package/lib/types/sessions.js +84 -0
- package/lib/types/texte.d.ts +74 -0
- package/lib/types/texte.js +16 -0
- package/lib/utils/cr_spliting.d.ts +28 -0
- package/lib/utils/cr_spliting.js +265 -0
- package/lib/utils/date.d.ts +10 -0
- package/lib/utils/date.js +100 -0
- package/lib/utils/nvs-timecode.d.ts +7 -0
- package/lib/utils/nvs-timecode.js +79 -0
- package/lib/utils/reunion_grouping.d.ts +11 -0
- package/lib/utils/reunion_grouping.js +337 -0
- package/lib/utils/reunion_odj_building.d.ts +5 -0
- package/lib/utils/reunion_odj_building.js +154 -0
- package/lib/utils/reunion_parsing.d.ts +23 -0
- package/lib/utils/reunion_parsing.js +209 -0
- package/lib/utils/scoring.d.ts +14 -0
- package/lib/utils/scoring.js +147 -0
- package/lib/utils/string_cleaning.d.ts +7 -0
- package/lib/utils/string_cleaning.js +57 -0
- package/lib/validators/config.d.ts +9 -0
- package/lib/validators/config.js +10 -0
- package/package.json +1 -1
|
@@ -0,0 +1,461 @@
|
|
|
1
|
+
// scripts/retrieve_senat_videos_from_agendas.ts
|
|
2
|
+
import assert from "assert";
|
|
3
|
+
import commandLineArgs from "command-line-args";
|
|
4
|
+
import fs from "fs-extra";
|
|
5
|
+
import fsp from "fs/promises";
|
|
6
|
+
import path from "path";
|
|
7
|
+
import * as cheerio from "cheerio";
|
|
8
|
+
import { AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, iterLoadSenatAgendas } from "../loaders";
|
|
9
|
+
import { getSessionsFromStart } from "../types/sessions";
|
|
10
|
+
import { commonOptions } from "./shared/cli_helpers";
|
|
11
|
+
import { getAgendaSegmentTimecodes } from "../utils/nvs-timecode";
|
|
12
|
+
import { decodeHtmlEntities } from "../utils/string_cleaning";
|
|
13
|
+
import { dice, normalize, scoreVideo } from "../utils/scoring";
|
|
14
|
+
import { epochToParisDateTime, toFRDate, toTargetEpoch } from "../utils/date";
|
|
15
|
+
// ===================== Constants =====================
|
|
16
|
+
// Minimum score the best candidate must reach for the match to be flagged as accepted.
const MATCH_THRESHOLD = 0.5;
|
|
17
|
+
// Upper bound on the number of search results that are enriched and scored per agenda entry.
const MAX_CANDIDATES = 15;
|
|
18
|
+
// Run-wide counters: `total` counts agenda entries for which a search was attempted,
// `accepted` counts those whose best candidate reached MATCH_THRESHOLD. Printed by main().
const STATS = { total: 0, accepted: 0 };
|
|
19
|
+
// Sub-folder of the data directory under which per-reunion video files are stored.
const VIDEOS_ROOT_FOLDER = "videos";
|
|
20
|
+
// Search endpoint queried (with a query string) to list candidate videos.
const SENAT_VIDEOS_SEARCH_AJAX = "https://videos.senat.fr/senat_videos_search.php";
|
|
21
|
+
// Base URL of per-video data; the NVS files are read from `<root>/<id>_<hash>/content/`.
const SENAT_DATAS_ROOT = "https://videos.senat.fr/Datas/senat";
|
|
22
|
+
// ===================== CLI =====================
|
|
23
|
+
// Option definitions accepted by this script: a shallow copy of the shared CLI options.
const optionsDefinitions = [...commonOptions];
|
|
24
|
+
// Command-line options parsed once at module load; read by the functions below.
const options = commandLineArgs(optionsDefinitions);
|
|
25
|
+
// ===================== Utils =====================
|
|
26
|
+
/**
 * Download `url` and return the response body as text.
 *
 * @param url Address to fetch.
 * @returns The body text, or `null` when the response status is not OK.
 *          Errors thrown by `fetch` itself are not caught here.
 */
async function fetchText(url) {
    const response = await fetch(url);
    return response.ok ? await response.text() : null;
}
|
|
32
|
+
/**
 * Download `url` and return the raw response body as a Node `Buffer`.
 *
 * @param url Address to fetch.
 * @returns A Buffer with the body bytes, or `null` when the response status is not OK.
 *          Errors thrown by `fetch` itself are not caught here.
 */
async function fetchBuffer(url) {
    const response = await fetch(url);
    if (response.ok) {
        const bytes = await response.arrayBuffer();
        return Buffer.from(bytes);
    }
    return null;
}
|
|
39
|
+
/**
 * Write `content` to the file at `p` as UTF-8, unless the file already exists
 * with exactly that content (in which case the file is left untouched).
 *
 * @param p       Destination file path.
 * @param content Text to store.
 */
async function writeIfChanged(p, content) {
    const upToDate = (await fs.pathExists(p)) && (await fsp.readFile(p, "utf-8")) === content;
    if (!upToDate) {
        await fsp.writeFile(p, content, "utf-8");
    }
}
|
|
48
|
+
/**
 * Serialize the own enumerable properties of `obj` as a URL query string
 * (`key=value` pairs joined with `&`), percent-encoding both keys and values.
 *
 * @param obj Flat object of query parameters.
 * @returns The encoded query string, without a leading `?`; empty string for an empty object.
 */
function queryString(obj) {
    const pairs = [];
    for (const key of Object.keys(obj)) {
        pairs.push(`${encodeURIComponent(key)}=${encodeURIComponent(obj[key])}`);
    }
    return pairs.join("&");
}
|
|
53
|
+
/**
 * Extract video candidates from the HTML returned by the video search endpoint.
 *
 * Every `h3.card-title a.stretched-link` anchor whose href contains `video.<id>_<hash>`
 * yields one candidate; when the same `<id>_<hash>` appears several times only the
 * first occurrence is kept.
 *
 * @param html Search result markup (several pages may be concatenated).
 * @returns Array of `{ id, hash, pageUrl, title, isSeancePublique }` in document order.
 */
function extractCandidatesFromSearchHtml(html) {
    const $ = cheerio.load(html);
    const videoRefPattern = /video\.(\d+)_([a-z0-9]+)/i;
    // A Map keeps insertion order, so first occurrences win and order is preserved.
    const byKey = new Map();
    $('h3.card-title a.stretched-link[href*="video."]').each((_, anchor) => {
        const link = $(anchor);
        const found = (link.attr("href") || "").match(videoRefPattern);
        if (!found)
            return;
        const [, id, hash] = found;
        const key = `${id}_${hash}`;
        if (byKey.has(key))
            return;
        // Prefer the title attribute, fall back to the link text; collapse whitespace.
        const rawTitle = link.attr("title") || link.text() || "";
        const title = rawTitle.replace(/\s+/g, " ").trim() || undefined;
        byKey.set(key, {
            id,
            hash,
            pageUrl: `https://videos.senat.fr/video.${id}_${hash}.html`,
            title,
            isSeancePublique: title?.toLowerCase().includes("séance publique") ?? false,
        });
    });
    return [...byKey.values()];
}
|
|
79
|
+
/**
 * Pull the fields used for matching out of the text of a `data.nvs` file.
 *
 * @param nvs Text content of data.nvs.
 * @returns `{ epoch, organes, firstChapterLabel }` where
 *   - `epoch` is the numeric value of `<metadata name="date" value="…">`, or `undefined`;
 *   - `organes` lists the decoded, trimmed `label` of every `<metadata name="organes">` tag,
 *     falling back to `["Séance publique"]` when none is found;
 *   - `firstChapterLabel` is the decoded label of the first `<chapter>` tag, or `undefined`.
 */
function parseDataNvs(nvs) {
    const epochMatch = nvs.match(/<metadata\s+name="date"\s+value="(\d+)"/i);
    const epoch = epochMatch && epochMatch[1] ? Number(epochMatch[1]) : undefined;
    // One video can carry several "organes" metadata tags.
    const organes = [];
    for (const tagMatch of nvs.matchAll(/<metadata\b[^>]*\bname="organes"[^>]*>/gi)) {
        const labelMatch = tagMatch[0].match(/\blabel="([^"]+)"/i);
        if (!labelMatch)
            continue;
        const decoded = decodeHtmlEntities(labelMatch[1]).trim();
        if (decoded)
            organes.push(decoded);
    }
    if (organes.length === 0) {
        organes.push("Séance publique");
    }
    const chapterMatch = nvs.match(/<chapter\b[^>]*\blabel="([^"]+)"/i);
    const firstChapterLabel = chapterMatch ? decodeHtmlEntities(chapterMatch[1]).trim() : undefined;
    return { epoch, organes, firstChapterLabel };
}
|
|
102
|
+
/**
 * Derive the master playlist URL from an NVS text.
 *
 * Looks for a source reference such as
 * `serverfiles://senat/2025/10/encoder10_20251022084451_2.mp4` and rebuilds the
 * matching `https://vodsenat.akamaized.net/…/<encoder>_<stamp>.smil/master.m3u8` URL.
 *
 * @param nvsText Text content of an NVS file.
 * @returns The master.m3u8 URL, or `null` when no such reference is present.
 */
function buildSenatVodMasterM3u8FromNvs(nvsText) {
    const found = nvsText.match(/serverfiles:\/\/senat\/(\d{4})\/(\d{2})\/(encoder\d+)_([0-9]{14})/i);
    if (found === null) {
        return null;
    }
    const year = found[1];
    const month = found[2];
    const encoder = found[3];
    const stamp = found[4];
    return `https://vodsenat.akamaized.net/senat/${year}/${month}/${encoder}_${stamp}.smil/master.m3u8`;
}
|
|
111
|
+
/**
 * Tell whether a free-text agenda time (e.g. "14 h 30", "À 10 heures, l'après-midi et le soir")
 * is too ambiguous to be used for time-based scoring.
 *
 * The text is ambiguous when
 *  - it mentions at least two distinct clock times ("midi" counts as 12:00, "minuit" as 00:00), or
 *  - it mentions exactly one clock time together with a day period (matin, après-midi, soir, …)
 *    and a linking word or comma ("," / "et" / "ou" / "puis").
 *
 * @param timeOriginal Raw time text from the agenda; may be empty or undefined.
 * @returns `true` when the time is ambiguous, `false` otherwise (including for empty input).
 */
function isAmbiguousTimeOriginal(timeOriginal) {
    if (!timeOriginal)
        return false;
    const s = timeOriginal.toLowerCase();
    // Catches "14h", "14 h", "14h30", "14 h 30", "14 heures", "14 heure"
    const timeRe = /\b([01]?\d|2[0-3])\s*(?:h|heures?)\s*(?:([0-5]\d))?\b/g;
    const times = new Set();
    for (const m of s.matchAll(timeRe)) {
        const hh = String(m[1]).padStart(2, "0");
        const mm = m[2] ? String(m[2]).padStart(2, "0") : "00";
        times.add(`${hh}:${mm}`);
    }
    // "midi" / "minuit". "après-midi" is a day period, not noon: since "-" is a word
    // boundary, /\bmidi\b/ would match inside it, so blank it out before the noon check.
    const withoutAfternoon = s.replace(/après-?midi/g, " ");
    if (/\bmidi\b/.test(withoutAfternoon))
        times.add("12:00");
    if (/\bminuit\b/.test(s))
        times.add("00:00");
    if (times.size >= 2)
        return true;
    const hasDayPeriod = /\b(matin|après-?midi|soir|nuit|journée|toute la journée)\b/.test(s);
    const hasLinking = /,|\bet\b|\bou\b|\bpuis\b/.test(s);
    return times.size === 1 && hasDayPeriod && hasLinking;
}
|
|
137
|
+
function getAgendaType(agenda) {
|
|
138
|
+
const o = agenda.organe || "";
|
|
139
|
+
if (/séance publique/i.test(o))
|
|
140
|
+
return "Séance publique";
|
|
141
|
+
return "Commission";
|
|
142
|
+
}
|
|
143
|
+
/**
 * Fetch up to `maxPages` result pages from the video search endpoint.
 *
 * Pages are requested sequentially starting at page 1. Fetching stops at the first page
 * that cannot be downloaded (not collected), or right after the first page that
 * contains no video link (that page is still collected).
 *
 * @param args     Search parameters; a `page` parameter is added for each request.
 * @param maxPages Maximum number of pages to request (default 3).
 * @returns The HTML of each collected page, in order.
 */
async function fetchAllSearchPages(args, maxPages = 3) {
    const videoLinkPattern = /href="\/?video\.\d+_[a-z0-9]+\./i;
    const collected = [];
    let pageNumber = 1;
    while (pageNumber <= maxPages) {
        const query = queryString({ ...args, page: String(pageNumber) });
        const body = await fetchText(`${SENAT_VIDEOS_SEARCH_AJAX}?${query}`);
        if (!body)
            break;
        collected.push(body);
        if (!videoLinkPattern.test(body))
            break;
        pageNumber += 1;
    }
    return collected;
}
|
|
156
|
+
/**
 * Classify a normalized organe name into a coarse key used to compare the agenda's
 * organe with a video's organe.
 *
 * Rules are evaluated in order and the first rule with a matching substring wins,
 * so the order of the table below is significant.
 *
 * @param norm Organe name, expected to be already normalized by the caller
 *             (the needles below are lowercase, unaccented, space-separated).
 * @returns One of the keys in the table, or "autre" when `norm` is empty or nothing matches.
 */
function getOrgKey(norm) {
    if (!norm)
        return "autre";
    const rules = [
        { key: "seance_publique", needles: ["seance publique"] },
        { key: "culture", needles: ["culture"] },
        { key: "finances", needles: ["finances"] },
        { key: "affaires_sociales", needles: ["sociales"] },
        { key: "affaires_economiques", needles: ["economiques"] },
        { key: "affaires_europeennes", needles: ["europeennes"] },
        { key: "affaires_etrangeres_defense", needles: ["etrangeres", "forces armees", "defense"] },
        { key: "amenagement_territoire_dd", needles: ["territoire", "durable"] },
        { key: "lois", needles: ["commission des lois"] },
        {
            key: "delegation_collectivites",
            needles: ["delegation aux collectivites territoriales", "delegation a la decentralisation"],
        },
        {
            key: "delegation_droits_femmes",
            needles: ["delegation aux droits des femmes", "egalite des chances entre les hommes et les femmes"],
        },
        { key: "delegation_entreprises", needles: ["delegation aux entreprises"] },
        {
            key: "delegation_outre_mer",
            needles: ["delegation senatoriale aux outre mer", "delegation aux outre mer"],
        },
        { key: "delegation_prospective", needles: ["delegation a la prospective"] },
        {
            key: "opecst",
            needles: ["office parlementaire d evaluation des choix scientifiques et technologiques", "opecst"],
        },
    ];
    const hit = rules.find((rule) => rule.needles.some((needle) => norm.includes(needle)));
    return hit ? hit.key : "autre";
}
|
|
195
|
+
/**
 * Find the video that matches one agenda entry ("réunion") and attach its stream URL
 * to the transformed agenda JSON file.
 *
 * Steps:
 *  1. Guard rails — skip entries without video capture, without date/start time,
 *     or whose start lies in the future.
 *  2. Unless cached NVS files are reused (`--only-recent` and the reunion is older than
 *     the cutoff), search for candidates, score each against the agenda with
 *     `scoreVideo`, and keep the best one.
 *  3. Persist `metadata.json`, `data.nvs` and `finalplayer.nvs` of the best candidate
 *     under `<dataDir>/videos/<session>/<uid>/` — even when that candidate is rejected.
 *  4. When the match is accepted and a master m3u8 URL can be derived, write `urlVideo`
 *     (plus segment timecodes when available) into the agenda JSON file.
 *
 * When cached files are reused, the stored `metadata.json` is consulted so that a
 * candidate that was rejected when it was downloaded is not written into the agenda.
 * If that file is missing or unreadable, the cached files are trusted.
 *
 * @param agenda  Agenda entry; reads `uid`, `date`, `startTime`, `organe`, `titre`,
 *                `objet`, `events` and `captationVideo`.
 * @param session Session identifier, used as a folder name.
 * @param dataDir Root data directory.
 * @returns Resolves when the entry has been processed; network and file-system errors
 *          not handled below propagate to the caller.
 */
async function processGroupedReunion(agenda, session, dataDir) {
    // 1) Guard rails
    if (!agenda.captationVideo) {
        return;
    }
    if (!agenda.date || !agenda.startTime) {
        return;
    }
    const agendaTs = toTargetEpoch(agenda.startTime, agenda.date);
    const now = Date.now();
    // agendaTs is scaled by 1000 to be compared with Date.now() (milliseconds).
    if (agendaTs && agendaTs * 1000 > now) {
        return;
    }
    const reunionUid = agenda.uid;
    const baseDir = path.join(dataDir, VIDEOS_ROOT_FOLDER, String(session), reunionUid);
    const dataNvsPath = path.join(baseDir, "data.nvs");
    const finalplayerNvsPath = path.join(baseDir, "finalplayer.nvs");
    const metadataPath = path.join(baseDir, "metadata.json");
    await fs.ensureDir(baseDir);
    let skipDownload = false;
    if (options["only-recent"]) {
        const cutoff = now - options["only-recent"] * 24 * 3600 * 1000;
        const reunionTs = Date.parse(agenda.date);
        // Older than the cutoff and both NVS files already on disk: reuse them.
        if (reunionTs < cutoff && fs.existsSync(dataNvsPath) && fs.existsSync(finalplayerNvsPath)) {
            skipDownload = true;
        }
    }
    let master = null;
    let dataTxt = null;
    let finalTxt = null;
    let accepted = false;
    if (!skipDownload) {
        STATS.total++;
        const searchParams = {
            search: "true",
            videotype: getAgendaType(agenda),
        };
        if (agenda.date) {
            const fr = toFRDate(agenda.date);
            searchParams.period = "custom";
            searchParams.begin = fr;
            searchParams.end = fr;
        }
        if (agenda.organe) {
            searchParams.organe = agenda.organe;
        }
        const pages = await fetchAllSearchPages(searchParams);
        if (!pages.length) {
            if (!options["silent"]) {
                console.log(`[miss] ${agenda.uid} no candidates (videotype=${searchParams.videotype}, organe=${searchParams.organe || "-"}, date=${searchParams.begin || "-"})`);
            }
            return;
        }
        const combinedHtml = pages.join("\n<!-- PAGE SPLIT -->\n");
        const candidates = extractCandidatesFromSearchHtml(combinedHtml).slice(0, MAX_CANDIDATES);
        if (!candidates.length) {
            if (!options["silent"]) {
                console.log(`[miss] ${agenda.uid} no candidates after parse (videotype=${searchParams.videotype}, organe=${searchParams.organe || "-"}, date=${searchParams.begin || "-"})`);
            }
            return;
        }
        // ==== 2) Enrich via data.nvs + scoring; pick best ====
        let best = null;
        // `events` may be missing or empty; an undefined time is simply "not ambiguous".
        const timeOriginal = agenda.events?.[0]?.timeOriginal;
        const timeAmbiguous = isAmbiguousTimeOriginal(timeOriginal);
        if (timeAmbiguous && !options["silent"]) {
            console.log(`[match] ${agenda.uid} timeOriginal ambiguous => ignoring time scoring: "${timeOriginal}"`);
        }
        for (const c of candidates) {
            const candidateDataUrl = `${SENAT_DATAS_ROOT}/${c.id}_${c.hash}/content/data.nvs`;
            const dataBuf = await fetchBuffer(candidateDataUrl);
            if (!dataBuf)
                continue;
            const meta = parseDataNvs(dataBuf.toString("utf-8"));
            let sameOrg = false;
            // If organes are too different, go to the next candidate
            if (agenda.organe && meta.organes?.length) {
                const agendaOrgNorm = normalize(agenda.organe);
                const agendaKey = getOrgKey(agendaOrgNorm);
                let bestDice = 0;
                let hasSameKey = false;
                for (const vo of meta.organes) {
                    const videoOrgNorm = normalize(vo);
                    const videoKey = getOrgKey(videoOrgNorm);
                    const d = dice(agendaOrgNorm, videoOrgNorm);
                    if (videoKey === agendaKey && videoKey !== "autre") {
                        hasSameKey = true;
                    }
                    if (d > bestDice)
                        bestDice = d;
                }
                if (hasSameKey) {
                    sameOrg = true; // we are sure this is the same org
                }
                else if (bestDice < 0.8) {
                    // different org key and similarity too low: skip this candidate
                    continue;
                }
            }
            // For "séance publique" videos the first chapter label is used as the title.
            let videoTitle = c.title;
            if (c.isSeancePublique && meta.firstChapterLabel) {
                videoTitle = meta.firstChapterLabel;
            }
            const s = scoreVideo(agenda, agendaTs, sameOrg, videoTitle, meta.epoch, meta.organes, timeAmbiguous);
            if (!best || s > best.score) {
                best = {
                    id: c.id,
                    hash: c.hash,
                    pageUrl: c.pageUrl,
                    epoch: meta.epoch,
                    vtitle: videoTitle,
                    score: s,
                    vorgane: meta.organes[0],
                };
            }
        }
        if (!best) {
            if (!options["silent"])
                console.log(`[miss] ${agenda.uid} No candidate found for this reunion`);
            return;
        }
        accepted = best.score >= MATCH_THRESHOLD;
        if (accepted)
            STATS.accepted++;
        if (!options["silent"]) {
            console.log(`[pick] ${agenda.uid} score=${best.score.toFixed(2)}
agenda title="${agenda.titre ?? ""}" agenda organe="${agenda.organe ?? ""}" agenda heure=${agenda.startTime}
best title="${best.vtitle ?? ""}" best organe="${best.vorgane ?? ""}"
accepted=${accepted}`);
        }
        // ==== 3) Write metadata + NVS of the best candidate (always) ====
        const bestDt = best.epoch ? epochToParisDateTime(best.epoch) : null;
        const metadata = {
            reunionUid,
            session,
            accepted,
            threshold: MATCH_THRESHOLD,
            agenda: {
                date: agenda.date,
                startTime: agenda.startTime,
                titre: agenda.titre,
                organe: agenda.organe ?? undefined,
                uid: agenda.uid,
            },
            best: {
                id: best.id,
                hash: best.hash,
                pageUrl: best.pageUrl,
                epoch: best.epoch ?? null,
                date: bestDt?.date ?? null,
                startTime: bestDt?.startTime ?? null,
                title: best.vtitle ?? null,
                score: best.score,
            },
        };
        await writeIfChanged(metadataPath, JSON.stringify(metadata, null, 2));
        const dataUrl = `${SENAT_DATAS_ROOT}/${best.id}_${best.hash}/content/data.nvs`;
        const finalUrl = `${SENAT_DATAS_ROOT}/${best.id}_${best.hash}/content/finalplayer.nvs`;
        dataTxt = await fetchText(dataUrl);
        finalTxt = await fetchText(finalUrl);
        if (dataTxt)
            await fsp.writeFile(dataNvsPath, dataTxt, "utf-8");
        if (finalTxt)
            await fsp.writeFile(finalplayerNvsPath, finalTxt, "utf-8");
        if (dataTxt) {
            master = buildSenatVodMasterM3u8FromNvs(dataTxt);
        }
        else {
            console.log("Cannot download data nvs");
        }
    }
    else {
        // Skipped download, but need to read data.nvs for urlVideo
        try {
            dataTxt = await fsp.readFile(dataNvsPath, "utf-8");
            finalTxt = await fsp.readFile(finalplayerNvsPath, "utf-8");
            master = buildSenatVodMasterM3u8FromNvs(dataTxt);
        }
        catch (e) {
            console.warn(e);
        }
        // NVS files are stored even for rejected candidates (step 3), so their mere
        // presence does not prove the match was accepted: consult the stored verdict.
        accepted = true;
        try {
            const cached = JSON.parse(await fsp.readFile(metadataPath, "utf-8"));
            if (cached && cached.accepted === false) {
                accepted = false;
            }
        }
        catch {
            // metadata.json missing or unreadable: trust the cached files.
        }
    }
    // ==== 4) Update agenda file (only if accepted + m3u8) ====
    if (accepted && master) {
        const agendaJsonPath = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, String(session), `${agenda.uid}.json`);
        let timecodeDebutVideo = null;
        let timecodeFinVideo = null;
        if (dataTxt && finalTxt) {
            const agendaKey = agenda.titre || agenda.objet || "";
            const seg = getAgendaSegmentTimecodes(dataTxt, finalTxt, agendaKey);
            if (!seg) {
                console.warn(`[warn] Cannot retrieve agenda segment timecodes from reunion ${reunionUid}`);
            }
            else {
                timecodeDebutVideo = seg.start;
                timecodeFinVideo = seg.end;
            }
        }
        if (await fs.pathExists(agendaJsonPath)) {
            const raw = await fsp.readFile(agendaJsonPath, "utf-8");
            let obj;
            try {
                obj = JSON.parse(raw);
            }
            catch (e) {
                console.warn(`[warn] invalid JSON in ${agendaJsonPath}:`, e?.message);
                obj = null;
            }
            if (obj && typeof obj === "object" && !Array.isArray(obj)) {
                const next = { ...obj, urlVideo: master };
                if (timecodeDebutVideo != null) {
                    next.timecodeDebutVideo = timecodeDebutVideo;
                    next.timecodeFinVideo = timecodeFinVideo;
                }
                await writeIfChanged(agendaJsonPath, JSON.stringify(next, null, 2));
                if (!options["silent"]) {
                    console.log(`[write] ${agenda.uid} urlVideo ← ${master}` +
                        (timecodeDebutVideo != null ? ` (timecodeDebutVideo ← ${timecodeDebutVideo}s)` : ""));
                }
            }
            else {
                console.warn(`[warn] expected an object in ${agendaJsonPath}, got ${Array.isArray(obj) ? "array" : typeof obj}`);
            }
        }
        else {
            console.warn(`[warn] agenda file not found for update: ${agendaJsonPath}`);
        }
    }
    else if (!master) {
        console.warn(`[warn] The video url could not be built for reunion `, reunionUid);
    }
    else if (skipDownload && !options["silent"]) {
        // A rejected fresh pick was already reported by the [pick] log above.
        console.log(`[skip] ${agenda.uid} cached candidate was rejected; agenda file left untouched`);
    }
}
|
|
432
|
+
/**
 * Process every agenda entry of every requested session, one after the other.
 *
 * A failure on one entry is logged with its uid and does not stop the run.
 *
 * @param dataDir  Root data directory.
 * @param sessions Sessions to iterate over.
 */
async function processAll(dataDir, sessions) {
    console.log("Process all Agendas and fetch video's url");
    for (const session of sessions) {
        for (const entry of iterLoadSenatAgendas(dataDir, session)) {
            const agenda = entry.item;
            await processGroupedReunion(agenda, session, dataDir).catch((e) => {
                console.error(`[error] ${agenda?.uid ?? "unknown-uid"}:`, e?.message || e);
            });
        }
    }
}
|
|
445
|
+
/**
 * Script entry point: validate the CLI arguments, process all sessions starting at
 * `--fromSession`, then print the elapsed time and the acceptance summary.
 *
 * @throws AssertionError when the data directory argument is missing.
 */
async function main() {
    const dataDir = options["dataDir"];
    assert(dataDir, "Missing argument: data directory");
    const sessions = getSessionsFromStart(options["fromSession"]);
    // console.time and console.timeEnd must share the exact same label; otherwise
    // the timer is never reported and Node warns about an unknown label.
    const timerLabel = "senat-agendas→videos processing time";
    console.time(timerLabel);
    await processAll(dataDir, sessions);
    console.timeEnd(timerLabel);
    const { total, accepted } = STATS;
    const ratio = total ? ((accepted / total) * 100).toFixed(1) : "0.0";
    console.log(`[summary] accepted=${accepted} / total=${total} (${ratio}%)`);
}
|
|
456
|
+
// Run the script: exit with status 0 on success, log the error and exit with status 1 on failure.
main()
    .then(() => {
        process.exit(0);
    })
    .catch((error) => {
        console.error(error);
        process.exit(1);
    });
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
/** `--categories` / `-k` (repeatable): categories of datasets to reorganize. */
export declare const categoriesOption: { alias: string; defaultValue: string[]; help: string; multiple: boolean; name: string; type: StringConstructor };
/** Default (positional) option `dataDir`: directory containing the open data files. */
export declare const dataDirDefaultOption: { defaultOption: boolean; help: string; name: string; type: StringConstructor };
/** `--fromSession`: session year to retrieve data from. */
export declare const fromSessionOption: { defaultValue: number; help: string; name: string; type: NumberConstructor };
/** `--silent` / `-s`: don't log anything. */
export declare const silentOption: { alias: string; help: string; name: string; type: BooleanConstructor };
/** `--verbose` / `-v`: verbose logs. */
export declare const verboseOption: { alias: string; help: string; name: string; type: BooleanConstructor };
/** `--only-recent`: number of days N used to restrict retrieval to recent items. */
export declare const onlyRecentOption: { help: string; name: string; type: NumberConstructor };
/** `--keepDir`: keep directories when cleaning data. */
export declare const keepDirOption: { help: string; name: string; type: BooleanConstructor };
/** `--clone` / `-C`: git URL of the group (or organization) to clone repositories from. */
export declare const cloneOption: { alias: string; help: string; name: string; type: StringConstructor };
/** `--commit`: commit clean files. */
export declare const commitOption: { help: string; name: string; type: BooleanConstructor };
/** `--remote` / `-r` (repeatable): remote to push the commit to. */
export declare const remoteOption: { alias: string; help: string; multiple: boolean; name: string; type: StringConstructor };
/** `--pull` / `-p`: pull repositories before proceeding. */
export declare const pullOption: { alias: string; help: string; name: string; type: BooleanConstructor };
/** `--fetchDocuments`: download documents. */
export declare const fetchDocumentsOption: { help: string; name: string; type: BooleanConstructor };
/** `--parseDocuments`: parse documents. */
export declare const parseDocumentsOption: { help: string; name: string; type: BooleanConstructor };
/** All option definitions above, gathered in one array shared by the scripts. */
export declare const commonOptions: (
    | { defaultOption: boolean; help: string; name: string; type: StringConstructor }
    | { help: string; name: string; type: NumberConstructor }
    | { help: string; name: string; type: BooleanConstructor }
    | { alias: string; help: string; name: string; type: StringConstructor }
)[];
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
// Shared command-line option definitions (passed to `command-line-args` by the scripts).
// Each object keeps its keys in alphabetical order.

/** `--categories` / `-k` (repeatable): dataset categories to process; defaults to ["All"]. */
export const categoriesOption = { alias: "k", defaultValue: ["All"], help: "categories of datasets to reorganize; default All", multiple: true, name: "categories", type: String };

/** Default (positional) option: the data directory. */
export const dataDirDefaultOption = { defaultOption: true, help: "directory containing Sénat open data files", name: "dataDir", type: String };

/** `--fromSession`: first session year to process; defaults to 2022. */
export const fromSessionOption = { defaultValue: 2022, help: "session year to retrieve data from; default 2022", name: "fromSession", type: Number };

/** `--silent` / `-s`. */
export const silentOption = { alias: "s", help: "don't log anything", name: "silent", type: Boolean };

/** `--verbose` / `-v`. */
export const verboseOption = { alias: "v", help: "verbose logs", name: "verbose", type: Boolean };

/** `--only-recent <N>`: number of days. */
export const onlyRecentOption = { help: "retrieve only documents created within the last N days (that are not already downloaded)", name: "only-recent", type: Number };

/** `--keepDir`. */
export const keepDirOption = { help: "keep directories when cleaning data", name: "keepDir", type: Boolean };

/** `--clone <url>` / `-C`. */
export const cloneOption = { alias: "C", help: "clone repositories from given group (or organization) git URL", name: "clone", type: String };

/** `--commit`. */
export const commitOption = { help: "commit clean files", name: "commit", type: Boolean };

/** `--remote <name>` / `-r` (repeatable). */
export const remoteOption = { alias: "r", help: "push commit to given remote", multiple: true, name: "remote", type: String };

/** `--pull` / `-p`. */
export const pullOption = { alias: "p", help: "pull repositories before proceeding", name: "pull", type: Boolean };

/** `--fetchDocuments`. */
export const fetchDocumentsOption = { help: "download documents", name: "fetchDocuments", type: Boolean };

/** `--parseDocuments`. */
export const parseDocumentsOption = { help: "parse documents", name: "parseDocuments", type: Boolean };

/** Every option above, in declaration order. */
export const commonOptions = [
    categoriesOption,
    dataDirDefaultOption,
    fromSessionOption,
    silentOption,
    verboseOption,
    onlyRecentOption,
    keepDirOption,
    cloneOption,
    commitOption,
    remoteOption,
    pullOption,
    fetchDocumentsOption,
    parseDocumentsOption,
];
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
/** Returns `true` when `option` is falsy, has length 0, or includes `value`. */
export declare function isOptionEmptyOrHasValue(option: string, value: string): boolean;
|
|
2
|
+
/** Synchronously ensures that `dir` exists, then recursively removes every entry inside it. */
export declare function ensureAndClearDirSync(dir: string): void;
|
|
3
|
+
/** Synchronously creates `path` (recursively) when it does not exist, otherwise empties it. */
export declare function ensureAndClearDir(path: string): void;
|
|
4
|
+
/**
 * Calls `fetch(url)`, retrying when the call throws. The implementation defaults to 3 attempts
 * and a 300 ms initial wait that doubles after each failure; the last error is rethrown.
 */
export declare function fetchWithRetry(url: string, retries?: number, backoff?: number): Promise<Response>;
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import fs from "fs-extra";
|
|
2
|
+
import path from "path";
|
|
3
|
+
/**
 * Tell whether an option is unset/empty or contains a given value.
 *
 * @param option Option value; anything exposing `length` and `includes` works.
 * @param value  Value looked up with `option.includes`.
 * @returns `true` when `option` is falsy, has length 0, or includes `value`.
 */
export function isOptionEmptyOrHasValue(option, value) {
    if (!option) {
        return true;
    }
    if (option.length === 0) {
        return true;
    }
    return option.includes(value);
}
|
|
6
|
+
/**
 * Make sure `dir` exists, then delete everything it contains (synchronously).
 * The directory itself is kept; each entry is removed recursively and forcefully.
 *
 * @param dir Directory to create if needed and to empty.
 */
export function ensureAndClearDirSync(dir) {
    fs.ensureDirSync(dir);
    fs.readdirSync(dir).forEach((entry) => {
        fs.rmSync(path.join(dir, entry), { recursive: true, force: true });
    });
}
|
|
12
|
+
/**
 * Create the directory (recursively) when it is missing, otherwise empty it.
 * Runs synchronously.
 *
 * @param path Directory path. Note: this parameter shadows the imported `path` module.
 */
export function ensureAndClearDir(path) {
    if (fs.existsSync(path)) {
        fs.emptyDirSync(path);
        return;
    }
    fs.mkdirSync(path, { recursive: true });
}
|
|
20
|
+
/**
 * `fetch` with retries on thrown errors.
 *
 * Only exceptions raised by `fetch` trigger a retry; any response that `fetch`
 * resolves with is returned as-is, whatever its status. The wait between attempts
 * starts at `backoff` milliseconds and doubles after every failed attempt.
 *
 * @param url     Address to fetch.
 * @param retries Maximum number of attempts (default 3).
 * @param backoff Initial wait in milliseconds (default 300).
 * @returns The response of the first attempt that does not throw.
 * @throws The error of the last attempt, or a generic Error when no attempt was made.
 */
export async function fetchWithRetry(url, retries = 3, backoff = 300) {
    let waitMs = backoff;
    let attempt = 0;
    while (attempt < retries) {
        try {
            return await fetch(url);
        }
        catch (error) {
            const isLastAttempt = attempt === retries - 1;
            if (isLastAttempt) {
                throw error;
            }
            console.warn(`Fetch attempt ${attempt + 1} for ${url} failed. Retrying in ${waitMs}ms…`);
            await new Promise((resolve) => setTimeout(resolve, waitMs));
            waitMs *= 2;
        }
        attempt += 1;
    }
    throw new Error(`Failed to fetch ${url} after ${retries} attempts`);
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|