@tricoteuses/senat 2.9.1 → 2.9.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.md +22 -22
- package/README.md +116 -116
- package/lib/loaders.d.ts +5 -1
- package/lib/loaders.js +8 -0
- package/lib/model/agenda.js +2 -0
- package/lib/model/compte_rendu.d.ts +1 -2
- package/lib/model/compte_rendu.js +303 -22
- package/lib/scripts/retrieve_comptes_rendus.js +27 -14
- package/lib/scripts/retrieve_videos.d.ts +1 -0
- package/lib/scripts/retrieve_videos.js +420 -0
- package/lib/types/agenda.d.ts +2 -0
- package/lib/types/compte_rendu.d.ts +72 -7
- package/lib/validators/senat.d.ts +0 -0
- package/lib/validators/senat.js +24 -0
- package/package.json +96 -94
|
@@ -2,11 +2,11 @@ import assert from "assert";
|
|
|
2
2
|
import commandLineArgs from "command-line-args";
|
|
3
3
|
import fs from "fs-extra";
|
|
4
4
|
import path from "path";
|
|
5
|
-
import { COMPTES_RENDUS_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER, iterLoadSenatDebats } from "../loaders";
|
|
6
|
-
import { parseCompteRenduFromFile } from "../model/compte_rendu";
|
|
5
|
+
import { COMPTES_RENDUS_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER, iterLoadSenatDebats, } from "../loaders";
|
|
7
6
|
import { getSessionsFromStart } from "../types/sessions";
|
|
8
7
|
import { commonOptions } from "./shared/cli_helpers";
|
|
9
8
|
import { ensureAndClearDir } from "./shared/util";
|
|
9
|
+
import { parseCompteRenduFromFile } from "../model/compte_rendu";
|
|
10
10
|
const optionsDefinitions = [
|
|
11
11
|
...commonOptions,
|
|
12
12
|
{
|
|
@@ -22,6 +22,22 @@ class CompteRenduError extends Error {
|
|
|
22
22
|
super(`An error occurred while retrieving Compte-Rendu ${compteRenduUrl}: ${message}`);
|
|
23
23
|
}
|
|
24
24
|
}
|
|
25
|
+
/**
 * Fetches `url`, retrying with exponential backoff when `fetch` itself rejects.
 *
 * Only rejections are retried: any response that resolves is returned as-is,
 * whatever its HTTP status, and is left to the caller to inspect.
 *
 * @param {string} url - Resource to fetch.
 * @param {number} [retries=5] - Maximum number of attempts.
 * @param {number} [backoffMs=1000] - Delay before the second attempt; doubled after every failure.
 * @returns {Promise<Response | null>} The first response that resolves, or
 *   `null` when every attempt rejected (callers must handle `null`).
 */
async function fetchWithRetry(url, retries = 5, backoffMs = 1000) {
    for (let attempt = 0; attempt < retries; attempt++) {
        try {
            return await fetch(url);
        }
        catch (e) {
            // `attempt` stops at `retries - 1`, so this is the last try: do not
            // announce a retry that will not happen, nor sleep for nothing.
            if (attempt === retries - 1)
                break;
            console.warn(`Fetch attempt ${attempt + 1} for ${url} failed. Retrying in ${backoffMs}ms...`);
            await new Promise((resolve) => setTimeout(resolve, backoffMs));
            backoffMs *= 2;
        }
    }
    console.log(`Failed to fetch ${url} after ${retries} attempts.`);
    return null;
}
|
|
25
41
|
async function retrieveComptesRendus(dataDir, sessions) {
|
|
26
42
|
const comptesRendusRootDir = path.join(dataDir, COMPTES_RENDUS_FOLDER);
|
|
27
43
|
ensureAndClearDir(comptesRendusRootDir);
|
|
@@ -38,16 +54,15 @@ async function retrieveComptesRendus(dataDir, sessions) {
|
|
|
38
54
|
if (options["parseDebats"]) {
|
|
39
55
|
fs.ensureDirSync(transformedComptesRendusSessionDir);
|
|
40
56
|
}
|
|
41
|
-
for (const { item: debat
|
|
42
|
-
if (!debat.url)
|
|
57
|
+
for (const { item: debat } of iterLoadSenatDebats(dataDir, session)) {
|
|
58
|
+
if (!debat.url)
|
|
43
59
|
continue;
|
|
44
|
-
}
|
|
45
60
|
try {
|
|
46
61
|
const debatMonoUrl = `${path.parse(debat.url).dir}/s${debat.id}_mono.html`;
|
|
47
62
|
const compteRenduPath = path.join(originalComptesRendusSessionDir, `${debat.id}.html`);
|
|
48
63
|
await downloadCompteRendu(debatMonoUrl, compteRenduPath);
|
|
49
64
|
if (options["parseDebats"]) {
|
|
50
|
-
await
|
|
65
|
+
await parseAndWriteJSON(transformedComptesRendusSessionDir, compteRenduPath, debat);
|
|
51
66
|
}
|
|
52
67
|
}
|
|
53
68
|
catch (error) {
|
|
@@ -61,7 +76,7 @@ async function downloadCompteRendu(debatUrl, compteRenduPath) {
|
|
|
61
76
|
if (!options["silent"]) {
|
|
62
77
|
console.log(`Downloading Compte-Rendu ${compteRenduUrl}…`);
|
|
63
78
|
}
|
|
64
|
-
const response = await
|
|
79
|
+
const response = await fetchWithRetry(compteRenduUrl);
|
|
65
80
|
if (!response.ok) {
|
|
66
81
|
if (response.status === 404) {
|
|
67
82
|
console.warn(`Compte-Rendu ${compteRenduUrl} not found`);
|
|
@@ -72,21 +87,19 @@ async function downloadCompteRendu(debatUrl, compteRenduPath) {
|
|
|
72
87
|
return;
|
|
73
88
|
}
|
|
74
89
|
const compteRenduContent = await response.arrayBuffer();
|
|
75
|
-
if (!compteRenduContent)
|
|
90
|
+
if (!compteRenduContent)
|
|
76
91
|
return;
|
|
77
|
-
}
|
|
78
92
|
fs.writeFileSync(compteRenduPath, Buffer.from(compteRenduContent));
|
|
79
93
|
}
|
|
80
|
-
async function
|
|
94
|
+
async function parseAndWriteJSON(transformedComptesRendusSessionDir, compteRenduPath, debat) {
|
|
81
95
|
if (!options["silent"]) {
|
|
82
96
|
console.log(`Parsing compte-rendu ${compteRenduPath}…`);
|
|
83
97
|
}
|
|
84
|
-
const
|
|
85
|
-
if (!
|
|
98
|
+
const parsed = await parseCompteRenduFromFile(compteRenduPath);
|
|
99
|
+
if (!parsed)
|
|
86
100
|
return;
|
|
87
|
-
}
|
|
88
101
|
const parsedFilePath = path.parse(compteRenduPath);
|
|
89
|
-
fs.writeJSONSync(path.join(transformedComptesRendusSessionDir, `${parsedFilePath.name}.json`),
|
|
102
|
+
fs.writeJSONSync(path.join(transformedComptesRendusSessionDir, `${parsedFilePath.name}.json`), parsed, { spaces: 2 });
|
|
90
103
|
}
|
|
91
104
|
async function main() {
|
|
92
105
|
const dataDir = options["dataDir"];
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,420 @@
|
|
|
1
|
+
// scripts/retrieve_senat_videos_from_agendas.ts
|
|
2
|
+
import assert from "assert";
|
|
3
|
+
import commandLineArgs from "command-line-args";
|
|
4
|
+
import fs from "fs-extra";
|
|
5
|
+
import fsp from "fs/promises";
|
|
6
|
+
import path from "path";
|
|
7
|
+
import { AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, iterLoadSenatAgendas, } from "../loaders";
|
|
8
|
+
import { getSessionsFromStart } from "../types/sessions";
|
|
9
|
+
import { commonOptions } from "./shared/cli_helpers";
|
|
10
|
+
// ===================== Constants =====================
|
|
11
|
+
// Minimum score the best candidate must reach (>=) to be counted as accepted.
const MATCH_THRESHOLD = 0.60;
|
|
12
|
+
// Upper bound on the number of search candidates kept for scoring.
const MAX_CANDIDATES = 15;
|
|
13
|
+
// Default number of result pages requested per search strategy.
const MAX_PAGES = 3;
|
|
14
|
+
// Run counters: agendas that entered matching / agendas whose best candidate was accepted.
const STATS = { total: 0, accepted: 0 };
|
|
15
|
+
// Sub-folder of the data directory receiving the per-reunion output folders.
const VIDEOS_ROOT_FOLDER = "videos";
|
|
16
|
+
// Search endpoint queried with the strategy parameters plus a page number.
const SENAT_VIDEOS_SEARCH_AJAX = "https://videos.senat.fr/senat_videos_search.php";
|
|
17
|
+
// Base URL under which each video's content/data.nvs and content/finalplayer.nvs are fetched.
const SENAT_DATAS_ROOT = "https://videos.senat.fr/Datas/senat";
|
|
18
|
+
// Default host used when building a .smil/playlist.m3u8 URL.
const SENAT_VOD_HOST = "https://vodsenat.akamaized.net";
|
|
19
|
+
// ===================== CLI =====================
|
|
20
|
+
// This script defines no option of its own: it only accepts the shared ones.
const optionsDefinitions = [
|
|
21
|
+
...commonOptions,
|
|
22
|
+
];
|
|
23
|
+
// Parsed once at module load; read below as options["dataDir"], options["fromSession"] and options["silent"].
const options = commandLineArgs(optionsDefinitions);
|
|
24
|
+
// ===================== Utils =====================
|
|
25
|
+
/**
 * Canonical form used for text comparison: lower-cased, diacritics removed,
 * everything except letters, digits, whitespace and "-" turned into a space,
 * whitespace runs collapsed, ends trimmed.
 *
 * @param s - Text to normalise; `null`/`undefined` is treated as "".
 * @returns The normalised string.
 */
function normalize(s) {
    const lowered = (s ?? "").toLowerCase();
    // NFD splits accented letters into base letter + combining mark, which is then dropped.
    const withoutDiacritics = lowered.normalize("NFD").replace(/[\u0300-\u036f]/g, "");
    const wordCharsOnly = withoutDiacritics.replace(/[^\p{L}\p{N}\s-]/gu, " ");
    return wordCharsOnly.replace(/\s+/g, " ").trim();
}
|
|
34
|
+
/**
 * Splits the normalised form of `s` on single spaces.
 *
 * @param s - Text to tokenise.
 * @returns The non-empty tokens, in order.
 */
function tokens(s) {
    const words = normalize(s).split(" ");
    return words.filter((word) => Boolean(word));
}
|
|
35
|
+
/**
 * Dice coefficient between the token sets of two strings:
 * 2 * |A ∩ B| / (|A| + |B|).
 *
 * @param a - First text.
 * @param b - Second text.
 * @returns A value between 0 and 1; 0 when either text has no token.
 */
function dice(a, b) {
    const left = new Set(tokens(a));
    const right = new Set(tokens(b));
    if (left.size === 0 || right.size === 0) {
        return 0;
    }
    const shared = [...left].filter((token) => right.has(token)).length;
    return (2 * shared) / (left.size + right.size);
}
|
|
45
|
+
/**
 * UTC offset of Europe/Paris on a calendar day, as an ISO 8601 suffix.
 *
 * The offset is read from the runtime's time-zone data at 12:00 UTC of that
 * day, so days after the late-March switch and after the late-October switch
 * back get the right value (a plain "April to October" month test does not).
 * On a switch day the offset returned is the one in force after the switch.
 *
 * @param {string} dateYYYYMMDD - Day formatted as "YYYY-MM-DD".
 * @returns {string} "+02:00" during summer time, "+01:00" otherwise.
 */
function parisOffsetForDate(dateYYYYMMDD) {
    const noonUtc = new Date(`${dateYYYYMMDD}T12:00:00Z`);
    if (Number.isNaN(noonUtc.getTime())) {
        // Unparseable day: keep the previous month-based approximation so the
        // result for such inputs does not change.
        const m = Number(dateYYYYMMDD.split("-")[1] || "1");
        return (m >= 4 && m <= 10) ? "+02:00" : "+01:00";
    }
    const hourPart = new Intl.DateTimeFormat("en-GB", { timeZone: "Europe/Paris", hour: "2-digit", hour12: false })
        .formatToParts(noonUtc)
        .find((part) => part.type === "hour");
    // 12:00 UTC reads 14 in Paris during summer time, 13 otherwise.
    return Number(hourPart?.value) - 12 === 2 ? "+02:00" : "+01:00";
}
|
|
50
|
+
/**
 * Converts a Unix timestamp to the Europe/Paris wall-clock date and time.
 *
 * The offset is derived from the runtime's time-zone data for that exact
 * instant rather than from the month, so instants in late March and late
 * October get the right offset.
 *
 * @param {number} epochSec - Seconds since the Unix epoch.
 * @returns {{ date: string, startTime: string } | null} `date` as
 *   "YYYY-MM-DD" and `startTime` as "HH:mm:ss.SSS±HH:MM", or `null` when
 *   `epochSec` is not finite or falls outside the range a `Date` can hold.
 */
function epochToParisDateTime(epochSec) {
    if (!Number.isFinite(epochSec))
        return null;
    const instant = new Date(epochSec * 1000);
    if (Number.isNaN(instant.getTime()))
        return null;
    // Compare the Paris wall-clock hour with the UTC hour to get the offset.
    // "% 24" covers runtimes that print midnight as "24" when hour12 is false.
    const hourPart = new Intl.DateTimeFormat("en-GB", { timeZone: "Europe/Paris", hour: "2-digit", hour12: false })
        .formatToParts(instant)
        .find((part) => part.type === "hour");
    const parisHour = Number(hourPart?.value) % 24;
    const offsetHours = (parisHour - instant.getUTCHours() + 24) % 24 === 2 ? 2 : 1;
    const offsetStr = offsetHours === 2 ? "+02:00" : "+01:00";
    // Shift the instant by the offset, then read it with the UTC getters to
    // obtain the Paris local fields.
    const local = new Date(instant.getTime() + offsetHours * 3600 * 1000);
    const pad = (value, width = 2) => String(value).padStart(width, "0");
    const yyyy = String(local.getUTCFullYear());
    const mm = pad(local.getUTCMonth() + 1);
    const dd = pad(local.getUTCDate());
    const hh = pad(local.getUTCHours());
    const mi = pad(local.getUTCMinutes());
    const ss = pad(local.getUTCSeconds());
    const ms = pad(local.getUTCMilliseconds(), 3);
    return {
        date: `${yyyy}-${mm}-${dd}`,
        startTime: `${hh}:${mi}:${ss}.${ms}${offsetStr}`,
    };
}
|
|
73
|
+
/**
 * Builds a Unix timestamp (seconds) from an agenda date and start time.
 *
 * @param {string} date - Day formatted as "YYYY-MM-DD".
 * @param {string} [time] - "H", "H:mm", "HH:mm:ss" or "HH:mm:ss.SSS", with or
 *   without a trailing zone ("Z" or "±HH:MM"); defaults to "00:00".
 * @returns {number | null} Seconds since the Unix epoch, or `null` when
 *   `date` is missing or the combined string cannot be parsed.
 */
function toTargetEpoch(date, time) {
    if (!date)
        return null;
    let t = (time ?? "00:00").trim();
    // A time that already carries a zone (Z or ±HH:MM) is only prefixed with the date.
    const hasTz = /(?:Z|[+-]\d{2}:\d{2})$/i.test(t);
    if (!hasTz) {
        // Complete the time up to HH:mm:ss.
        if (/^\d{1,2}$/.test(t)) {
            t = `${t}:00:00`;
        }
        else if (/^\d{1,2}:\d{2}$/.test(t)) {
            t = `${t}:00`;
        } // otherwise keep as is (HH:mm:ss and HH:mm:ss.SSS)
    }
    // The ISO format needs a two-digit hour: "9:30:00" must become "09:30:00".
    t = t.replace(/^(\d):/, "0$1:");
    // Example with a zone: 2022-10-04T18:00:00.000+02:00.
    // Without one, the Paris offset for that day is appended.
    const iso = hasTz ? `${date}T${t}` : `${date}T${t}${parisOffsetForDate(date)}`;
    const ms = Date.parse(iso);
    return Number.isNaN(ms) ? null : Math.floor(ms / 1000);
}
|
|
98
|
+
/**
 * Downloads `url` and returns its body as text.
 *
 * @param url - Resource to fetch.
 * @returns The body text, or `null` when the response status is not OK.
 */
async function fetchText(url) {
    const response = await fetch(url);
    if (response.ok) {
        return response.text();
    }
    return null;
}
|
|
104
|
+
/**
 * Downloads `url` and returns its body as a Buffer.
 *
 * @param url - Resource to fetch.
 * @returns The body bytes, or `null` when the response status is not OK.
 */
async function fetchBuffer(url) {
    const response = await fetch(url);
    if (response.ok) {
        const bytes = await response.arrayBuffer();
        return Buffer.from(bytes);
    }
    return null;
}
|
|
111
|
+
/**
 * Writes `content` to `p` as UTF-8 unless the file already holds exactly
 * that text.
 *
 * @param p - Destination file path.
 * @param content - Text to store.
 */
async function writeIfChanged(p, content) {
    const alreadyThere = await fs.pathExists(p);
    // The file is only read when it exists.
    const upToDate = alreadyThere && (await fsp.readFile(p, "utf-8")) === content;
    if (!upToDate) {
        await fsp.writeFile(p, content, "utf-8");
    }
}
|
|
120
|
+
/**
 * Serialises an object as a URL query string (without a leading "?").
 *
 * @param obj - Parameters; keys and values are both URI-component encoded.
 * @returns "k1=v1&k2=v2…" in the object's own enumeration order.
 */
function queryString(obj) {
    const pairs = [];
    for (const [key, value] of Object.entries(obj)) {
        pairs.push(`${encodeURIComponent(key)}=${encodeURIComponent(value)}`);
    }
    return pairs.join("&");
}
|
|
125
|
+
/**
 * Reduces an agenda title to search keywords by dropping boilerplate
 * ("audition de", "réunion"/"reunion", "sur"), French articles and
 * prepositions (la, le, les, des, de, du, elided d’ / d'), and dashes.
 *
 * Word limits are checked against Unicode letters and digits. The ASCII-only
 * `\b` treats accented letters as separators, which cut "le" out of
 * "contrôle" and never matched "réunion".
 *
 * @param {string} input - Raw title; a falsy value is treated as "".
 * @returns {string} The remaining words separated by single spaces.
 */
function simplifyTitleForKeywords(input) {
    const notAfterWordChar = "(?<![\\p{L}\\p{N}_])";
    const notBeforeWordChar = "(?![\\p{L}\\p{N}_])";
    const wholeWord = (alternatives) => new RegExp(`${notAfterWordChar}(?:${alternatives})${notBeforeWordChar}`, "giu");
    // An elided "d" only counts when it is attached to a following word.
    const elidedDe = new RegExp(`${notAfterWordChar}d[’'](?=[\\p{L}\\p{N}_])`, "giu");
    return (input || "")
        .replace(wholeWord("audition\\s+de"), " ")
        .replace(wholeWord("r[eé]union"), " ")
        .replace(wholeWord("sur"), " ")
        .replace(elidedDe, " ")
        .replace(wholeWord("la|le|les|des|de|du"), " ")
        .replace(/[–—-]/g, " ")
        .replace(/\s+/g, " ")
        .trim();
}
|
|
135
|
+
/**
 * Re-orders a "YYYY-MM-DD" date as "DD/MM/YYYY".
 *
 * @param dateYYYYMMDD - Date with "-" separated fields.
 * @returns The same fields as day/month/year.
 */
function toFRDate(dateYYYYMMDD) {
    const fields = dateYYYYMMDD.split("-");
    return `${fields[2]}/${fields[1]}/${fields[0]}`;
}
|
|
139
|
+
/**
 * Removes the separators of a "YYYY-MM-DD" date, giving "YYYYMMDD".
 *
 * @param dateYYYYMMDD - Date with "-" separated fields.
 * @returns The first three fields concatenated.
 */
function formatYYYYMMDD(dateYYYYMMDD) {
    const fields = dateYYYYMMDD.split("-");
    return `${fields[0]}${fields[1]}${fields[2]}`;
}
|
|
143
|
+
/**
 * Builds the identifier "<YYYYMMDD>-<agenda id>" for an agenda entry.
 *
 * @param agenda - Entry with an `id` and, normally, a "YYYY-MM-DD" `date`.
 * @returns The identifier; "00000000" stands in for a missing date.
 */
function makeReunionUid(agenda) {
    let datePart = "00000000";
    if (agenda.date) {
        datePart = formatYYYYMMDD(agenda.date);
    }
    return `${datePart}-${agenda.id}`;
}
|
|
148
|
+
/**
 * Collects the distinct video links (`href="/video.<id>_<hash>.…"`) found in
 * search-result HTML.
 *
 * The title is looked up in the 240 characters on either side of the link:
 * first a `title="…"` attribute, otherwise the first 10 to 200 character
 * text node.
 *
 * @param html - Search-result markup.
 * @returns One `{ id, hash, pageUrl, title }` per distinct id/hash pair, in
 *   order of first appearance; `title` is undefined when none is found.
 */
function extractCandidatesFromSearchHtml(html) {
    const linkPattern = /href="\/?video\.(\d+)_([a-z0-9]+)\.[^"]+"/gi;
    // Map keeps insertion order; only the first occurrence of a key is stored.
    const byKey = new Map();
    for (let hit = linkPattern.exec(html); hit; hit = linkPattern.exec(html)) {
        const [, id, hash] = hit;
        const key = `${id}_${hash}`;
        if (byKey.has(key)) {
            continue;
        }
        const from = Math.max(0, hit.index - 240);
        const to = Math.min(html.length, hit.index + 240);
        const surroundings = html.slice(from, to);
        const titleHit = surroundings.match(/title="([^"]+)"/i) || surroundings.match(/>([^<]{10,200})</);
        byKey.set(key, {
            id,
            hash,
            pageUrl: `https://videos.senat.fr/video.${id}_${hash}.html`,
            title: titleHit?.[1],
        });
    }
    return [...byKey.values()];
}
|
|
168
|
+
/**
 * Reads the `date` and `title` metadata entries of a data.nvs document.
 *
 * @param nvs - Document text containing `<metadata name="…" value="…">` tags.
 * @returns `epoch` (the numeric `date` value) and `title`; each is undefined
 *   when its tag is not found.
 */
function parseDataNvs(nvs) {
    const dateHit = nvs.match(/<metadata\s+name="date"\s+value="(\d+)"/i);
    const titleHit = nvs.match(/<metadata\s+name="title"\s+value="([^"]+)"/i);
    let epoch;
    if (dateHit && dateHit[1]) {
        epoch = Number(dateHit[1]);
    }
    return { epoch, title: titleHit ? titleHit[1] : undefined };
}
|
|
173
|
+
/**
 * Derives an HLS playlist URL from the text of an NVS document.
 *
 * Tried in order:
 *  1. a complete "vodsenat" URL ending in .smil/playlist.m3u8 or
 *     .smil/master.m3u8, returned unchanged;
 *  2. a "senat/YYYY/MM/<name>.smil" path, rebuilt on `host`;
 *  3. a "senat/YYYY/MM/<name>.mp4" path, rebuilt on `host` as .smil;
 *  4. any absolute .m3u8 URL (lowest priority: may be a live stream).
 *
 * @param xml - NVS document text.
 * @param host - Scheme and host used for rebuilt URLs.
 * @returns The playlist URL, or `null` when `xml` is empty or nothing matches.
 */
function buildSenatVodMasterM3u8FromNvs(xml, host = SENAT_VOD_HOST) {
    if (!xml) {
        return null;
    }
    const fullVodUrl = xml.match(/https?:\/\/[^"'<>]*vodsenat[^"'<>]*\.smil\/(?:playlist|master)\.m3u8/i);
    if (fullVodUrl) {
        return fullVodUrl[0];
    }
    // Both path forms map to the same .smil/playlist.m3u8 URL; .smil wins over .mp4.
    const pathPatterns = [
        /senat\/(\d{4})\/(\d{2})\/([^"'<>\/]+?)\.smil/i,
        /senat\/(\d{4})\/(\d{2})\/([^"'<>\/]+?)\.mp4/i,
    ];
    for (const pattern of pathPatterns) {
        const hit = xml.match(pattern);
        if (hit) {
            const [, year, month, basename] = hit;
            return `${host}/senat/${year}/${month}/${basename}.smil/playlist.m3u8`;
        }
    }
    const anyPlaylist = xml.match(/https?:\/\/[^"'<>]+\.m3u8/i);
    if (anyPlaylist) {
        return anyPlaylist[0];
    }
    return null;
}
|
|
196
|
+
/**
 * Rates how well a video matches an agenda entry.
 *
 * Result = 0.3 × title similarity + 0.7 × time proximity + organ bonus:
 *  - title similarity is `dice` between the agenda title and the video title;
 *  - time proximity decreases linearly from 1 (same instant) to 0 (180
 *    minutes apart or more), and is 0 when either timestamp is missing;
 *  - the bonus is 0.15 when the normalised video title contains the first
 *    word of the normalised organ name.
 *
 * @param agenda - Entry providing `titre` and, optionally, `organe`.
 * @param agendaTs - Agenda start, in epoch seconds.
 * @param videoTitle - Title of the candidate video.
 * @param videoEpoch - Video date, in epoch seconds.
 * @returns The combined score (it can exceed 1 because of the bonus).
 */
function score(agenda, agendaTs, videoTitle, videoEpoch) {
    const titleScore = dice(agenda.titre || "", videoTitle || "");
    const timeScore = agendaTs && videoEpoch
        ? Math.max(0, 1 - ((Math.abs(videoEpoch - agendaTs) / 60) / 180))
        : 0;
    const organMatches = () => {
        const organ = normalize(agenda.organe);
        const title = normalize(videoTitle);
        return Boolean(organ) && title.includes(organ.split(" ")[0]);
    };
    const orgBonus = agenda.organe && videoTitle && organMatches() ? 0.15 : 0;
    return 0.3 * titleScore + 0.7 * timeScore + orgBonus;
}
|
|
212
|
+
/**
 * Lists the search queries to try for an agenda entry, most specific first:
 *  1. keywords + commission
 *  2. keywords only
 *  3. full-text (AND) + commission
 *  4. full-text (AND) only
 *  5. no keywords (video type, plus the day when known)
 *
 * Queries 1 to 4 need keywords, 1 and 3 also need an organ name; query 5 is
 * always present.
 *
 * @param agenda - Entry providing `date`, `titre` and `organe`.
 * @returns The query-parameter objects, in the order they should be tried.
 */
function buildSearchStrategies(agenda) {
    const keywords = simplifyTitleForKeywords(agenda.titre || "");
    const commission = agenda.organe || undefined;
    // Parameters shared by every query; the search is limited to the agenda's day when it is known.
    const common = { search: "true", videotype: "Commission" };
    if (agenda.date) {
        const day = toFRDate(agenda.date);
        Object.assign(common, { period: "custom", begin: day, end: day });
    }
    const strategies = [];
    if (keywords) {
        const fullText = `AND${keywords}`;
        if (commission) {
            strategies.push({ ...common, motscles: keywords, commission });
        }
        strategies.push({ ...common, motscles: keywords });
        if (commission) {
            strategies.push({ ...common, text: fullText, commission });
        }
        strategies.push({ ...common, text: fullText });
    }
    strategies.push({ ...common });
    return strategies;
}
|
|
237
|
+
/**
 * Downloads successive result pages of one search query.
 *
 * Paging stops at `maxPages`, at the first page that cannot be fetched (not
 * kept), or after the first page containing no video link (kept).
 *
 * @param args - Query parameters; a `page` parameter is added per request.
 * @param baseDir - Not used by this function.
 * @param strategyIndex - Not used by this function.
 * @param maxPages - Highest page number requested.
 * @returns The HTML of every page kept, in page order.
 */
async function fetchAllSearchPages(args, baseDir, strategyIndex, maxPages = MAX_PAGES) {
    const videoLink = /href="\/?video\.\d+_[a-z0-9]+\./i;
    const collected = [];
    let pageNumber = 1;
    while (pageNumber <= maxPages) {
        const query = queryString({ ...args, page: String(pageNumber) });
        const html = await fetchText(`${SENAT_VIDEOS_SEARCH_AJAX}?${query}`);
        if (!html) {
            break;
        }
        collected.push(html);
        if (!videoLink.test(html)) {
            break;
        }
        pageNumber += 1;
    }
    return collected;
}
|
|
250
|
+
/**
 * Looks for the video recording of one agenda entry and records the outcome.
 *
 * Steps:
 *  1. run the search strategies in order until one yields candidates;
 *  2. download each candidate's data.nvs, score it, keep the best;
 *  3. always write metadata.json, data.nvs and finalplayer.nvs for the best
 *     candidate in `<dataDir>/videos/<session>/<reunionUid>/`;
 *  4. when the best score reaches MATCH_THRESHOLD and a playlist URL can be
 *     built, set `urlVideo` on the matching item of the day's agenda file.
 *
 * The data.nvs text downloaded for scoring is reused for step 3, so the
 * stored file is exactly the document that was scored and no second request
 * is made for it.
 *
 * @param agenda - Agenda entry; skipped when falsy, when `captationVideo` is
 *   falsy, or when `date` or `startTime` is missing.
 * @param session - Session the entry belongs to (used in paths and metadata).
 * @param {string} dataDir - Root data directory.
 * @returns {Promise<void>}
 */
async function processAgenda(agenda, session, dataDir) {
    if (!agenda)
        return;
    if (!agenda.captationVideo) {
        if (!options["silent"])
            console.log(`[skip] ${agenda.id} captationVideo=false`);
        return;
    }
    if (!agenda.date || !agenda.startTime) {
        if (!options["silent"])
            console.log(`[skip] ${agenda.id} date/hour missing`);
        return;
    }
    STATS.total++;
    const reunionUid = makeReunionUid(agenda);
    const baseDir = path.join(dataDir, VIDEOS_ROOT_FOLDER, String(session), reunionUid);
    await fs.ensureDir(baseDir);
    const agendaTs = toTargetEpoch(agenda.date, agenda.startTime);
    // ==== 1) Multi-strategy searches: stop at the first strategy with candidates ====
    const strategies = buildSearchStrategies(agenda);
    let usedStrategy = -1;
    let candidates = [];
    for (let i = 0; i < strategies.length; i++) {
        const pages = await fetchAllSearchPages(strategies[i], baseDir, i + 1, MAX_PAGES);
        if (pages.length === 0)
            continue;
        const cs = extractCandidatesFromSearchHtml(pages.join("\n<!-- PAGE SPLIT -->\n"));
        if (cs.length) {
            candidates = cs.slice(0, MAX_CANDIDATES);
            usedStrategy = i + 1;
            break;
        }
    }
    if (usedStrategy === -1 || !candidates.length) {
        if (!options["silent"])
            console.log(`[miss] ${agenda.id} no candidates (triedStrategies=${strategies.length})`);
        return;
    }
    // ==== 2) Enrich via data.nvs + scoring; pick best ====
    let best = null;
    for (const c of candidates) {
        const candidateDataUrl = `${SENAT_DATAS_ROOT}/${c.id}_${c.hash}/content/data.nvs`;
        const buf = await fetchBuffer(candidateDataUrl);
        if (!buf)
            continue;
        const nvsText = buf.toString("utf-8");
        const meta = parseDataNvs(nvsText);
        // The title found in the search page takes precedence over the NVS one.
        const vtitle = c.title ?? meta.title;
        const s = score(agenda, agendaTs, vtitle, meta.epoch);
        // Strict ">" keeps the earliest candidate on ties.
        if (!best || s > best.score) {
            best = { id: c.id, hash: c.hash, pageUrl: c.pageUrl, epoch: meta.epoch, vtitle, score: s, nvsText };
        }
    }
    if (!best) {
        if (!options["silent"])
            console.log(`[miss] ${agenda.id} candidats without data.nvs`);
        return;
    }
    const accepted = best.score >= MATCH_THRESHOLD;
    if (accepted)
        STATS.accepted++;
    if (!options["silent"]) {
        console.log(`[pick] ${agenda.id} best id=${best.id} hash=${best.hash} score=${best.score.toFixed(2)} accepted=${accepted} (strategy=${usedStrategy})`);
    }
    // ==== 3) Write metadata + NVS of the best candidate (always) ====
    const bestDt = best.epoch ? epochToParisDateTime(best.epoch) : null;
    const metadata = {
        reunionUid,
        session,
        accepted,
        threshold: MATCH_THRESHOLD,
        strategy: usedStrategy,
        agenda: {
            date: agenda.date,
            startTime: agenda.startTime,
            titre: agenda.titre,
            organe: agenda.organe ?? undefined,
            id: agenda.id,
        },
        best: {
            id: best.id,
            hash: best.hash,
            pageUrl: best.pageUrl,
            epoch: best.epoch ?? null,
            date: bestDt?.date ?? null,
            startTime: bestDt?.startTime ?? null,
            title: best.vtitle ?? null,
            score: best.score,
        },
    };
    await writeIfChanged(path.join(baseDir, "metadata.json"), JSON.stringify(metadata, null, 2));
    const finalUrl = `${SENAT_DATAS_ROOT}/${best.id}_${best.hash}/content/finalplayer.nvs`;
    const dataTxt = best.nvsText;
    const finalTxt = await fetchText(finalUrl);
    if (dataTxt)
        await fsp.writeFile(path.join(baseDir, "data.nvs"), dataTxt, "utf-8");
    if (finalTxt)
        await fsp.writeFile(path.join(baseDir, "finalplayer.nvs"), finalTxt, "utf-8");
    const master = dataTxt ? buildSenatVodMasterM3u8FromNvs(dataTxt) : null;
    // ==== 4) Update agenda file (only if accepted + m3u8) ====
    if (!accepted || !master)
        return;
    const agendaJsonPath = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, String(session), `${formatYYYYMMDD(agenda.date)}.json`);
    if (!(await fs.pathExists(agendaJsonPath))) {
        console.warn(`[warn] agenda file not found for update: ${agendaJsonPath}`);
        return;
    }
    const raw = await fsp.readFile(agendaJsonPath, "utf-8");
    let items;
    try {
        items = JSON.parse(raw);
    }
    catch (e) {
        console.warn(`[warn] invalid JSON in ${agendaJsonPath}:`, e?.message);
        items = null;
    }
    if (!Array.isArray(items)) {
        console.warn(`[warn] expected an array in ${agendaJsonPath}, got ${typeof items}`);
        return;
    }
    const idx = items.findIndex((e) => String(e?.id) === String(agenda.id));
    if (idx === -1) {
        console.warn(`[warn] agenda id ${agenda.id} not found in ${agendaJsonPath}`);
        return;
    }
    // add/update urlVideo on the matching item
    items[idx] = { ...items[idx], urlVideo: master };
    await writeIfChanged(agendaJsonPath, JSON.stringify(items, null, 2));
    if (!options["silent"]) {
        console.log(`[write] ${agenda.id} urlVideo ← ${master}`);
    }
}
|
|
389
|
+
/**
 * Runs `processAgenda` on every agenda entry of every requested session, one
 * at a time.
 *
 * A failure on one entry is logged and does not stop the run.
 *
 * @param dataDir - Root data directory.
 * @param sessions - Sessions to process, in order.
 */
async function processAll(dataDir, sessions) {
    const processSafely = async (agenda, session) => {
        try {
            await processAgenda(agenda, session, dataDir);
        }
        catch (e) {
            console.error(`[error] ${agenda.id}:`, e?.message || e);
        }
    };
    for (const session of sessions) {
        for (const loaded of iterLoadSenatAgendas(dataDir, session, {})) {
            for (const agenda of loaded.item) {
                await processSafely(agenda, session);
            }
        }
    }
}
|
|
403
|
+
/**
 * Script body: checks the CLI arguments, processes every session from
 * `--fromSession` onwards and, unless `--silent` is set, prints the elapsed
 * time and the acceptance ratio.
 *
 * @returns {Promise<void>}
 */
async function main() {
    const dataDir = options["dataDir"];
    assert(dataDir, "Missing argument: data directory");
    const sessions = getSessionsFromStart(options["fromSession"]);
    // console.time and console.timeEnd must be given the very same label,
    // otherwise timeEnd finds no timer and prints no duration.
    const timerLabel = "senat-agendas→videos processing time";
    if (!options["silent"])
        console.time(timerLabel);
    await processAll(dataDir, sessions);
    if (!options["silent"]) {
        console.timeEnd(timerLabel);
        const { total, accepted } = STATS;
        const ratio = total ? (accepted / total * 100).toFixed(1) : "0.0";
        console.log(`[summary] accepted=${accepted} / total=${total} (${ratio}%)`);
    }
}
|
|
418
|
+
// Entry point: exit with status 0 once main() resolves; if it rejects, print the error and exit with status 1.
main()
|
|
419
|
+
.then(() => process.exit(0))
|
|
420
|
+
.catch((err) => { console.error(err); process.exit(1); });
|
package/lib/types/agenda.d.ts
CHANGED
|
@@ -1,11 +1,76 @@
|
|
|
1
1
|
/**
 * Top-level shape of a compte rendu. Every member is optional.
 */
export interface CompteRendu {
    /** Identifier of the compte rendu. */
    uid?: string;
    /** Reference to a seance (TODO confirm which identifier scheme is used). */
    seanceRef?: string;
    /** Reference to a session (TODO confirm which identifier scheme is used). */
    sessionRef?: string;
    /** Descriptive header. */
    metadonnees?: Metadonnees;
    /** Body. */
    contenu?: Contenu;
}
|
|
4
|
-
export interface
|
|
5
|
-
|
|
6
|
-
|
|
8
|
+
/**
 * Descriptive header of a compte rendu. All members are required; all are
 * strings except `typeAssemblee`, `heureGeneration` and `sommaire`.
 */
export interface Metadonnees {
    dateSeance: string;
    dateSeanceJour: string;
    numSeanceJour: string;
    numSeance: string;
    /** One of the two literals "AN" or "SN". */
    typeAssemblee: "AN" | "SN";
    legislature: string;
    session: string;
    nomFichierJo: string;
    validite: string;
    etat: string;
    diffusion: string;
    version: string;
    environnement: string;
    /** Typed as `Date` (NOTE(review): a value read back from JSON would be a string — confirm). */
    heureGeneration: Date;
    sommaire: Sommaire;
}
|
|
8
|
-
export interface
|
|
9
|
-
|
|
10
|
-
|
|
25
|
+
/**
 * Body of a compte rendu: two "quantieme" labels and the list of points.
 */
export interface Contenu {
    quantiemes: {
        journee: string;
        session: string;
    };
    /** The points, as an array. */
    point: Array<Point>;
}
|
|
32
|
+
/**
 * One element of `Contenu.point`.
 */
export interface Point {
    ordre_absolu_seance: string;
    code_grammaire: string;
    roledebat?: string;
    /** Speaker information, when present; holds a single `orateur`. */
    orateurs?: {
        orateur: {
            nom: string;
            id: string;
            qualite: string;
        };
    };
    /** Text of the point, held under the `_` key. */
    texte: {
        _: string;
    };
    code_style?: string;
}
|
|
48
|
+
/**
 * Text node whose members are all optional; `lienAdt` nests one or several
 * nodes of the same shape.
 */
export interface Texte {
    /** Text content. */
    _?: string;
    id_syceron?: string;
    stime?: string;
    sup?: string;
    /** Either a single nested node or an array of them. */
    lienAdt?: Texte | Texte[];
}
|
|
55
|
+
/**
 * Table of contents of a compte rendu. Each `sommaireN` and `para` member
 * accepts either a single value or an array.
 */
export interface Sommaire {
    presidentSeance: Texte;
    sommaire1: SommaireElement | SommaireElement[];
    sommaire3?: SommaireElement | SommaireElement[];
    sommaire2?: SommaireElement | SommaireElement[];
    para?: Texte | Texte[];
}
|
|
62
|
+
/**
 * One entry of a `Sommaire`; it can itself carry `sommaire2` / `sommaire3`
 * entries of the same shape.
 */
export interface SommaireElement {
    /** Required member whose value may be `undefined`. */
    valeur_pts_odj: string | undefined;
    titreStruct: TitreStruct;
    /** A single `Texte`, or an array mixing `Texte` nodes and plain strings. */
    para?: Texte | Array<Texte | string>;
    sommaire2?: SommaireElement | SommaireElement[];
    sommaire3?: SommaireElement | SommaireElement[];
    presidentSeance?: Texte | Texte[];
    type_debat?: string;
}
|
|
71
|
+
/**
 * Structured title of a `SommaireElement`; only `id_syceron` is required.
 */
export interface TitreStruct {
    id_syceron: string;
    intitule?: string;
    sousIntitule?: string;
    type_debat?: string;
}
|
|
File without changes
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// import { validateNonEmptyTrimmedString } from "@biryani/core"
|
|
3
|
+
// const acteurUidRegExp = /^PA\d+$/
|
|
4
|
+
// const organeUidRegExp = /^PO\d+$/
|
|
5
|
+
// export function validateSenateurUid(input: any): [any, any] {
|
|
6
|
+
// const [value, error] = validateNonEmptyTrimmedString(input)
|
|
7
|
+
// if (error !== null) {
|
|
8
|
+
// return [value, error]
|
|
9
|
+
// }
|
|
10
|
+
// if (!acteurUidRegExp.test(value)) {
|
|
11
|
+
// return [value, 'Invalid "acteur" unique ID']
|
|
12
|
+
// }
|
|
13
|
+
// return [value, null]
|
|
14
|
+
// }
|
|
15
|
+
// export function validateOrganeUid(input: any): [any, any] {
|
|
16
|
+
// const [value, error] = validateNonEmptyTrimmedString(input)
|
|
17
|
+
// if (error !== null) {
|
|
18
|
+
// return [value, error]
|
|
19
|
+
// }
|
|
20
|
+
// if (!organeUidRegExp.test(value)) {
|
|
21
|
+
// return [value, 'Invalid "organe" unique ID']
|
|
22
|
+
// }
|
|
23
|
+
// return [value, null]
|
|
24
|
+
// }
|