@tricoteuses/senat 2.16.5 → 2.16.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/loaders.d.ts +2 -0
- package/lib/model/agenda.d.ts +4 -0
- package/lib/model/agenda.js +27 -8
- package/lib/model/documents.js +4 -8
- package/lib/scripts/convert_data.js +2 -0
- package/lib/scripts/retrieve_documents.js +43 -10
- package/lib/scripts/retrieve_videos.d.ts +1 -5
- package/lib/scripts/retrieve_videos.js +124 -111
- package/lib/scripts/shared/cli_helpers.d.ts +5 -0
- package/lib/scripts/shared/cli_helpers.js +6 -7
- package/package.json +1 -1
- package/lib/model/compte_rendu.d.ts +0 -9
- package/lib/model/compte_rendu.js +0 -325
- package/lib/raw_types/db.d.ts +0 -11389
- package/lib/raw_types/db.js +0 -5
- package/lib/scripts/retrieve_comptes_rendus.d.ts +0 -6
- package/lib/scripts/retrieve_comptes_rendus.js +0 -274
package/lib/loaders.d.ts
CHANGED
|
@@ -29,6 +29,7 @@ export type IterItem<T> = {
|
|
|
29
29
|
export interface TexteMetadata {
|
|
30
30
|
name: string;
|
|
31
31
|
session: number | null | undefined;
|
|
32
|
+
date?: string | null;
|
|
32
33
|
url_expose_des_motifs?: URL;
|
|
33
34
|
url_xml: URL;
|
|
34
35
|
url_html: URL;
|
|
@@ -37,6 +38,7 @@ export interface TexteMetadata {
|
|
|
37
38
|
export interface RapportMetadata {
|
|
38
39
|
name: string;
|
|
39
40
|
session: number | null | undefined;
|
|
41
|
+
date?: string | null;
|
|
40
42
|
url_html: URL;
|
|
41
43
|
url_pdf: URL;
|
|
42
44
|
}
|
package/lib/model/agenda.d.ts
CHANGED
|
@@ -1,2 +1,6 @@
|
|
|
1
1
|
import { AgendaEvent } from "../types/agenda";
|
|
2
|
+
export declare function getStartAndEndTimes(timeStr: string | null | undefined, dateISO: string): {
|
|
3
|
+
startTime: string | null;
|
|
4
|
+
endTime: string | null;
|
|
5
|
+
};
|
|
2
6
|
export declare function parseAgendaFromFile(htmlFilePath: string): Promise<AgendaEvent[] | null>;
|
package/lib/model/agenda.js
CHANGED
|
@@ -64,18 +64,37 @@ function normalizeTime(timeStr) {
|
|
|
64
64
|
?.replace(/\s\(hors hémicycle\)/i, "")
|
|
65
65
|
?.replace(/\s*h\s*/gi, "h");
|
|
66
66
|
}
|
|
67
|
-
function getStartAndEndTimes(timeStr) {
|
|
67
|
+
export function getStartAndEndTimes(timeStr, dateISO) {
|
|
68
68
|
const normalizedTime = normalizeTime(timeStr);
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
69
|
+
if (!normalizedTime) {
|
|
70
|
+
return { startTime: null, endTime: null };
|
|
71
|
+
}
|
|
72
|
+
const rangeMatch = normalizedTime.match(/^de (?<start>\d{1,2}h\d{2}) à (?<end>\d{1,2}h\d{2})$/i);
|
|
73
|
+
const toUtcIso = (value) => {
|
|
74
|
+
if (!value)
|
|
75
|
+
return null;
|
|
76
|
+
const time = DateTime.fromFormat(value, "H'h'mm", { zone: FR_TZ });
|
|
77
|
+
if (!time.isValid)
|
|
78
|
+
return null;
|
|
79
|
+
const local = DateTime.fromISO(dateISO, { zone: FR_TZ }).set({
|
|
80
|
+
hour: time.hour,
|
|
81
|
+
minute: time.minute,
|
|
82
|
+
second: 0,
|
|
83
|
+
millisecond: 0,
|
|
84
|
+
});
|
|
85
|
+
if (!local.isValid)
|
|
86
|
+
return null;
|
|
87
|
+
return local.toUTC().toISO();
|
|
88
|
+
};
|
|
89
|
+
if (rangeMatch?.groups) {
|
|
90
|
+
const { start, end } = rangeMatch.groups;
|
|
72
91
|
return {
|
|
73
|
-
startTime:
|
|
74
|
-
endTime:
|
|
92
|
+
startTime: toUtcIso(start),
|
|
93
|
+
endTime: toUtcIso(end),
|
|
75
94
|
};
|
|
76
95
|
}
|
|
77
96
|
return {
|
|
78
|
-
startTime:
|
|
97
|
+
startTime: toUtcIso(normalizedTime),
|
|
79
98
|
endTime: null,
|
|
80
99
|
};
|
|
81
100
|
}
|
|
@@ -91,7 +110,7 @@ function transformAgenda(document, fileName) {
|
|
|
91
110
|
const type = getEventType(eventElement.classList);
|
|
92
111
|
const date = DateTime.fromFormat(fileName, ID_DATE_FORMAT).toFormat(STANDARD_DATE_FORMAT);
|
|
93
112
|
const timeOriginal = eventElement.querySelector(".time")?.textContent || null;
|
|
94
|
-
const { startTime, endTime } = getStartAndEndTimes(timeOriginal);
|
|
113
|
+
const { startTime, endTime } = getStartAndEndTimes(timeOriginal, date);
|
|
95
114
|
const titre = eventElement.querySelector(".titre")?.textContent?.trim() || "";
|
|
96
115
|
const organe = eventElement.querySelector(".organe")?.textContent?.trim() || null;
|
|
97
116
|
const objet = eventElement.querySelector(".objet")?.textContent?.trim()?.replace(/^- /, "") || null;
|
package/lib/model/documents.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { dbSenat } from "../databases";
|
|
2
|
-
import { rtrim } from "./util";
|
|
2
|
+
import { rtrim, toDateString } from "./util";
|
|
3
3
|
export function findSenatTexteUrls(sessions = []) {
|
|
4
4
|
return dbSenat
|
|
5
5
|
.withSchema("dosleg")
|
|
@@ -10,13 +10,8 @@ export function findSenatTexteUrls(sessions = []) {
|
|
|
10
10
|
.select(({ eb, ref }) => [
|
|
11
11
|
"sesann as session",
|
|
12
12
|
rtrim(ref("texurl")).as("url"),
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
.when("oritxtcod", "=", "1")
|
|
16
|
-
.then(true)
|
|
17
|
-
.else(false)
|
|
18
|
-
.end()
|
|
19
|
-
.as("hasExposeDesMotifs"),
|
|
13
|
+
toDateString(ref("txtoritxtdat")).as("date"),
|
|
14
|
+
eb.case().when("oritxtcod", "=", "1").then(true).else(false).end().as("hasExposeDesMotifs"),
|
|
20
15
|
])
|
|
21
16
|
.$narrowType()
|
|
22
17
|
.stream();
|
|
@@ -31,6 +26,7 @@ export function findSenatRapportUrls(sessions = []) {
|
|
|
31
26
|
.select(({ ref }) => [
|
|
32
27
|
"sesann as session",
|
|
33
28
|
rtrim(ref("rapurl")).as("url"),
|
|
29
|
+
toDateString(ref("date_depot")).as("date"),
|
|
34
30
|
])
|
|
35
31
|
.$narrowType()
|
|
36
32
|
.stream();
|
|
package/lib/scripts/convert_data.js
CHANGED
|
@@ -178,6 +178,7 @@ async function convertTexteUrls(dataDir) {
|
|
|
178
178
|
const metadata = {
|
|
179
179
|
name: texteName,
|
|
180
180
|
session: texte.session,
|
|
181
|
+
date: texte.date,
|
|
181
182
|
url_expose_des_motifs: texte.hasExposeDesMotifs
|
|
182
183
|
? new URL(`${texteName}-expose.html`, SENAT_EXPOSE_DES_MOTIFS_BASE_URL)
|
|
183
184
|
: undefined,
|
|
@@ -212,6 +213,7 @@ async function convertRapportUrls(dataDir) {
|
|
|
212
213
|
const metadata = {
|
|
213
214
|
name: rapportName,
|
|
214
215
|
session: rapport.session,
|
|
216
|
+
date: rapport.date,
|
|
215
217
|
url_html: new URL(rapportHtmlUrl, SENAT_RAPPORT_BASE_URL),
|
|
216
218
|
url_pdf: new URL(rapportPdfUrl, SENAT_RAPPORT_BASE_URL),
|
|
217
219
|
};
|
|
package/lib/scripts/retrieve_documents.js
CHANGED
|
@@ -1,14 +1,16 @@
|
|
|
1
1
|
import assert from "assert";
|
|
2
2
|
import commandLineArgs from "command-line-args";
|
|
3
3
|
import fs from "fs-extra";
|
|
4
|
+
import { DateTime } from "luxon";
|
|
4
5
|
import path from "path";
|
|
5
6
|
import { DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER, iterLoadSenatDossiersLegislatifsRapportUrls, iterLoadSenatDossiersLegislatifsTexteUrls, RAPPORT_FOLDER, TEXTE_FOLDER, } from "../loaders";
|
|
6
7
|
import { parseExposeDesMotifs, parseTexte, parseTexteFromFile } from "../model/texte";
|
|
7
8
|
import { getSessionsFromStart, UNDEFINED_SESSION } from "../types/sessions";
|
|
8
|
-
import { commonOptions } from "./shared/cli_helpers";
|
|
9
|
+
import { commonOptions, onlyRecentOption } from "./shared/cli_helpers";
|
|
9
10
|
import { ensureAndClearDir, fetchWithRetry, isOptionEmptyOrHasValue } from "./shared/util";
|
|
10
11
|
const optionsDefinitions = [
|
|
11
12
|
...commonOptions,
|
|
13
|
+
onlyRecentOption,
|
|
12
14
|
{
|
|
13
15
|
help: "parse and convert documents into JSON (textes only for now, requires format xml)",
|
|
14
16
|
name: "parseDocuments",
|
|
@@ -35,6 +37,18 @@ const optionsDefinitions = [
|
|
|
35
37
|
];
|
|
36
38
|
const options = commandLineArgs(optionsDefinitions);
|
|
37
39
|
const textDecoder = new TextDecoder("utf8");
|
|
40
|
+
const today = DateTime.now();
|
|
41
|
+
function isDocumentRecent(documentDate, daysThreshold) {
|
|
42
|
+
if (!documentDate) {
|
|
43
|
+
return false;
|
|
44
|
+
}
|
|
45
|
+
const docDate = DateTime.fromISO(documentDate);
|
|
46
|
+
if (!docDate.isValid) {
|
|
47
|
+
return false;
|
|
48
|
+
}
|
|
49
|
+
const daysDiff = today.diff(docDate, "days").days;
|
|
50
|
+
return daysDiff <= daysThreshold;
|
|
51
|
+
}
|
|
38
52
|
async function retrieveTextes(dataDir, sessions) {
|
|
39
53
|
const textesDir = path.join(dataDir, TEXTE_FOLDER);
|
|
40
54
|
fs.ensureDirSync(textesDir);
|
|
@@ -47,7 +61,7 @@ async function retrieveTextes(dataDir, sessions) {
|
|
|
47
61
|
const texteUrlsNotFoundOrError = [];
|
|
48
62
|
const texteUrlsParseError = [];
|
|
49
63
|
for (const session of sessions) {
|
|
50
|
-
for (const { item: texteMetadata
|
|
64
|
+
for (const { item: texteMetadata } of iterLoadSenatDossiersLegislatifsTexteUrls(dataDir, session)) {
|
|
51
65
|
const texteDir = path.join(originalTextesDir, `${texteMetadata.session ?? UNDEFINED_SESSION}`, texteMetadata.name);
|
|
52
66
|
fs.ensureDirSync(texteDir);
|
|
53
67
|
let exposeDesMotifsContent = null;
|
|
@@ -57,7 +71,11 @@ async function retrieveTextes(dataDir, sessions) {
|
|
|
57
71
|
if (isOptionEmptyOrHasValue(options["formats"], "xml")) {
|
|
58
72
|
const textePath = path.join(texteDir, `${texteMetadata.name}.xml`);
|
|
59
73
|
let texteBuffer = null;
|
|
60
|
-
if
|
|
74
|
+
// Check if document should be skipped based on onlyRecent option
|
|
75
|
+
const shouldSkip = !options["force"] &&
|
|
76
|
+
fs.existsSync(textePath) &&
|
|
77
|
+
(options["only-recent"] === undefined || !isDocumentRecent(texteMetadata.date, options["only-recent"]));
|
|
78
|
+
if (shouldSkip) {
|
|
61
79
|
if (!options["silent"]) {
|
|
62
80
|
console.info(`Already downloaded texte ${textePath}…`);
|
|
63
81
|
}
|
|
@@ -80,7 +98,11 @@ async function retrieveTextes(dataDir, sessions) {
|
|
|
80
98
|
}
|
|
81
99
|
if (isOptionEmptyOrHasValue(options["formats"], "html")) {
|
|
82
100
|
const textePath = path.join(texteDir, `${texteMetadata.name}.html`);
|
|
83
|
-
if
|
|
101
|
+
// Check if document should be skipped based on onlyRecent option
|
|
102
|
+
const shouldSkip = !options["force"] &&
|
|
103
|
+
fs.existsSync(textePath) &&
|
|
104
|
+
(options["only-recent"] === undefined || !isDocumentRecent(texteMetadata.date, options["only-recent"]));
|
|
105
|
+
if (shouldSkip) {
|
|
84
106
|
if (!options["silent"]) {
|
|
85
107
|
console.info(`Already downloaded texte ${textePath}…`);
|
|
86
108
|
}
|
|
@@ -97,7 +119,11 @@ async function retrieveTextes(dataDir, sessions) {
|
|
|
97
119
|
}
|
|
98
120
|
if (isOptionEmptyOrHasValue(options["formats"], "pdf")) {
|
|
99
121
|
const textePath = path.join(texteDir, `${texteMetadata.name}.pdf`);
|
|
100
|
-
if
|
|
122
|
+
// Check if document should be skipped based on onlyRecent option
|
|
123
|
+
const shouldSkip = !options["force"] &&
|
|
124
|
+
fs.existsSync(textePath) &&
|
|
125
|
+
(options["only-recent"] === undefined || !isDocumentRecent(texteMetadata.date, options["only-recent"]));
|
|
126
|
+
if (shouldSkip) {
|
|
101
127
|
if (!options["silent"]) {
|
|
102
128
|
console.info(`Already downloaded texte ${textePath}…`);
|
|
103
129
|
}
|
|
@@ -129,12 +155,16 @@ async function retrieveRapports(dataDir, sessions) {
|
|
|
129
155
|
let retrievedRapportsCount = 0;
|
|
130
156
|
const rapportUrlsNotFoundOrError = [];
|
|
131
157
|
for (const session of sessions) {
|
|
132
|
-
for (const { item: rapportMetadata
|
|
158
|
+
for (const { item: rapportMetadata } of iterLoadSenatDossiersLegislatifsRapportUrls(dataDir, session)) {
|
|
133
159
|
const rapportDir = path.join(originalRapportsDir, `${rapportMetadata.session ?? UNDEFINED_SESSION}`, rapportMetadata.name);
|
|
134
160
|
fs.ensureDirSync(rapportDir);
|
|
135
161
|
if (isOptionEmptyOrHasValue(options["formats"], "html")) {
|
|
136
162
|
const rapportPath = path.join(rapportDir, `${rapportMetadata.name}.html`);
|
|
137
|
-
if
|
|
163
|
+
// Check if document should be skipped based on onlyRecent option
|
|
164
|
+
const shouldSkip = !options["force"] &&
|
|
165
|
+
fs.existsSync(rapportPath) &&
|
|
166
|
+
(options["only-recent"] === undefined || !isDocumentRecent(rapportMetadata.date, options["only-recent"]));
|
|
167
|
+
if (shouldSkip) {
|
|
138
168
|
if (!options["silent"]) {
|
|
139
169
|
console.info(`Already downloaded rapport ${rapportPath}…`);
|
|
140
170
|
}
|
|
@@ -150,7 +180,11 @@ async function retrieveRapports(dataDir, sessions) {
|
|
|
150
180
|
}
|
|
151
181
|
if (isOptionEmptyOrHasValue(options["formats"], "pdf")) {
|
|
152
182
|
const rapportPath = path.join(rapportDir, `${rapportMetadata.name}.pdf`);
|
|
153
|
-
if
|
|
183
|
+
// Check if document should be skipped based on onlyRecent option
|
|
184
|
+
const shouldSkip = !options["force"] &&
|
|
185
|
+
fs.existsSync(rapportPath) &&
|
|
186
|
+
(options["only-recent"] === undefined || !isDocumentRecent(rapportMetadata.date, options["only-recent"]));
|
|
187
|
+
if (shouldSkip) {
|
|
154
188
|
if (!options["silent"]) {
|
|
155
189
|
console.info(`Already downloaded rapport ${rapportPath}…`);
|
|
156
190
|
}
|
|
@@ -222,8 +256,7 @@ async function parseDocument(session, transformedTextesDir, textePath, texteName
|
|
|
222
256
|
console.log("Parsing exposé des motifs…");
|
|
223
257
|
}
|
|
224
258
|
const exposeDesMotifsHtml = textDecoder.decode(exposeDesMotifs);
|
|
225
|
-
parsedTexte.exposeDesMotifs =
|
|
226
|
-
parseExposeDesMotifs(exposeDesMotifsHtml);
|
|
259
|
+
parsedTexte.exposeDesMotifs = parseExposeDesMotifs(exposeDesMotifsHtml);
|
|
227
260
|
}
|
|
228
261
|
const transformedTexteDir = path.join(transformedTextesDir, `${session ?? UNDEFINED_SESSION}`, texteName);
|
|
229
262
|
fs.ensureDirSync(transformedTexteDir);
|
|
package/lib/scripts/retrieve_videos.js
CHANGED
|
@@ -8,10 +8,10 @@ import { AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, iterLoadSenatAgendas } from "..
|
|
|
8
8
|
import { getSessionsFromStart } from "../types/sessions";
|
|
9
9
|
import { commonOptions } from "./shared/cli_helpers";
|
|
10
10
|
import { decodeHtmlEntities } from "../model/util";
|
|
11
|
+
import { DateTime } from "luxon";
|
|
11
12
|
// ===================== Constants =====================
|
|
12
|
-
const MATCH_THRESHOLD = 0.
|
|
13
|
+
const MATCH_THRESHOLD = 0.6;
|
|
13
14
|
const MAX_CANDIDATES = 15;
|
|
14
|
-
const MAX_PAGES = 3;
|
|
15
15
|
const STATS = { total: 0, accepted: 0 };
|
|
16
16
|
const VIDEOS_ROOT_FOLDER = "videos";
|
|
17
17
|
const SENAT_VIDEOS_SEARCH_AJAX = "https://videos.senat.fr/senat_videos_search.php";
|
|
@@ -42,11 +42,6 @@ function dice(a, b) {
|
|
|
42
42
|
inter++;
|
|
43
43
|
return (2 * inter) / (A.size + B.size);
|
|
44
44
|
}
|
|
45
|
-
// Heuristic for Europe/Paris DST: +02:00 ≈ April→October, +01:00 otherwise.
|
|
46
|
-
function parisOffsetForDate(dateYYYYMMDD) {
|
|
47
|
-
const m = Number(dateYYYYMMDD.split("-")[1] || "1");
|
|
48
|
-
return m >= 4 && m <= 10 ? "+02:00" : "+01:00";
|
|
49
|
-
}
|
|
50
45
|
function epochToParisDateTime(epochSec) {
|
|
51
46
|
if (!Number.isFinite(epochSec))
|
|
52
47
|
return null;
|
|
@@ -70,30 +65,13 @@ function epochToParisDateTime(epochSec) {
|
|
|
70
65
|
startTime: `${hh}:${mi}:${ss}.${ms}${offsetStr}`,
|
|
71
66
|
};
|
|
72
67
|
}
|
|
73
|
-
function toTargetEpoch(
|
|
74
|
-
if (!
|
|
68
|
+
function toTargetEpoch(time) {
|
|
69
|
+
if (!time)
|
|
75
70
|
return null;
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
if (hasTz) {
|
|
81
|
-
// Exemple: 2022-10-04T18:00:00.000+02:00
|
|
82
|
-
iso = `${date}T${t}`;
|
|
83
|
-
}
|
|
84
|
-
else {
|
|
85
|
-
// Normalise pour avoir au moins HH:mm:ss
|
|
86
|
-
if (/^\d{1,2}$/.test(t)) {
|
|
87
|
-
t = `${t.padStart(2, "0")}:00:00`;
|
|
88
|
-
}
|
|
89
|
-
else if (/^\d{1,2}:\d{2}$/.test(t)) {
|
|
90
|
-
t = `${t}:00`;
|
|
91
|
-
} // sinon, on garde tel quel (gère HH:mm:ss et HH:mm:ss.SSS)
|
|
92
|
-
// Ajoute l’offset Paris (heuristique saisonnière)
|
|
93
|
-
iso = `${date}T${t}${parisOffsetForDate(date)}`;
|
|
94
|
-
}
|
|
95
|
-
const ms = Date.parse(iso);
|
|
96
|
-
return Number.isNaN(ms) ? null : Math.floor(ms / 1000);
|
|
71
|
+
const dtLocal = DateTime.fromISO(time, { zone: "Europe/Paris" });
|
|
72
|
+
if (!dtLocal.isValid)
|
|
73
|
+
return null;
|
|
74
|
+
return Math.floor(dtLocal.toUTC().toSeconds());
|
|
97
75
|
}
|
|
98
76
|
async function fetchText(url) {
|
|
99
77
|
const res = await fetch(url);
|
|
@@ -122,16 +100,6 @@ function queryString(obj) {
|
|
|
122
100
|
.map(([k, v]) => `${encodeURIComponent(k)}=${encodeURIComponent(v)}`)
|
|
123
101
|
.join("&");
|
|
124
102
|
}
|
|
125
|
-
function simplifyTitleForKeywords(input) {
|
|
126
|
-
return (input || "")
|
|
127
|
-
.replace(/\baudition\s+de\b/gi, " ")
|
|
128
|
-
.replace(/\breunion\b/gi, " ")
|
|
129
|
-
.replace(/\bsur\b/gi, " ")
|
|
130
|
-
.replace(/\b(la|le|les|des|de|du|d’|d')\b/gi, " ")
|
|
131
|
-
.replace(/[–—-]/g, " ")
|
|
132
|
-
.replace(/\s+/g, " ")
|
|
133
|
-
.trim();
|
|
134
|
-
}
|
|
135
103
|
function toFRDate(dateYYYYMMDD) {
|
|
136
104
|
const [y, m, d] = dateYYYYMMDD.split("-");
|
|
137
105
|
return `${d}/${m}/${y}`; // DD/MM/YYYY
|
|
@@ -158,17 +126,24 @@ function extractCandidatesFromSearchHtml(html) {
|
|
|
158
126
|
return true;
|
|
159
127
|
});
|
|
160
128
|
}
|
|
129
|
+
function parseFinalNvs(nvs) {
|
|
130
|
+
const playerTag = nvs.match(/<player\b[^>]*>/i)?.[0];
|
|
131
|
+
if (!playerTag)
|
|
132
|
+
return {};
|
|
133
|
+
const sessionStartStr = playerTag.match(/\bsessionstart="(\d+)"/i)?.[1];
|
|
134
|
+
return {
|
|
135
|
+
sessionStart: sessionStartStr ? Number(sessionStartStr) : undefined,
|
|
136
|
+
};
|
|
137
|
+
}
|
|
161
138
|
function parseDataNvs(nvs) {
|
|
162
139
|
const epochStr = nvs.match(/<metadata\s+name="date"\s+value="(\d+)"/i)?.[1];
|
|
163
140
|
const epoch = epochStr ? Number(epochStr) : undefined;
|
|
164
141
|
const organesTag = nvs.match(/<metadata\b[^>]*\bname="organes"[^>]*>/i)?.[0];
|
|
165
142
|
let organeLabel;
|
|
166
|
-
let organeValue;
|
|
167
143
|
if (organesTag) {
|
|
168
144
|
organeLabel = organesTag.match(/\blabel="([^"]+)"/i)?.[1];
|
|
169
|
-
organeValue = organesTag.match(/\bvalue="([^"]+)"/i)?.[1];
|
|
170
145
|
}
|
|
171
|
-
const organeRaw = organeLabel ??
|
|
146
|
+
const organeRaw = organeLabel ?? "Séance publique";
|
|
172
147
|
const organe = decodeHtmlEntities(organeRaw)?.trim();
|
|
173
148
|
const firstChapterLabel = decodeHtmlEntities(nvs.match(/<chapter\b[^>]*\blabel="([^"]+)"/i)[1]).trim();
|
|
174
149
|
return { epoch, organe, firstChapterLabel };
|
|
@@ -207,50 +182,27 @@ function buildSenatVodMasterM3u8FromNvs(nvsText, finalText) {
|
|
|
207
182
|
}
|
|
208
183
|
return `${base}.smil/master.m3u8`;
|
|
209
184
|
}
|
|
210
|
-
function score(agenda, agendaTs, videoTitle, videoEpoch, videoOrgane) {
|
|
211
|
-
const
|
|
185
|
+
function score(agenda, agendaTs, sameOrg, videoTitle, videoEpoch, videoOrgane) {
|
|
186
|
+
const objetS = dice(agenda.objet || "", videoTitle || "");
|
|
187
|
+
const titleS = dice(agenda.titre || "", videoTitle || "");
|
|
188
|
+
const titleScore = Math.max(objetS, titleS);
|
|
212
189
|
let timeScore = 0;
|
|
213
190
|
if (agendaTs && videoEpoch) {
|
|
214
191
|
// second
|
|
215
192
|
const deltaMin = Math.abs(videoEpoch - agendaTs) / 60;
|
|
216
|
-
// delta :
|
|
217
|
-
timeScore = Math.
|
|
193
|
+
// delta : 60min
|
|
194
|
+
timeScore = Math.exp(-deltaMin / 60);
|
|
218
195
|
}
|
|
219
196
|
const orgScore = videoOrgane && agenda.organe ? dice(agenda.organe, videoOrgane) : 0;
|
|
220
|
-
|
|
221
|
-
return 0.3 * titleScore + 0.7 * timeScore;
|
|
222
|
-
}
|
|
223
|
-
return 0.2 * titleScore + 0.4 * timeScore + orgScore * 0.4;
|
|
197
|
+
return 0.2 * titleScore + 0.4 * timeScore + (sameOrg ? 0.4 : orgScore * 0.4);
|
|
224
198
|
}
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
const kw = simplifyTitleForKeywords(agenda.titre || "");
|
|
231
|
-
const commission = agenda.organe || undefined;
|
|
232
|
-
// common base
|
|
233
|
-
const base = { search: "true", videotype: "Commission" };
|
|
234
|
-
if (fr)
|
|
235
|
-
Object.assign(base, { period: "custom", begin: fr, end: fr });
|
|
236
|
-
const strategies = [];
|
|
237
|
-
// 1) keywords + commission
|
|
238
|
-
if (kw && commission)
|
|
239
|
-
strategies.push({ ...base, motscles: kw, commission });
|
|
240
|
-
// 2) keywords without commission
|
|
241
|
-
if (kw)
|
|
242
|
-
strategies.push({ ...base, motscles: kw });
|
|
243
|
-
// 3) full-text (AND) + commission
|
|
244
|
-
if (kw && commission)
|
|
245
|
-
strategies.push({ ...base, text: `AND${kw}`, commission });
|
|
246
|
-
// 4) full-text (AND) without commission
|
|
247
|
-
if (kw)
|
|
248
|
-
strategies.push({ ...base, text: `AND${kw}` });
|
|
249
|
-
// 5) no keywords (just type + period)
|
|
250
|
-
strategies.push({ ...base });
|
|
251
|
-
return strategies;
|
|
199
|
+
function getAgendaType(agenda) {
|
|
200
|
+
const o = agenda.organe || "";
|
|
201
|
+
if (/séance publique/i.test(o))
|
|
202
|
+
return "Séance publique";
|
|
203
|
+
return "Commission";
|
|
252
204
|
}
|
|
253
|
-
async function fetchAllSearchPages(args,
|
|
205
|
+
async function fetchAllSearchPages(args, maxPages = 3) {
|
|
254
206
|
const pages = [];
|
|
255
207
|
for (let p = 1; p <= maxPages; p++) {
|
|
256
208
|
const url = `${SENAT_VIDEOS_SEARCH_AJAX}?${queryString({ ...args, page: String(p) })}`;
|
|
@@ -263,9 +215,46 @@ async function fetchAllSearchPages(args, baseDir, strategyIndex, maxPages = MAX_
|
|
|
263
215
|
}
|
|
264
216
|
return pages;
|
|
265
217
|
}
|
|
218
|
+
function getOrgKey(norm) {
|
|
219
|
+
if (!norm)
|
|
220
|
+
return "autre";
|
|
221
|
+
if (norm.includes("seance publique"))
|
|
222
|
+
return "seance_publique";
|
|
223
|
+
if (norm.includes("culture"))
|
|
224
|
+
return "culture";
|
|
225
|
+
if (norm.includes("finances"))
|
|
226
|
+
return "finances";
|
|
227
|
+
if (norm.includes("sociales"))
|
|
228
|
+
return "affaires_sociales";
|
|
229
|
+
if (norm.includes("economiques"))
|
|
230
|
+
return "affaires_economiques";
|
|
231
|
+
if (norm.includes("europeennes"))
|
|
232
|
+
return "affaires_europeennes";
|
|
233
|
+
if (norm.includes("etrangeres") || norm.includes("forces armees") || norm.includes("defense")) {
|
|
234
|
+
return "affaires_etrangeres_defense";
|
|
235
|
+
}
|
|
236
|
+
if (norm.includes("territoire") || norm.includes("durable")) {
|
|
237
|
+
return "amenagement_territoire_dd";
|
|
238
|
+
}
|
|
239
|
+
if (norm.includes("commission des lois"))
|
|
240
|
+
return "lois";
|
|
241
|
+
if (norm.includes("delegation aux collectivites territoriales") || norm.includes("delegation a la decentralisation"))
|
|
242
|
+
return "delegation_collectivites";
|
|
243
|
+
if (norm.includes("delegation aux droits des femmes") ||
|
|
244
|
+
norm.includes("egalite des chances entre les hommes et les femmes"))
|
|
245
|
+
return "delegation_droits_femmes";
|
|
246
|
+
if (norm.includes("delegation aux entreprises"))
|
|
247
|
+
return "delegation_entreprises";
|
|
248
|
+
if (norm.includes("delegation senatoriale aux outre mer") || norm.includes("delegation aux outre mer"))
|
|
249
|
+
return "delegation_outre_mer";
|
|
250
|
+
if (norm.includes("delegation a la prospective"))
|
|
251
|
+
return "delegation_prospective";
|
|
252
|
+
if (norm.includes("office parlementaire d evaluation des choix scientifiques et technologiques") ||
|
|
253
|
+
norm.includes("opecst"))
|
|
254
|
+
return "opecst";
|
|
255
|
+
return "autre";
|
|
256
|
+
}
|
|
266
257
|
async function processGroupedReunion(agenda, session, dataDir) {
|
|
267
|
-
if (!agenda)
|
|
268
|
-
return;
|
|
269
258
|
// 1) GuardRails
|
|
270
259
|
if (!agenda.captationVideo) {
|
|
271
260
|
// if (!options["silent"]) console.log(`[skip] ${agenda.uid} captationVideo=false`)
|
|
@@ -275,32 +264,40 @@ async function processGroupedReunion(agenda, session, dataDir) {
|
|
|
275
264
|
// if (!options["silent"]) console.log(`[skip] ${agenda.uid} date/hour missing`)
|
|
276
265
|
return;
|
|
277
266
|
}
|
|
267
|
+
const agendaTs = toTargetEpoch(agenda.startTime);
|
|
268
|
+
const now = Date.now();
|
|
269
|
+
if (agendaTs && agendaTs * 1000 > now) {
|
|
270
|
+
return;
|
|
271
|
+
}
|
|
278
272
|
STATS.total++;
|
|
279
273
|
const reunionUid = agenda.uid;
|
|
280
274
|
const baseDir = path.join(dataDir, VIDEOS_ROOT_FOLDER, String(session), reunionUid);
|
|
281
275
|
await fs.ensureDir(baseDir);
|
|
282
|
-
const
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
276
|
+
const searchParams = {
|
|
277
|
+
search: "true",
|
|
278
|
+
videotype: getAgendaType(agenda),
|
|
279
|
+
};
|
|
280
|
+
if (agenda.date) {
|
|
281
|
+
const fr = toFRDate(agenda.date);
|
|
282
|
+
searchParams.period = "custom";
|
|
283
|
+
searchParams.begin = fr;
|
|
284
|
+
searchParams.end = fr;
|
|
285
|
+
}
|
|
286
|
+
if (agenda.organe) {
|
|
287
|
+
searchParams.organe = agenda.organe;
|
|
288
|
+
}
|
|
289
|
+
const pages = await fetchAllSearchPages(searchParams);
|
|
290
|
+
if (!pages.length) {
|
|
291
|
+
if (!options["silent"]) {
|
|
292
|
+
console.log(`[miss] ${agenda.uid} no candidates (videotype=${searchParams.videotype}, organe=${searchParams.organe || "-"}, date=${searchParams.begin || "-"})`);
|
|
299
293
|
}
|
|
294
|
+
return;
|
|
300
295
|
}
|
|
301
|
-
|
|
296
|
+
const combinedHtml = pages.join("\n<!-- PAGE SPLIT -->\n");
|
|
297
|
+
const candidates = extractCandidatesFromSearchHtml(combinedHtml).slice(0, MAX_CANDIDATES);
|
|
298
|
+
if (!candidates.length) {
|
|
302
299
|
if (!options["silent"]) {
|
|
303
|
-
console.log(`[miss] ${agenda.uid} no candidates (
|
|
300
|
+
console.log(`[miss] ${agenda.uid} no candidates after parse (videotype=${searchParams.videotype}, organe=${searchParams.organe || "-"}, date=${searchParams.begin || "-"})`);
|
|
304
301
|
}
|
|
305
302
|
return;
|
|
306
303
|
}
|
|
@@ -308,15 +305,32 @@ async function processGroupedReunion(agenda, session, dataDir) {
|
|
|
308
305
|
let best = null;
|
|
309
306
|
for (const c of candidates) {
|
|
310
307
|
const dataUrl = `${SENAT_DATAS_ROOT}/${c.id}_${c.hash}/content/data.nvs`;
|
|
311
|
-
const
|
|
312
|
-
|
|
308
|
+
const finalUrl = `${SENAT_DATAS_ROOT}/${c.id}_${c.hash}/content/finalplayer.nvs`;
|
|
309
|
+
const dataBuf = await fetchBuffer(dataUrl);
|
|
310
|
+
if (!dataBuf)
|
|
313
311
|
continue;
|
|
314
|
-
const meta = parseDataNvs(
|
|
315
|
-
|
|
312
|
+
const meta = parseDataNvs(dataBuf.toString("utf-8"));
|
|
313
|
+
let sessionStart;
|
|
314
|
+
const finalBuf = await fetchBuffer(finalUrl);
|
|
315
|
+
if (finalBuf) {
|
|
316
|
+
const finalMeta = parseFinalNvs(finalBuf.toString("utf-8"));
|
|
317
|
+
sessionStart = finalMeta.sessionStart;
|
|
318
|
+
}
|
|
319
|
+
const videoEpoch = sessionStart ?? meta.epoch;
|
|
320
|
+
let sameOrg = false;
|
|
321
|
+
// If organes are too different, go to next candidates
|
|
316
322
|
if (meta.organe && agenda.organe) {
|
|
317
323
|
const videoOrgNorm = normalize(meta.organe);
|
|
318
324
|
const agendaOrgNorm = normalize(agenda.organe);
|
|
319
|
-
|
|
325
|
+
const videoKey = getOrgKey(videoOrgNorm);
|
|
326
|
+
const agendaKey = getOrgKey(agendaOrgNorm);
|
|
327
|
+
const d = dice(agendaOrgNorm, videoOrgNorm);
|
|
328
|
+
if (videoKey === agendaKey && videoKey !== "autre") {
|
|
329
|
+
// same org we keep it
|
|
330
|
+
sameOrg = true;
|
|
331
|
+
}
|
|
332
|
+
else if (d < 0.7) {
|
|
333
|
+
// if diff org and dice too low we skip
|
|
320
334
|
continue;
|
|
321
335
|
}
|
|
322
336
|
}
|
|
@@ -324,7 +338,7 @@ async function processGroupedReunion(agenda, session, dataDir) {
|
|
|
324
338
|
if (c.isSeancePublique && meta.firstChapterLabel) {
|
|
325
339
|
videoTitle = meta.firstChapterLabel;
|
|
326
340
|
}
|
|
327
|
-
const s = score(agenda, agendaTs, videoTitle,
|
|
341
|
+
const s = score(agenda, agendaTs, sameOrg, videoTitle, videoEpoch, meta.organe);
|
|
328
342
|
if (!best || s > best.score) {
|
|
329
343
|
best = {
|
|
330
344
|
id: c.id,
|
|
@@ -339,17 +353,17 @@ async function processGroupedReunion(agenda, session, dataDir) {
|
|
|
339
353
|
}
|
|
340
354
|
if (!best) {
|
|
341
355
|
if (!options["silent"])
|
|
342
|
-
console.log(`[miss] ${agenda.uid}
|
|
356
|
+
console.log(`[miss] ${agenda.uid} No candidate found for this reunion`);
|
|
343
357
|
return;
|
|
344
358
|
}
|
|
345
359
|
const accepted = best.score >= MATCH_THRESHOLD;
|
|
346
360
|
if (accepted)
|
|
347
361
|
STATS.accepted++;
|
|
348
362
|
if (!options["silent"]) {
|
|
349
|
-
console.log(`[pick] ${agenda.uid} score=${best.score.toFixed(2)}
|
|
350
|
-
agenda title="${agenda.titre ?? ""}" agenda organe="${agenda.organe ?? ""}"
|
|
363
|
+
console.log(`[pick] ${agenda.uid} score=${best.score.toFixed(2)}
|
|
364
|
+
agenda title="${agenda.titre ?? ""}" agenda organe="${agenda.organe ?? ""}" agenda heure=${agenda.startTime}
|
|
351
365
|
best title="${best.vtitle ?? ""}" best organe="${best.vorgane ?? ""}"
|
|
352
|
-
accepted=${accepted}
|
|
366
|
+
accepted=${accepted}`);
|
|
353
367
|
}
|
|
354
368
|
// ==== 3) Write metadata + NVS of the best candidate (always) ====
|
|
355
369
|
const bestDt = best?.epoch ? epochToParisDateTime(best.epoch) : null;
|
|
@@ -358,7 +372,6 @@ async function processGroupedReunion(agenda, session, dataDir) {
|
|
|
358
372
|
session,
|
|
359
373
|
accepted,
|
|
360
374
|
threshold: MATCH_THRESHOLD,
|
|
361
|
-
strategy: usedStrategy,
|
|
362
375
|
agenda: {
|
|
363
376
|
date: agenda.date,
|
|
364
377
|
startTime: agenda.startTime,
|
|
package/lib/scripts/shared/cli_helpers.d.ts
CHANGED
|
@@ -30,6 +30,11 @@ export declare const verboseOption: {
|
|
|
30
30
|
name: string;
|
|
31
31
|
type: BooleanConstructor;
|
|
32
32
|
};
|
|
33
|
+
export declare const onlyRecentOption: {
|
|
34
|
+
help: string;
|
|
35
|
+
name: string;
|
|
36
|
+
type: NumberConstructor;
|
|
37
|
+
};
|
|
33
38
|
export declare const commonOptions: ({
|
|
34
39
|
alias: string;
|
|
35
40
|
defaultValue: string[];
|
|
package/lib/scripts/shared/cli_helpers.js
CHANGED
|
@@ -30,10 +30,9 @@ export const verboseOption = {
|
|
|
30
30
|
name: "verbose",
|
|
31
31
|
type: Boolean,
|
|
32
32
|
};
|
|
33
|
-
export const
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
];
|
|
33
|
+
export const onlyRecentOption = {
|
|
34
|
+
help: "retrieve only documents created within the last N days (that are not already downloaded)",
|
|
35
|
+
name: "only-recent",
|
|
36
|
+
type: Number,
|
|
37
|
+
};
|
|
38
|
+
export const commonOptions = [categoriesOption, dataDirDefaultOption, fromSessionOption, silentOption, verboseOption];
|