@tricoteuses/senat 2.16.5 → 2.16.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/loaders.d.ts CHANGED
@@ -29,6 +29,7 @@ export type IterItem<T> = {
29
29
  export interface TexteMetadata {
30
30
  name: string;
31
31
  session: number | null | undefined;
32
+ date?: string | null;
32
33
  url_expose_des_motifs?: URL;
33
34
  url_xml: URL;
34
35
  url_html: URL;
@@ -37,6 +38,7 @@ export interface TexteMetadata {
37
38
  export interface RapportMetadata {
38
39
  name: string;
39
40
  session: number | null | undefined;
41
+ date?: string | null;
40
42
  url_html: URL;
41
43
  url_pdf: URL;
42
44
  }
@@ -1,2 +1,6 @@
1
1
  import { AgendaEvent } from "../types/agenda";
2
+ export declare function getStartAndEndTimes(timeStr: string | null | undefined, dateISO: string): {
3
+ startTime: string | null;
4
+ endTime: string | null;
5
+ };
2
6
  export declare function parseAgendaFromFile(htmlFilePath: string): Promise<AgendaEvent[] | null>;
@@ -64,18 +64,37 @@ function normalizeTime(timeStr) {
64
64
  ?.replace(/\s\(hors hémicycle\)/i, "")
65
65
  ?.replace(/\s*h\s*/gi, "h");
66
66
  }
67
- function getStartAndEndTimes(timeStr) {
67
+ export function getStartAndEndTimes(timeStr, dateISO) {
68
68
  const normalizedTime = normalizeTime(timeStr);
69
- const timeMatches = normalizedTime?.match(/^de (?<startTime>\d{2}h\d{2}) à (?<endTime>\d{2}h\d{2})$/i);
70
- if (timeMatches?.groups) {
71
- const { startTime, endTime } = timeMatches.groups;
69
+ if (!normalizedTime) {
70
+ return { startTime: null, endTime: null };
71
+ }
72
+ const rangeMatch = normalizedTime.match(/^de (?<start>\d{1,2}h\d{2}) à (?<end>\d{1,2}h\d{2})$/i);
73
+ const toUtcIso = (value) => {
74
+ if (!value)
75
+ return null;
76
+ const time = DateTime.fromFormat(value, "H'h'mm", { zone: FR_TZ });
77
+ if (!time.isValid)
78
+ return null;
79
+ const local = DateTime.fromISO(dateISO, { zone: FR_TZ }).set({
80
+ hour: time.hour,
81
+ minute: time.minute,
82
+ second: 0,
83
+ millisecond: 0,
84
+ });
85
+ if (!local.isValid)
86
+ return null;
87
+ return local.toUTC().toISO();
88
+ };
89
+ if (rangeMatch?.groups) {
90
+ const { start, end } = rangeMatch.groups;
72
91
  return {
73
- startTime: startTime ? DateTime.fromFormat(startTime, "H'h'mm", { zone: FR_TZ }).toISOTime() : null,
74
- endTime: endTime ? DateTime.fromFormat(endTime, "H'h'mm", { zone: FR_TZ }).toISOTime() : null,
92
+ startTime: toUtcIso(start),
93
+ endTime: toUtcIso(end),
75
94
  };
76
95
  }
77
96
  return {
78
- startTime: normalizedTime ? DateTime.fromFormat(normalizedTime, "H'h'mm", { zone: FR_TZ }).toISOTime() : null,
97
+ startTime: toUtcIso(normalizedTime),
79
98
  endTime: null,
80
99
  };
81
100
  }
@@ -91,7 +110,7 @@ function transformAgenda(document, fileName) {
91
110
  const type = getEventType(eventElement.classList);
92
111
  const date = DateTime.fromFormat(fileName, ID_DATE_FORMAT).toFormat(STANDARD_DATE_FORMAT);
93
112
  const timeOriginal = eventElement.querySelector(".time")?.textContent || null;
94
- const { startTime, endTime } = getStartAndEndTimes(timeOriginal);
113
+ const { startTime, endTime } = getStartAndEndTimes(timeOriginal, date);
95
114
  const titre = eventElement.querySelector(".titre")?.textContent?.trim() || "";
96
115
  const organe = eventElement.querySelector(".organe")?.textContent?.trim() || null;
97
116
  const objet = eventElement.querySelector(".objet")?.textContent?.trim()?.replace(/^- /, "") || null;
@@ -1,5 +1,5 @@
1
1
  import { dbSenat } from "../databases";
2
- import { rtrim } from "./util";
2
+ import { rtrim, toDateString } from "./util";
3
3
  export function findSenatTexteUrls(sessions = []) {
4
4
  return dbSenat
5
5
  .withSchema("dosleg")
@@ -10,13 +10,8 @@ export function findSenatTexteUrls(sessions = []) {
10
10
  .select(({ eb, ref }) => [
11
11
  "sesann as session",
12
12
  rtrim(ref("texurl")).as("url"),
13
- eb
14
- .case()
15
- .when("oritxtcod", "=", "1")
16
- .then(true)
17
- .else(false)
18
- .end()
19
- .as("hasExposeDesMotifs"),
13
+ toDateString(ref("txtoritxtdat")).as("date"),
14
+ eb.case().when("oritxtcod", "=", "1").then(true).else(false).end().as("hasExposeDesMotifs"),
20
15
  ])
21
16
  .$narrowType()
22
17
  .stream();
@@ -31,6 +26,7 @@ export function findSenatRapportUrls(sessions = []) {
31
26
  .select(({ ref }) => [
32
27
  "sesann as session",
33
28
  rtrim(ref("rapurl")).as("url"),
29
+ toDateString(ref("date_depot")).as("date"),
34
30
  ])
35
31
  .$narrowType()
36
32
  .stream();
@@ -178,6 +178,7 @@ async function convertTexteUrls(dataDir) {
178
178
  const metadata = {
179
179
  name: texteName,
180
180
  session: texte.session,
181
+ date: texte.date,
181
182
  url_expose_des_motifs: texte.hasExposeDesMotifs
182
183
  ? new URL(`${texteName}-expose.html`, SENAT_EXPOSE_DES_MOTIFS_BASE_URL)
183
184
  : undefined,
@@ -212,6 +213,7 @@ async function convertRapportUrls(dataDir) {
212
213
  const metadata = {
213
214
  name: rapportName,
214
215
  session: rapport.session,
216
+ date: rapport.date,
215
217
  url_html: new URL(rapportHtmlUrl, SENAT_RAPPORT_BASE_URL),
216
218
  url_pdf: new URL(rapportPdfUrl, SENAT_RAPPORT_BASE_URL),
217
219
  };
@@ -1,14 +1,16 @@
1
1
  import assert from "assert";
2
2
  import commandLineArgs from "command-line-args";
3
3
  import fs from "fs-extra";
4
+ import { DateTime } from "luxon";
4
5
  import path from "path";
5
6
  import { DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER, iterLoadSenatDossiersLegislatifsRapportUrls, iterLoadSenatDossiersLegislatifsTexteUrls, RAPPORT_FOLDER, TEXTE_FOLDER, } from "../loaders";
6
7
  import { parseExposeDesMotifs, parseTexte, parseTexteFromFile } from "../model/texte";
7
8
  import { getSessionsFromStart, UNDEFINED_SESSION } from "../types/sessions";
8
- import { commonOptions } from "./shared/cli_helpers";
9
+ import { commonOptions, onlyRecentOption } from "./shared/cli_helpers";
9
10
  import { ensureAndClearDir, fetchWithRetry, isOptionEmptyOrHasValue } from "./shared/util";
10
11
  const optionsDefinitions = [
11
12
  ...commonOptions,
13
+ onlyRecentOption,
12
14
  {
13
15
  help: "parse and convert documents into JSON (textes only for now, requires format xml)",
14
16
  name: "parseDocuments",
@@ -35,6 +37,18 @@ const optionsDefinitions = [
35
37
  ];
36
38
  const options = commandLineArgs(optionsDefinitions);
37
39
  const textDecoder = new TextDecoder("utf8");
40
// Reference instant for recency checks, captured once when this module is loaded.
const today = DateTime.now();
/**
 * Tells whether a document is dated within the last `daysThreshold` days,
 * measured from the `today` instant captured at module load.
 *
 * @param {string | null | undefined} documentDate ISO date string of the document.
 * @param {number} daysThreshold Maximum age in days (value of the `only-recent` option).
 * @returns {boolean} `true` when the date parses and its age in days is at most
 *   `daysThreshold`; `false` when the date is missing or invalid, or when the
 *   threshold is not a finite number.
 */
function isDocumentRecent(documentDate, daysThreshold) {
    if (!documentDate) {
        return false;
    }
    // The option may be given without a value or with a non-numeric one. Reject
    // such thresholds explicitly instead of letting `<=` coerce null to 0.
    if (typeof daysThreshold !== "number" || !Number.isFinite(daysThreshold)) {
        return false;
    }
    const docDate = DateTime.fromISO(documentDate);
    if (!docDate.isValid) {
        return false;
    }
    const ageInDays = today.diff(docDate, "days").days;
    return ageInDays <= daysThreshold;
}
38
52
  async function retrieveTextes(dataDir, sessions) {
39
53
  const textesDir = path.join(dataDir, TEXTE_FOLDER);
40
54
  fs.ensureDirSync(textesDir);
@@ -47,7 +61,7 @@ async function retrieveTextes(dataDir, sessions) {
47
61
  const texteUrlsNotFoundOrError = [];
48
62
  const texteUrlsParseError = [];
49
63
  for (const session of sessions) {
50
- for (const { item: texteMetadata, } of iterLoadSenatDossiersLegislatifsTexteUrls(dataDir, session)) {
64
+ for (const { item: texteMetadata } of iterLoadSenatDossiersLegislatifsTexteUrls(dataDir, session)) {
51
65
  const texteDir = path.join(originalTextesDir, `${texteMetadata.session ?? UNDEFINED_SESSION}`, texteMetadata.name);
52
66
  fs.ensureDirSync(texteDir);
53
67
  let exposeDesMotifsContent = null;
@@ -57,7 +71,11 @@ async function retrieveTextes(dataDir, sessions) {
57
71
  if (isOptionEmptyOrHasValue(options["formats"], "xml")) {
58
72
  const textePath = path.join(texteDir, `${texteMetadata.name}.xml`);
59
73
  let texteBuffer = null;
60
- if (!options["force"] && fs.existsSync(textePath)) {
74
+ // Check if document should be skipped based on onlyRecent option
75
+ const shouldSkip = !options["force"] &&
76
+ fs.existsSync(textePath) &&
77
+ (options["only-recent"] === undefined || !isDocumentRecent(texteMetadata.date, options["only-recent"]));
78
+ if (shouldSkip) {
61
79
  if (!options["silent"]) {
62
80
  console.info(`Already downloaded texte ${textePath}…`);
63
81
  }
@@ -80,7 +98,11 @@ async function retrieveTextes(dataDir, sessions) {
80
98
  }
81
99
  if (isOptionEmptyOrHasValue(options["formats"], "html")) {
82
100
  const textePath = path.join(texteDir, `${texteMetadata.name}.html`);
83
- if (!options["force"] && fs.existsSync(textePath)) {
101
+ // Check if document should be skipped based on onlyRecent option
102
+ const shouldSkip = !options["force"] &&
103
+ fs.existsSync(textePath) &&
104
+ (options["only-recent"] === undefined || !isDocumentRecent(texteMetadata.date, options["only-recent"]));
105
+ if (shouldSkip) {
84
106
  if (!options["silent"]) {
85
107
  console.info(`Already downloaded texte ${textePath}…`);
86
108
  }
@@ -97,7 +119,11 @@ async function retrieveTextes(dataDir, sessions) {
97
119
  }
98
120
  if (isOptionEmptyOrHasValue(options["formats"], "pdf")) {
99
121
  const textePath = path.join(texteDir, `${texteMetadata.name}.pdf`);
100
- if (!options["force"] && fs.existsSync(textePath)) {
122
+ // Check if document should be skipped based on onlyRecent option
123
+ const shouldSkip = !options["force"] &&
124
+ fs.existsSync(textePath) &&
125
+ (options["only-recent"] === undefined || !isDocumentRecent(texteMetadata.date, options["only-recent"]));
126
+ if (shouldSkip) {
101
127
  if (!options["silent"]) {
102
128
  console.info(`Already downloaded texte ${textePath}…`);
103
129
  }
@@ -129,12 +155,16 @@ async function retrieveRapports(dataDir, sessions) {
129
155
  let retrievedRapportsCount = 0;
130
156
  const rapportUrlsNotFoundOrError = [];
131
157
  for (const session of sessions) {
132
- for (const { item: rapportMetadata, } of iterLoadSenatDossiersLegislatifsRapportUrls(dataDir, session)) {
158
+ for (const { item: rapportMetadata } of iterLoadSenatDossiersLegislatifsRapportUrls(dataDir, session)) {
133
159
  const rapportDir = path.join(originalRapportsDir, `${rapportMetadata.session ?? UNDEFINED_SESSION}`, rapportMetadata.name);
134
160
  fs.ensureDirSync(rapportDir);
135
161
  if (isOptionEmptyOrHasValue(options["formats"], "html")) {
136
162
  const rapportPath = path.join(rapportDir, `${rapportMetadata.name}.html`);
137
- if (!options["force"] && fs.existsSync(rapportPath)) {
163
+ // Check if document should be skipped based on onlyRecent option
164
+ const shouldSkip = !options["force"] &&
165
+ fs.existsSync(rapportPath) &&
166
+ (options["only-recent"] === undefined || !isDocumentRecent(rapportMetadata.date, options["only-recent"]));
167
+ if (shouldSkip) {
138
168
  if (!options["silent"]) {
139
169
  console.info(`Already downloaded rapport ${rapportPath}…`);
140
170
  }
@@ -150,7 +180,11 @@ async function retrieveRapports(dataDir, sessions) {
150
180
  }
151
181
  if (isOptionEmptyOrHasValue(options["formats"], "pdf")) {
152
182
  const rapportPath = path.join(rapportDir, `${rapportMetadata.name}.pdf`);
153
- if (!options["force"] && fs.existsSync(rapportPath)) {
183
+ // Check if document should be skipped based on onlyRecent option
184
+ const shouldSkip = !options["force"] &&
185
+ fs.existsSync(rapportPath) &&
186
+ (options["only-recent"] === undefined || !isDocumentRecent(rapportMetadata.date, options["only-recent"]));
187
+ if (shouldSkip) {
154
188
  if (!options["silent"]) {
155
189
  console.info(`Already downloaded rapport ${rapportPath}…`);
156
190
  }
@@ -222,8 +256,7 @@ async function parseDocument(session, transformedTextesDir, textePath, texteName
222
256
  console.log("Parsing exposé des motifs…");
223
257
  }
224
258
  const exposeDesMotifsHtml = textDecoder.decode(exposeDesMotifs);
225
- parsedTexte.exposeDesMotifs =
226
- parseExposeDesMotifs(exposeDesMotifsHtml);
259
+ parsedTexte.exposeDesMotifs = parseExposeDesMotifs(exposeDesMotifsHtml);
227
260
  }
228
261
  const transformedTexteDir = path.join(transformedTextesDir, `${session ?? UNDEFINED_SESSION}`, texteName);
229
262
  fs.ensureDirSync(transformedTexteDir);
@@ -1,5 +1 @@
1
- import { GroupedReunion } from "../types/agenda";
2
- /**
3
- * Build search strategies for senat's videos
4
- */
5
- export declare function buildSearchStrategies(agenda: GroupedReunion): Array<Record<string, string>>;
1
+ export {};
@@ -8,10 +8,10 @@ import { AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, iterLoadSenatAgendas } from "..
8
8
  import { getSessionsFromStart } from "../types/sessions";
9
9
  import { commonOptions } from "./shared/cli_helpers";
10
10
  import { decodeHtmlEntities } from "../model/util";
11
+ import { DateTime } from "luxon";
11
12
  // ===================== Constants =====================
12
- const MATCH_THRESHOLD = 0.5;
13
+ const MATCH_THRESHOLD = 0.6;
13
14
  const MAX_CANDIDATES = 15;
14
- const MAX_PAGES = 3;
15
15
  const STATS = { total: 0, accepted: 0 };
16
16
  const VIDEOS_ROOT_FOLDER = "videos";
17
17
  const SENAT_VIDEOS_SEARCH_AJAX = "https://videos.senat.fr/senat_videos_search.php";
@@ -42,11 +42,6 @@ function dice(a, b) {
42
42
  inter++;
43
43
  return (2 * inter) / (A.size + B.size);
44
44
  }
45
- // Heuristic for Europe/Paris DST: +02:00 ≈ April→October, +01:00 otherwise.
46
- function parisOffsetForDate(dateYYYYMMDD) {
47
- const m = Number(dateYYYYMMDD.split("-")[1] || "1");
48
- return m >= 4 && m <= 10 ? "+02:00" : "+01:00";
49
- }
50
45
  function epochToParisDateTime(epochSec) {
51
46
  if (!Number.isFinite(epochSec))
52
47
  return null;
@@ -70,30 +65,13 @@ function epochToParisDateTime(epochSec) {
70
65
  startTime: `${hh}:${mi}:${ss}.${ms}${offsetStr}`,
71
66
  };
72
67
  }
73
- function toTargetEpoch(date, time) {
74
- if (!date)
68
+ function toTargetEpoch(time) {
69
+ if (!time)
75
70
  return null;
76
- let t = (time ?? "00:00").trim();
77
- // Si l'heure contient déjà un fuseau (Z ou ±HH:MM), on la fait simplement précéder de la date.
78
- const hasTz = /(?:Z|[+-]\d{2}:\d{2})$/i.test(t);
79
- let iso;
80
- if (hasTz) {
81
- // Exemple: 2022-10-04T18:00:00.000+02:00
82
- iso = `${date}T${t}`;
83
- }
84
- else {
85
- // Normalise pour avoir au moins HH:mm:ss
86
- if (/^\d{1,2}$/.test(t)) {
87
- t = `${t.padStart(2, "0")}:00:00`;
88
- }
89
- else if (/^\d{1,2}:\d{2}$/.test(t)) {
90
- t = `${t}:00`;
91
- } // sinon, on garde tel quel (gère HH:mm:ss et HH:mm:ss.SSS)
92
- // Ajoute l’offset Paris (heuristique saisonnière)
93
- iso = `${date}T${t}${parisOffsetForDate(date)}`;
94
- }
95
- const ms = Date.parse(iso);
96
- return Number.isNaN(ms) ? null : Math.floor(ms / 1000);
71
+ const dtLocal = DateTime.fromISO(time, { zone: "Europe/Paris" });
72
+ if (!dtLocal.isValid)
73
+ return null;
74
+ return Math.floor(dtLocal.toUTC().toSeconds());
97
75
  }
98
76
  async function fetchText(url) {
99
77
  const res = await fetch(url);
@@ -122,16 +100,6 @@ function queryString(obj) {
122
100
  .map(([k, v]) => `${encodeURIComponent(k)}=${encodeURIComponent(v)}`)
123
101
  .join("&");
124
102
  }
125
- function simplifyTitleForKeywords(input) {
126
- return (input || "")
127
- .replace(/\baudition\s+de\b/gi, " ")
128
- .replace(/\breunion\b/gi, " ")
129
- .replace(/\bsur\b/gi, " ")
130
- .replace(/\b(la|le|les|des|de|du|d’|d')\b/gi, " ")
131
- .replace(/[–—-]/g, " ")
132
- .replace(/\s+/g, " ")
133
- .trim();
134
- }
135
103
  function toFRDate(dateYYYYMMDD) {
136
104
  const [y, m, d] = dateYYYYMMDD.split("-");
137
105
  return `${d}/${m}/${y}`; // DD/MM/YYYY
@@ -158,17 +126,24 @@ function extractCandidatesFromSearchHtml(html) {
158
126
  return true;
159
127
  });
160
128
  }
129
/**
 * Extracts the session start value from the text of a `finalplayer.nvs` document.
 *
 * Only the first `<player …>` opening tag is inspected, and only its
 * `sessionstart` attribute (a run of decimal digits) is read. The attribute
 * value may be wrapped in double or single quotes, with optional whitespace
 * around `=`.
 *
 * @param {string} nvs Text content of the NVS document.
 * @returns {{ sessionStart?: number }} An empty object when the input is not a
 *   string or has no `<player>` tag; otherwise an object whose `sessionStart`
 *   is the parsed number, or `undefined` when the attribute is absent or is
 *   not a safe integer.
 */
function parseFinalNvs(nvs) {
    if (typeof nvs !== "string")
        return {};
    const playerTag = nvs.match(/<player\b[^>]*>/i)?.[0];
    if (!playerTag)
        return {};
    // The back-reference \1 requires the closing quote to match the opening one.
    const digits = playerTag.match(/\bsessionstart\s*=\s*(["'])(\d+)\1/i)?.[2];
    const parsed = digits ? Number(digits) : undefined;
    return {
        // An over-long digit run converts to an imprecise number or Infinity; drop it.
        sessionStart: Number.isSafeInteger(parsed) ? parsed : undefined,
    };
}
161
138
  function parseDataNvs(nvs) {
162
139
  const epochStr = nvs.match(/<metadata\s+name="date"\s+value="(\d+)"/i)?.[1];
163
140
  const epoch = epochStr ? Number(epochStr) : undefined;
164
141
  const organesTag = nvs.match(/<metadata\b[^>]*\bname="organes"[^>]*>/i)?.[0];
165
142
  let organeLabel;
166
- let organeValue;
167
143
  if (organesTag) {
168
144
  organeLabel = organesTag.match(/\blabel="([^"]+)"/i)?.[1];
169
- organeValue = organesTag.match(/\bvalue="([^"]+)"/i)?.[1];
170
145
  }
171
- const organeRaw = organeLabel ?? organeValue;
146
+ const organeRaw = organeLabel ?? "Séance publique";
172
147
  const organe = decodeHtmlEntities(organeRaw)?.trim();
173
148
  const firstChapterLabel = decodeHtmlEntities(nvs.match(/<chapter\b[^>]*\blabel="([^"]+)"/i)[1]).trim();
174
149
  return { epoch, organe, firstChapterLabel };
@@ -207,50 +182,27 @@ function buildSenatVodMasterM3u8FromNvs(nvsText, finalText) {
207
182
  }
208
183
  return `${base}.smil/master.m3u8`;
209
184
  }
210
- function score(agenda, agendaTs, videoTitle, videoEpoch, videoOrgane) {
211
- const titleScore = dice(agenda.titre || "", videoTitle || "");
185
+ function score(agenda, agendaTs, sameOrg, videoTitle, videoEpoch, videoOrgane) {
186
+ const objetS = dice(agenda.objet || "", videoTitle || "");
187
+ const titleS = dice(agenda.titre || "", videoTitle || "");
188
+ const titleScore = Math.max(objetS, titleS);
212
189
  let timeScore = 0;
213
190
  if (agendaTs && videoEpoch) {
214
191
  // second
215
192
  const deltaMin = Math.abs(videoEpoch - agendaTs) / 60;
216
- // delta : 180min
217
- timeScore = Math.max(0, 1 - deltaMin / 180);
193
+ // delta : 60min
194
+ timeScore = Math.exp(-deltaMin / 60);
218
195
  }
219
196
  const orgScore = videoOrgane && agenda.organe ? dice(agenda.organe, videoOrgane) : 0;
220
- if (orgScore === 0 && agenda.organe === "Séance publique") {
221
- return 0.3 * titleScore + 0.7 * timeScore;
222
- }
223
- return 0.2 * titleScore + 0.4 * timeScore + orgScore * 0.4;
197
+ return 0.2 * titleScore + 0.4 * timeScore + (sameOrg ? 0.4 : orgScore * 0.4);
224
198
  }
225
- /**
226
- * Build search strategies for senat's videos
227
- */
228
- export function buildSearchStrategies(agenda) {
229
- const fr = agenda.date ? toFRDate(agenda.date) : undefined;
230
- const kw = simplifyTitleForKeywords(agenda.titre || "");
231
- const commission = agenda.organe || undefined;
232
- // common base
233
- const base = { search: "true", videotype: "Commission" };
234
- if (fr)
235
- Object.assign(base, { period: "custom", begin: fr, end: fr });
236
- const strategies = [];
237
- // 1) keywords + commission
238
- if (kw && commission)
239
- strategies.push({ ...base, motscles: kw, commission });
240
- // 2) keywords without commission
241
- if (kw)
242
- strategies.push({ ...base, motscles: kw });
243
- // 3) full-text (AND) + commission
244
- if (kw && commission)
245
- strategies.push({ ...base, text: `AND${kw}`, commission });
246
- // 4) full-text (AND) without commission
247
- if (kw)
248
- strategies.push({ ...base, text: `AND${kw}` });
249
- // 5) no keywords (just type + period)
250
- strategies.push({ ...base });
251
- return strategies;
199
+ function getAgendaType(agenda) {
200
+ const o = agenda.organe || "";
201
+ if (/séance publique/i.test(o))
202
+ return "Séance publique";
203
+ return "Commission";
252
204
  }
253
- async function fetchAllSearchPages(args, baseDir, strategyIndex, maxPages = MAX_PAGES) {
205
+ async function fetchAllSearchPages(args, maxPages = 3) {
254
206
  const pages = [];
255
207
  for (let p = 1; p <= maxPages; p++) {
256
208
  const url = `${SENAT_VIDEOS_SEARCH_AJAX}?${queryString({ ...args, page: String(p) })}`;
@@ -263,9 +215,46 @@ async function fetchAllSearchPages(args, baseDir, strategyIndex, maxPages = MAX_
263
215
  }
264
216
  return pages;
265
217
  }
218
/**
 * Ordered classification rules used by `getOrgKey`: the first rule whose
 * pattern is found in the organ name wins, so the order is significant.
 * Patterns have no `g` flag, so `test` keeps no state between calls.
 */
const ORG_KEY_RULES = [
    { key: "seance_publique", pattern: /seance publique/ },
    // Whole-word match: a plain substring test would also hit "agriculture".
    { key: "culture", pattern: /\bculture\b/ },
    { key: "finances", pattern: /finances/ },
    { key: "affaires_sociales", pattern: /sociales/ },
    { key: "affaires_economiques", pattern: /economiques/ },
    { key: "affaires_europeennes", pattern: /europeennes/ },
    { key: "affaires_etrangeres_defense", pattern: /etrangeres|forces armees|defense/ },
    { key: "amenagement_territoire_dd", pattern: /territoire|durable/ },
    { key: "lois", pattern: /commission des lois/ },
    {
        key: "delegation_collectivites",
        pattern: /delegation aux collectivites territoriales|delegation a la decentralisation/,
    },
    {
        key: "delegation_droits_femmes",
        pattern: /delegation aux droits des femmes|egalite des chances entre les hommes et les femmes/,
    },
    { key: "delegation_entreprises", pattern: /delegation aux entreprises/ },
    { key: "delegation_outre_mer", pattern: /delegation senatoriale aux outre mer|delegation aux outre mer/ },
    { key: "delegation_prospective", pattern: /delegation a la prospective/ },
    {
        key: "opecst",
        pattern: /office parlementaire d evaluation des choix scientifiques et technologiques|opecst/,
    },
];
/**
 * Maps an organ name to a coarse organ key.
 *
 * The patterns are lowercase, unaccented and space-separated, so the input is
 * presumably expected in that form (the visible caller passes the output of
 * `normalize`).
 *
 * @param {string | null | undefined} norm Organ name to classify.
 * @returns {string} The key of the first matching rule, or `"autre"` when the
 *   name is empty or matches no rule.
 */
function getOrgKey(norm) {
    if (!norm)
        return "autre";
    return ORG_KEY_RULES.find((rule) => rule.pattern.test(norm))?.key ?? "autre";
}
266
257
  async function processGroupedReunion(agenda, session, dataDir) {
267
- if (!agenda)
268
- return;
269
258
  // 1) GuardRails
270
259
  if (!agenda.captationVideo) {
271
260
  // if (!options["silent"]) console.log(`[skip] ${agenda.uid} captationVideo=false`)
@@ -275,32 +264,40 @@ async function processGroupedReunion(agenda, session, dataDir) {
275
264
  // if (!options["silent"]) console.log(`[skip] ${agenda.uid} date/hour missing`)
276
265
  return;
277
266
  }
267
+ const agendaTs = toTargetEpoch(agenda.startTime);
268
+ const now = Date.now();
269
+ if (agendaTs && agendaTs * 1000 > now) {
270
+ return;
271
+ }
278
272
  STATS.total++;
279
273
  const reunionUid = agenda.uid;
280
274
  const baseDir = path.join(dataDir, VIDEOS_ROOT_FOLDER, String(session), reunionUid);
281
275
  await fs.ensureDir(baseDir);
282
- const agendaTs = toTargetEpoch(agenda.date, agenda.startTime);
283
- // ==== 1) Multi-strategy searches ====
284
- const strategies = buildSearchStrategies(agenda);
285
- let combinedHtml = "";
286
- let usedStrategy = -1;
287
- let candidates = [];
288
- for (let i = 0; i < strategies.length; i++) {
289
- const pages = await fetchAllSearchPages(strategies[i], baseDir, i + 1, MAX_PAGES);
290
- if (pages.length === 0)
291
- continue;
292
- const combined = pages.join("\n<!-- PAGE SPLIT -->\n");
293
- const cs = extractCandidatesFromSearchHtml(combined);
294
- if (cs.length) {
295
- combinedHtml = combined;
296
- candidates = cs.slice(0, MAX_CANDIDATES);
297
- usedStrategy = i + 1;
298
- break;
276
+ const searchParams = {
277
+ search: "true",
278
+ videotype: getAgendaType(agenda),
279
+ };
280
+ if (agenda.date) {
281
+ const fr = toFRDate(agenda.date);
282
+ searchParams.period = "custom";
283
+ searchParams.begin = fr;
284
+ searchParams.end = fr;
285
+ }
286
+ if (agenda.organe) {
287
+ searchParams.organe = agenda.organe;
288
+ }
289
+ const pages = await fetchAllSearchPages(searchParams);
290
+ if (!pages.length) {
291
+ if (!options["silent"]) {
292
+ console.log(`[miss] ${agenda.uid} no candidates (videotype=${searchParams.videotype}, organe=${searchParams.organe || "-"}, date=${searchParams.begin || "-"})`);
299
293
  }
294
+ return;
300
295
  }
301
- if (usedStrategy === -1 || !candidates.length) {
296
+ const combinedHtml = pages.join("\n<!-- PAGE SPLIT -->\n");
297
+ const candidates = extractCandidatesFromSearchHtml(combinedHtml).slice(0, MAX_CANDIDATES);
298
+ if (!candidates.length) {
302
299
  if (!options["silent"]) {
303
- console.log(`[miss] ${agenda.uid} no candidates (triedStrategies=${strategies.length})`);
300
+ console.log(`[miss] ${agenda.uid} no candidates after parse (videotype=${searchParams.videotype}, organe=${searchParams.organe || "-"}, date=${searchParams.begin || "-"})`);
304
301
  }
305
302
  return;
306
303
  }
@@ -308,15 +305,32 @@ async function processGroupedReunion(agenda, session, dataDir) {
308
305
  let best = null;
309
306
  for (const c of candidates) {
310
307
  const dataUrl = `${SENAT_DATAS_ROOT}/${c.id}_${c.hash}/content/data.nvs`;
311
- const buf = await fetchBuffer(dataUrl);
312
- if (!buf)
308
+ const finalUrl = `${SENAT_DATAS_ROOT}/${c.id}_${c.hash}/content/finalplayer.nvs`;
309
+ const dataBuf = await fetchBuffer(dataUrl);
310
+ if (!dataBuf)
313
311
  continue;
314
- const meta = parseDataNvs(buf.toString("utf-8"));
315
- // If organes are different, go to next candidates
312
+ const meta = parseDataNvs(dataBuf.toString("utf-8"));
313
+ let sessionStart;
314
+ const finalBuf = await fetchBuffer(finalUrl);
315
+ if (finalBuf) {
316
+ const finalMeta = parseFinalNvs(finalBuf.toString("utf-8"));
317
+ sessionStart = finalMeta.sessionStart;
318
+ }
319
+ const videoEpoch = sessionStart ?? meta.epoch;
320
+ let sameOrg = false;
321
+ // If organes are too different, go to next candidates
316
322
  if (meta.organe && agenda.organe) {
317
323
  const videoOrgNorm = normalize(meta.organe);
318
324
  const agendaOrgNorm = normalize(agenda.organe);
319
- if (dice(agendaOrgNorm, videoOrgNorm) < 0.5) {
325
+ const videoKey = getOrgKey(videoOrgNorm);
326
+ const agendaKey = getOrgKey(agendaOrgNorm);
327
+ const d = dice(agendaOrgNorm, videoOrgNorm);
328
+ if (videoKey === agendaKey && videoKey !== "autre") {
329
+ // same org we keep it
330
+ sameOrg = true;
331
+ }
332
+ else if (d < 0.7) {
333
+ // if diff org and dice too low we skip
320
334
  continue;
321
335
  }
322
336
  }
@@ -324,7 +338,7 @@ async function processGroupedReunion(agenda, session, dataDir) {
324
338
  if (c.isSeancePublique && meta.firstChapterLabel) {
325
339
  videoTitle = meta.firstChapterLabel;
326
340
  }
327
- const s = score(agenda, agendaTs, videoTitle, meta.epoch, meta.organe);
341
+ const s = score(agenda, agendaTs, sameOrg, videoTitle, videoEpoch, meta.organe);
328
342
  if (!best || s > best.score) {
329
343
  best = {
330
344
  id: c.id,
@@ -339,17 +353,17 @@ async function processGroupedReunion(agenda, session, dataDir) {
339
353
  }
340
354
  if (!best) {
341
355
  if (!options["silent"])
342
- console.log(`[miss] ${agenda.uid} candidates without data.nvs`);
356
+ console.log(`[miss] ${agenda.uid} No candidate found for this reunion`);
343
357
  return;
344
358
  }
345
359
  const accepted = best.score >= MATCH_THRESHOLD;
346
360
  if (accepted)
347
361
  STATS.accepted++;
348
362
  if (!options["silent"]) {
349
- console.log(`[pick] ${agenda.uid} score=${best.score.toFixed(2)}
350
- agenda title="${agenda.titre ?? ""}" agenda organe="${agenda.organe ?? ""}"
363
+ console.log(`[pick] ${agenda.uid} score=${best.score.toFixed(2)}
364
+ agenda title="${agenda.titre ?? ""}" agenda organe="${agenda.organe ?? ""}" agenda heure=${agenda.startTime}
351
365
  best title="${best.vtitle ?? ""}" best organe="${best.vorgane ?? ""}"
352
- accepted=${accepted} (strategy=${usedStrategy})`);
366
+ accepted=${accepted}`);
353
367
  }
354
368
  // ==== 3) Write metadata + NVS of the best candidate (always) ====
355
369
  const bestDt = best?.epoch ? epochToParisDateTime(best.epoch) : null;
@@ -358,7 +372,6 @@ async function processGroupedReunion(agenda, session, dataDir) {
358
372
  session,
359
373
  accepted,
360
374
  threshold: MATCH_THRESHOLD,
361
- strategy: usedStrategy,
362
375
  agenda: {
363
376
  date: agenda.date,
364
377
  startTime: agenda.startTime,
@@ -30,6 +30,11 @@ export declare const verboseOption: {
30
30
  name: string;
31
31
  type: BooleanConstructor;
32
32
  };
33
+ export declare const onlyRecentOption: {
34
+ help: string;
35
+ name: string;
36
+ type: NumberConstructor;
37
+ };
33
38
  export declare const commonOptions: ({
34
39
  alias: string;
35
40
  defaultValue: string[];
@@ -30,10 +30,9 @@ export const verboseOption = {
30
30
  name: "verbose",
31
31
  type: Boolean,
32
32
  };
33
- export const commonOptions = [
34
- categoriesOption,
35
- dataDirDefaultOption,
36
- fromSessionOption,
37
- silentOption,
38
- verboseOption,
39
- ];
33
/**
 * Definition of the numeric `--only-recent <N>` command-line option.
 *
 * The help text describes the option as the retrieval script applies it: a
 * file that already exists is normally skipped, but with this option it is
 * fetched again when the document's date falls within the last N days. Files
 * that do not exist yet are fetched regardless of this option.
 */
export const onlyRecentOption = {
    help: "also re-download already downloaded documents dated within the last N days (documents not yet downloaded are always retrieved)",
    name: "only-recent",
    type: Number,
};
38
+ export const commonOptions = [categoriesOption, dataDirDefaultOption, fromSessionOption, silentOption, verboseOption];
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tricoteuses/senat",
3
- "version": "2.16.5",
3
+ "version": "2.16.7",
4
4
  "description": "Handle French Sénat's open data",
5
5
  "keywords": [
6
6
  "France",