@tricoteuses/senat 2.18.11 → 2.18.13

This diff shows the content of package versions that have been publicly released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
package/LICENSE.md CHANGED
@@ -1,22 +1,22 @@
- # Tricoteuses-Senat
-
- ## _Handle French Sénat's open data_
-
- By: Emmanuel Raviart <mailto:emmanuel@raviart.com>
-
- Copyright (C) 2019, 2020, 2021 Emmanuel Raviart
-
- https://git.tricoteuses.fr/logiciels/tricoteuses-senat
-
- > Tricoteuses-Senat is free software; you can redistribute it and/or modify
- > it under the terms of the GNU Affero General Public License as
- > published by the Free Software Foundation, either version 3 of the
- > License, or (at your option) any later version.
- >
- > Tricoteuses-Senat is distributed in the hope that it will be useful,
- > but WITHOUT ANY WARRANTY; without even the implied warranty of
- > MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- > GNU Affero General Public License for more details.
- >
- > You should have received a copy of the GNU Affero General Public License
- > along with this program. If not, see <http://www.gnu.org/licenses/>.
+ # Tricoteuses-Senat
+
+ ## _Handle French Sénat's open data_
+
+ By: Emmanuel Raviart <mailto:emmanuel@raviart.com>
+
+ Copyright (C) 2019, 2020, 2021 Emmanuel Raviart
+
+ https://git.tricoteuses.fr/logiciels/tricoteuses-senat
+
+ > Tricoteuses-Senat is free software; you can redistribute it and/or modify
+ > it under the terms of the GNU Affero General Public License as
+ > published by the Free Software Foundation, either version 3 of the
+ > License, or (at your option) any later version.
+ >
+ > Tricoteuses-Senat is distributed in the hope that it will be useful,
+ > but WITHOUT ANY WARRANTY; without even the implied warranty of
+ > MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ > GNU Affero General Public License for more details.
+ >
+ > You should have received a copy of the GNU Affero General Public License
+ > along with this program. If not, see <http://www.gnu.org/licenses/>.
package/README.md CHANGED
@@ -1,120 +1,123 @@
- # Tricoteuses-Senat
-
- ## _Retrieve, clean up & handle French Sénat's open data_
-
- ## Requirements
-
- - Node >= 22
-
- ## Installation
-
- ```bash
- git clone https://git.tricoteuses.fr/logiciels/tricoteuses-senat
- cd tricoteuses-senat/
- ```
-
- Create a `.env` file to set PostgreSQL database informations and other configuration variables (you can use `example.env` as a template). Then
-
- ```bash
- npm install
- ```
-
- ### Database creation (not needed if downloading with Docker image)
-
- #### Using Docker
-
- ```bash
- docker run --name local-postgres -d -p 5432:5432 -e POSTGRES_PASSWORD=$YOUR_CUSTOM_DB_PASSWORD postgres
- # Default Postgres user is postgres
- # But scripts require an "opendata" role
- docker exec -it local-postgres psql -U postgres -c "CREATE ROLE opendata;"
- ```
-
- ## Download data
-
- Create a folder where the data will be downloaded and run the following command to download the data and convert it into JSON files.
-
- ```bash
- mkdir ../senat-data/
-
- # Available options for optional `categories` parameter : All, Ameli, Debats, DosLeg, Questions, Sens
- npm run data:download ../senat-data -- [--categories All]
- ```
-
- Data from other sources is also available :
-
- ```bash
- # Retrieval of textes and rapports from Sénat's website
- # Available options for optional `formats` parameter : xml, html, pdf
- # Available options for optional `types` parameter : textes, rapports
- npm run data:retrieve_documents ../senat-data -- --fromSession 2022 [--formats xml pdf] [--types textes]
-
- # Retrieval & parsing (textes in xml format only for now)
- npm run data:retrieve_documents ../senat-data -- --fromSession 2022 --parseDocuments
-
- # Parsing only
- npm run data:parse_textes_lois ../senat-data
-
- # Retrieval (& parsing) of agenda from Sénat's website
- npm run data:retrieve_agenda ../senat-data -- --fromSession 2022 [--parseAgenda]
-
- # Retrieval (& parsing) of comptes-rendus de séance from Sénat's data
- npm run data:retrieve_cr_seance ../senat-data -- [--parseDebats]
-
- # Retrieval (& parsing) of comptes-rendus de commissions from Sénat's website
- npm run data:retrieve_cr_commission ../senat-data -- [--parseDebats]
-
- # Retrieval of sénateurs' pictures from Sénat's website
- npm run data:retrieve_senateurs_photos ../senat-data
- ```
-
- ## Data download using Docker
-
- A Docker image that downloads and converts the data all at once is available. Build it locally or run it from the container registry.
- Use the environment variables `FROM_SESSION` and `CATEGORIES` if needed.
-
- ```bash
- docker run --pull always --name tricoteuses-senat -v ../senat-data:/app/senat-data -d git.tricoteuses.fr/logiciels/tricoteuses-senat:latest
- ```
-
- Use the environment variable `CATEGORIES` and `FROM_SESSION` if needed.
-
- ## Using the data
-
- Once the data is downloaded, you can use loaders to retrieve it.
- To use loaders in your project, you can install the _@tricoteuses/senat_ package, and import the iterator functions that you need.
-
- ```bash
- npm install @tricoteuses/senat
- ```
-
- ```js
- import { iterLoadSenatQuestions } from "@tricoteuses/senat/loaders"
-
- // Pass data directory and legislature as arguments
- for (const { item: question } of iterLoadSenatQuestions("../senat-data", 17)) {
- console.log(question.id)
- }
- ```
-
- ## Generation of raw types from SQL schema (for contributors only)
-
- ```bash
- npm run data:generate_schemas ../senat-data
- ```
-
- ## Publishing
-
- To publish a new version of this package onto npm, bump the package version and publish.
-
- ```bash
- npm version x.y.z # Bumps version in package.json and creates a new tag x.y.z
- npx tsc
- npm publish
- ```
-
- The Docker image will be automatically built during a CI Workflow if you push the tag to the remote repository.
-
- ```bash
- git push --tags
- ```
+ # Tricoteuses-Senat
+
+ ## _Retrieve, clean up & handle French Sénat's open data_
+
+ ## Requirements
+
+ - Node >= 22
+
+ ## Installation
+
+ ```bash
+ git clone https://git.tricoteuses.fr/logiciels/tricoteuses-senat
+ cd tricoteuses-senat/
+ ```
+
+ Create a `.env` file to set PostgreSQL database informations and other configuration variables (you can use `example.env` as a template). Then
+
+ ```bash
+ npm install
+ ```
+
+ ### Database creation (not needed if downloading with Docker image)
+
+ #### Using Docker
+
+ ```bash
+ docker run --name local-postgres -d -p 5432:5432 -e POSTGRES_PASSWORD=$YOUR_CUSTOM_DB_PASSWORD postgres
+ # Default Postgres user is postgres
+ # But scripts require an "opendata" role
+ docker exec -it local-postgres psql -U postgres -c "CREATE ROLE opendata;"
+ ```
+
+ ## Download data
+
+ Create a folder where the data will be downloaded and run the following command to download the data and convert it into JSON files.
+
+ ```bash
+ mkdir ../senat-data/
+
+ # Available options for optional `categories` parameter : All, Ameli, Debats, DosLeg, Questions, Sens
+ npm run data:download ../senat-data -- [--categories All]
+ ```
+
+ Data from other sources is also available :
+
+ ```bash
+ # Retrieval of textes and rapports from Sénat's website
+ # Available options for optional `formats` parameter : xml, html, pdf
+ # Available options for optional `types` parameter : textes, rapports
+ npm run data:retrieve_documents ../senat-data -- --fromSession 2022 [--formats xml pdf] [--types textes]
+
+ # Retrieval & parsing (textes in xml format only for now)
+ npm run data:retrieve_documents ../senat-data -- --fromSession 2022 --parseDocuments
+
+ # Parsing only
+ npm run data:parse_textes_lois ../senat-data
+
+ # Retrieval (& parsing) of agenda from Sénat's website
+ npm run data:retrieve_agenda ../senat-data -- --fromSession 2022 [--parseAgenda]
+
+ # Retrieval (& parsing) of comptes-rendus de séance from Sénat's data
+ npm run data:retrieve_cr_seance ../senat-data -- [--parseDebats]
+
+ # Retrieval (& parsing) of comptes-rendus de commissions from Sénat's website
+ npm run data:retrieve_cr_commission ../senat-data -- [--parseDebats]
+
+ # Retrieval of sénateurs' pictures from Sénat's website
+ npm run data:retrieve_senateurs_photos ../senat-data
+ ```
+
+ ## Data download using Docker
+
+ A Docker image that downloads and converts the data all at once is available. Build it locally or run it from the container registry.
+ Use the environment variables `FROM_SESSION` and `CATEGORIES` if needed.
+
+ ```bash
+ docker run --pull always --name tricoteuses-senat -v ../senat-data:/app/senat-data -d git.tricoteuses.fr/logiciels/tricoteuses-senat:latest
+ ```
+
+ Use the environment variable `CATEGORIES` and `FROM_SESSION` if needed.
+
+ ## Using the data
+
+ Once the data is downloaded, you can use loaders to retrieve it.
+ To use loaders in your project, you can install the _@tricoteuses/senat_ package, and import the iterator functions that you need.
+
+ ```bash
+ npm install @tricoteuses/senat
+ ```
+
+ ```js
+ import { iterLoadSenatQuestions } from "@tricoteuses/senat/loaders"
+
+ // Pass data directory and legislature as arguments
+ for (const { item: question } of iterLoadSenatQuestions("../senat-data", 17)) {
+ console.log(question.id)
+ }
+ ```
+
+ ## Generation of raw types from SQL schema (for contributors only)
+
+ ```bash
+ npm run data:generate_schemas ../senat-data
+ ```
+
+ ## Publishing
+
+ To publish a new version of this package onto npm, bump the package version and publish.
+
+ ```bash
+ # Increment version and create a new Git tag automatically
+ npm version patch # +0.0.1 → small fixes
+ npm version minor # +0.1.0 → new features
+ npm version major # +1.0.0 → breaking changes
+ npx tsc
+ npm publish
+ ```
+
+ The Docker image will be automatically built during a CI Workflow if you push the tag to the remote repository.
+
+ ```bash
+ git push --tags
+ ```
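
The README's `.env` step feeds the database checks that appear later in this diff (`retrieve_open_data` asserts that `PGHOST`, `PGPORT`, `PGUSER` and `PGPASSWORD` are set). A minimal sketch of how those variables are consumed, assuming `example.env` exposes them under these libpq names; the connection string at the end is illustrative only:

```js
// Sketch only: variable names taken from the assertion in retrieve_open_data.
import dotenv from "dotenv"

dotenv.config() // loads .env from the working directory
const { PGHOST, PGPORT, PGUSER, PGPASSWORD } = process.env
if (!(PGHOST && PGPORT && PGUSER && PGPASSWORD)) {
  throw new Error(
    "Missing database configuration: set PGHOST, PGPORT, PGUSER and PGPASSWORD (or TRICOTEUSES_SENAT_DB_*) in .env"
  )
}
console.log(`Would connect to postgresql://${PGUSER}@${PGHOST}:${PGPORT}/senat`)
```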
@@ -116,8 +116,7 @@ const findAllAmendementsQuery = dbSenat
  "ameli.com_ameli.lil as au_nom_de_commission",
  eb.case().when("ameli.cab.entid", "is not", null).then(true).else(false).end().as("auteur_est_gouvernement"),
  auteurs(ref("ameli.amd.id")).as("auteurs"),
- ])
- .distinctOn("ameli.amd.id");
+ ]);
  export function findAllAmendements(fromSession) {
  if (fromSession !== undefined) {
  return findAllAmendementsQuery.where("ameli.ses.ann", ">=", fromSession).stream();
@@ -1,11 +1,13 @@
  import assert from "assert";
- import { execSync, spawn } from "child_process";
+ import { execSync } from "child_process";
  import commandLineArgs from "command-line-args";
  import fs from "fs-extra";
+ // import fetch from "node-fetch"
  import path from "path";
  import StreamZip from "node-stream-zip";
+ import readline from "readline";
  import windows1252 from "windows-1252";
- import { pipeline, Transform } from "stream";
+ import { pipeline } from "stream";
  import { promisify } from "util";
  import config from "../config";
  import { getChosenDatasets, getEnabledDatasets } from "../datasets";
@@ -67,88 +69,60 @@ async function downloadFile(url, dest) {
  }
  /**
  * Copy a dataset database to the main Senat database (overwriting its contents).
- * Optimized to combine encoding repair and schema transformation in a single pass.
  */
  async function copyToSenat(dataset, dataDir, options) {
  if (!options["silent"]) {
  console.log(`Copying ${dataset.database} to Senat database...`);
  }
  const sqlFilePath = path.join(dataDir, `${dataset.database}.sql`);
- // Helper function to replace 'public' schema outside single-quoted strings
- function replacePublicOutsideStrings(line, schema) {
- const parts = line.split(/(')/);
- let inString = false;
- for (let i = 0; i < parts.length; i++) {
- if (parts[i] === "'") {
- inString = !inString;
- }
- else if (!inString) {
- parts[i] = parts[i].replace(/\bpublic\b(?=(\s*\.|\s*[,;]|\s|$))/g, schema);
- }
- }
- return parts.join('');
- }
- // Spawn psql process
- const psqlArgs = options["sudo"]
- ? ["-u", options["sudo"], "psql", "--quiet", "-d", "senat"]
- : ["--quiet", "-d", "senat"];
- const psql = spawn(options["sudo"] ? "sudo" : "psql", psqlArgs, {
- stdio: ["pipe", "ignore", "pipe"],
- env: process.env,
+ const schemaDumpFile = path.join(dataDir, `${dataset.database}_schema_dump.sql`);
+ // Write the header and then stream the rest of the SQL file
+ const schemaSqlWriter = fs.createWriteStream(schemaDumpFile, { encoding: "utf8" });
+ // Add CREATE SCHEMA statement at the top
+ schemaSqlWriter.write(`CREATE SCHEMA IF NOT EXISTS ${dataset.database};\n`);
+ const lineReader = readline.createInterface({
+ input: fs.createReadStream(sqlFilePath, { encoding: "utf8" }),
+ crlfDelay: Infinity,
  });
- psql.stdin.write(`DROP SCHEMA IF EXISTS ${dataset.database} CASCADE;\n`);
- psql.stdin.write(`CREATE SCHEMA IF NOT EXISTS ${dataset.database};\n`);
- let buffer = '';
- const combinedTransform = new Transform({
- transform(chunk, encoding, callback) {
- // Encoding repair if needed (decode from latin1 and fix Windows-1252 characters)
- let data = dataset.repairEncoding
- ? chunk.toString('latin1').replace(badWindows1252CharacterRegex, (match) => windows1252.decode(match, { mode: "fatal" }))
- : chunk.toString();
- buffer += data;
- const lines = buffer.split('\n');
- buffer = lines.pop() || '';
- let processedData = '';
- for (const line of lines) {
- let newLine = replacePublicOutsideStrings(line, dataset.database);
- newLine = newLine.replace(/SET client_encoding = 'LATIN1';/i, "SET client_encoding = 'UTF8';");
- processedData += newLine + '\n';
- }
- callback(null, processedData);
- },
- flush(callback) {
- // Process any remaining data in buffer
- if (buffer) {
- let newLine = replacePublicOutsideStrings(buffer, dataset.database);
- newLine = newLine.replace(/SET client_encoding = 'LATIN1';/i, "SET client_encoding = 'UTF8';");
- callback(null, newLine);
- }
- else {
- callback();
+ for await (const line of lineReader) {
+ let newLine = line;
+ // Replace 'public' schema outside single-quoted strings
+ function replacePublicOutsideStrings(line, schema) {
+ const parts = line.split(/(')/);
+ let inString = false;
+ for (let i = 0; i < parts.length; i++) {
+ if (parts[i] === "'") {
+ inString = !inString;
+ }
+ else if (!inString) {
+ // Only replace outside of strings, including before comma
+ parts[i] = parts[i].replace(/\bpublic\b(?=(\s*\.|\s*[,;]|\s|$))/g, schema);
+ }
  }
+ return parts.join('');
  }
- });
- let stderrData = '';
- psql.stderr.on('data', (data) => {
- stderrData += data.toString();
- });
- const pipelinePromise = streamPipeline(fs.createReadStream(sqlFilePath, {
- encoding: dataset.repairEncoding ? undefined : "utf8",
- highWaterMark: 4 * 1024 * 1024
- }), combinedTransform, psql.stdin);
- await pipelinePromise;
- return new Promise((resolve, reject) => {
- psql.on("close", (code) => {
- if (code === 0) {
- resolve();
+ newLine = replacePublicOutsideStrings(line, dataset.database);
+ // Replace SET client_encoding to UTF8
+ newLine = newLine.replace(/SET client_encoding = 'LATIN1';/i, "SET client_encoding = 'UTF8';");
+ schemaSqlWriter.write(newLine + "\n");
+ }
+ schemaSqlWriter.end();
+ await new Promise((resolve, reject) => {
+ schemaSqlWriter.on("finish", () => {
+ try {
+ execSync(`${options["sudo"] ? `sudo -u ${options["sudo"]} ` : ""}psql --quiet -d senat -f ${schemaDumpFile}`, {
+ env: process.env,
+ encoding: "utf-8",
+ stdio: ["ignore", "ignore", "pipe"],
+ });
  }
- else {
- if (!options["silent"] && stderrData) {
- console.error(`psql stderr: ${stderrData}`);
- }
- reject(new Error(`psql exited with code ${code}`));
+ finally {
+ try { }
+ catch { }
  }
+ resolve();
  });
+ schemaSqlWriter.on("error", reject);
  });
  }
  async function retrieveDataset(dataDir, dataset) {
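
The rewritten `copyToSenat` above streams the dump line by line and requalifies the `public` schema with `replacePublicOutsideStrings` before handing the file to `psql -f`. A standalone sketch of that helper's behaviour, using an illustrative input line and the `ameli` dataset name as the target schema:

```js
// Occurrences of the "public" schema outside single-quoted strings are replaced
// by the dataset name, so each dump lands in its own schema; quoted data is untouched.
function replacePublicOutsideStrings(line, schema) {
  const parts = line.split(/(')/)
  let inString = false
  for (let i = 0; i < parts.length; i++) {
    if (parts[i] === "'") inString = !inString
    else if (!inString) parts[i] = parts[i].replace(/\bpublic\b(?=(\s*\.|\s*[,;]|\s|$))/g, schema)
  }
  return parts.join("")
}

console.log(replacePublicOutsideStrings("CREATE TABLE public.amd (note text DEFAULT 'public');", "ameli"))
// → CREATE TABLE ameli.amd (note text DEFAULT 'public');
```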
@@ -202,9 +176,31 @@ async function retrieveDataset(dataDir, dataset) {
  dataset.repairZip(dataset, dataDir);
  }
  }
- // Encoding repair is now handled in copyToSenat for better performance (single-pass processing)
- // The separate encoding repair step has been removed
+ if ((options["all"] || options["repairEncoding"]) && dataset.repairEncoding) {
+ if (!options["silent"]) {
+ console.log(`Repairing Windows CP1252 encoding in ${dataset.title}: ${sqlFilename}…`);
+ }
+ const repairedSqlFilePath = sqlFilePath + ".repaired";
+ const repairedSqlWriter = fs.createWriteStream(repairedSqlFilePath, {
+ encoding: "utf8",
+ });
+ // Read the file as latin1 (ISO-8859-1/CP1252) and write as UTF-8
+ const lineReader = readline.createInterface({
+ input: fs.createReadStream(sqlFilePath, { encoding: "latin1" }),
+ crlfDelay: Infinity,
+ });
+ for await (const line of lineReader) {
+ // Optionally repair Windows-1252 control characters
+ let repairedLine = line.replace(badWindows1252CharacterRegex, (match) => windows1252.decode(match, { mode: "fatal" }));
+ repairedSqlWriter.write(repairedLine + "\n");
+ }
+ repairedSqlWriter.end();
+ await fs.move(repairedSqlFilePath, sqlFilePath, { overwrite: true });
+ }
  if (options["all"] || options["import"] || options["schema"]) {
+ if (!options["silent"]) {
+ console.log(`Importing ${dataset.title}: ${sqlFilename}…`);
+ }
  await copyToSenat(dataset, dataDir, options);
  // Create indexes programmatically after import
  if (dataset.indexes) {
@@ -274,7 +270,12 @@ async function retrieveOpenData() {
  process.env["PGUSER"] &&
  process.env["PGPASSWORD"], "Missing database configuration: environment variables PGHOST, PGPORT, PGUSER and PGPASSWORD or TRICOTEUSES_SENAT_DB_* in .env file");
  console.time("data extraction time");
- execSync(`${options["sudo"] ? `sudo -u ${options["sudo"]} ` : ""}psql --quiet -c "CREATE DATABASE senat WITH OWNER opendata" || true`, {
+ execSync(`${options["sudo"] ? `sudo -u ${options["sudo"]} ` : ""}psql --quiet -c "DROP DATABASE IF EXISTS senat"`, {
+ cwd: dataDir,
+ env: process.env,
+ encoding: "utf-8",
+ });
+ execSync(`${options["sudo"] ? `sudo -u ${options["sudo"]} ` : ""}psql --quiet -c "CREATE DATABASE senat WITH OWNER opendata"`, {
  cwd: dataDir,
  env: process.env,
  encoding: "utf-8",
@@ -135,15 +135,6 @@ function extractCandidatesFromSearchHtml(html) {
  return true;
  });
  }
- function parseFinalNvs(nvs) {
- const playerTag = nvs.match(/<player\b[^>]*>/i)?.[0];
- if (!playerTag)
- return {};
- const sessionStartStr = playerTag.match(/\bsessionstart="(\d+)"/i)?.[1];
- return {
- sessionStart: sessionStartStr ? Number(sessionStartStr) : undefined,
- };
- }
  function parseDataNvs(nvs) {
  const epochStr = nvs.match(/<metadata\s+name="date"\s+value="(\d+)"/i)?.[1];
  const epoch = epochStr ? Number(epochStr) : undefined;
@@ -207,8 +198,6 @@ function score(agenda, agendaTs, sameOrg, videoTitle, videoEpoch, videoOrganes)
  const titleScore = Math.max(objetS, titleS);
  let timeScore = 0;
  if (agendaTs && videoEpoch) {
- console.log("agendaTs", agendaTs);
- console.log("videoEpoch", videoEpoch);
  const deltaMin = Math.abs(videoEpoch - agendaTs) / 60;
  timeScore = Math.exp(-deltaMin / 60);
  }
@@ -292,142 +281,168 @@ async function processGroupedReunion(agenda, session, dataDir) {
  if (agendaTs && agendaTs * 1000 > now) {
  return;
  }
- STATS.total++;
  const reunionUid = agenda.uid;
  const baseDir = path.join(dataDir, VIDEOS_ROOT_FOLDER, String(session), reunionUid);
  await fs.ensureDir(baseDir);
- const searchParams = {
- search: "true",
- videotype: getAgendaType(agenda),
- };
- if (agenda.date) {
- const fr = toFRDate(agenda.date);
- searchParams.period = "custom";
- searchParams.begin = fr;
- searchParams.end = fr;
- }
- if (agenda.organe) {
- searchParams.organe = agenda.organe;
- }
- const pages = await fetchAllSearchPages(searchParams);
- if (!pages.length) {
- if (!options["silent"]) {
- console.log(`[miss] ${agenda.uid} no candidates (videotype=${searchParams.videotype}, organe=${searchParams.organe || "-"}, date=${searchParams.begin || "-"})`);
+ let skipDownload = false;
+ if (options["only-recent"]) {
+ const now = Date.now();
+ const cutoff = now - options["only-recent"] * 24 * 3600 * 1000;
+ const reunionTs = Date.parse(agenda.date);
+ if (reunionTs < cutoff) {
+ // Check if files already exist
+ const dataNvsPath = path.join(baseDir, "data.nvs");
+ const finalplayerNvsPath = path.join(baseDir, "finalplayer.nvs");
+ if (fs.existsSync(dataNvsPath) && fs.existsSync(finalplayerNvsPath)) {
+ skipDownload = true;
+ }
  }
- return;
  }
- const combinedHtml = pages.join("\n<!-- PAGE SPLIT -->\n");
- const candidates = extractCandidatesFromSearchHtml(combinedHtml).slice(0, MAX_CANDIDATES);
- if (!candidates.length) {
- if (!options["silent"]) {
- console.log(`[miss] ${agenda.uid} no candidates after parse (videotype=${searchParams.videotype}, organe=${searchParams.organe || "-"}, date=${searchParams.begin || "-"})`);
+ let master = null;
+ let accepted = false;
+ if (!skipDownload) {
+ STATS.total++;
+ const searchParams = {
+ search: "true",
+ videotype: getAgendaType(agenda),
+ };
+ if (agenda.date) {
+ const fr = toFRDate(agenda.date);
+ searchParams.period = "custom";
+ searchParams.begin = fr;
+ searchParams.end = fr;
  }
- return;
- }
- // ==== 2) Enrich via data.nvs + scoring; pick best ====
- let best = null;
- for (const c of candidates) {
- const dataUrl = `${SENAT_DATAS_ROOT}/${c.id}_${c.hash}/content/data.nvs`;
- const finalUrl = `${SENAT_DATAS_ROOT}/${c.id}_${c.hash}/content/finalplayer.nvs`;
- const dataBuf = await fetchBuffer(dataUrl);
- if (!dataBuf)
- continue;
- const meta = parseDataNvs(dataBuf.toString("utf-8"));
- let sameOrg = false;
- // If organes are too different, go to next candidates
- if (agenda.organe && meta.organes?.length) {
- const agendaOrgNorm = normalize(agenda.organe);
- const agendaKey = getOrgKey(agendaOrgNorm);
- let bestDice = 0;
- let hasSameKey = false;
- for (const vo of meta.organes) {
- const videoOrgNorm = normalize(vo);
- const videoKey = getOrgKey(videoOrgNorm);
- const d = dice(agendaOrgNorm, videoOrgNorm);
- if (videoKey === agendaKey && videoKey !== "autre") {
- hasSameKey = true;
- }
- if (d > bestDice)
- bestDice = d;
+ if (agenda.organe) {
+ searchParams.organe = agenda.organe;
+ }
+ const pages = await fetchAllSearchPages(searchParams);
+ if (!pages.length) {
+ if (!options["silent"]) {
+ console.log(`[miss] ${agenda.uid} no candidates (videotype=${searchParams.videotype}, organe=${searchParams.organe || "-"}, date=${searchParams.begin || "-"})`);
  }
- if (hasSameKey) {
- sameOrg = true; // we are sure this is the same org
+ return;
+ }
+ const combinedHtml = pages.join("\n<!-- PAGE SPLIT -->\n");
+ const candidates = extractCandidatesFromSearchHtml(combinedHtml).slice(0, MAX_CANDIDATES);
+ if (!candidates.length) {
+ if (!options["silent"]) {
+ console.log(`[miss] ${agenda.uid} no candidates after parse (videotype=${searchParams.videotype}, organe=${searchParams.organe || "-"}, date=${searchParams.begin || "-"})`);
  }
- else if (bestDice < 0.8) {
- // if diff org and dice too low we skip
+ return;
+ }
+ // ==== 2) Enrich via data.nvs + scoring; pick best ====
+ let best = null;
+ for (const c of candidates) {
+ const dataUrl = `${SENAT_DATAS_ROOT}/${c.id}_${c.hash}/content/data.nvs`;
+ const finalUrl = `${SENAT_DATAS_ROOT}/${c.id}_${c.hash}/content/finalplayer.nvs`;
+ const dataBuf = await fetchBuffer(dataUrl);
+ if (!dataBuf)
  continue;
+ const meta = parseDataNvs(dataBuf.toString("utf-8"));
+ let sameOrg = false;
+ // If organes are too different, go to next candidates
+ if (agenda.organe && meta.organes?.length) {
+ const agendaOrgNorm = normalize(agenda.organe);
+ const agendaKey = getOrgKey(agendaOrgNorm);
+ let bestDice = 0;
+ let hasSameKey = false;
+ for (const vo of meta.organes) {
+ const videoOrgNorm = normalize(vo);
+ const videoKey = getOrgKey(videoOrgNorm);
+ const d = dice(agendaOrgNorm, videoOrgNorm);
+ if (videoKey === agendaKey && videoKey !== "autre") {
+ hasSameKey = true;
+ }
+ if (d > bestDice)
+ bestDice = d;
+ }
+ if (hasSameKey) {
+ sameOrg = true; // we are sure this is the same org
+ }
+ else if (bestDice < 0.8) {
+ // if diff org and dice too low we skip
+ continue;
+ }
+ }
+ let videoTitle = c.title;
+ if (c.isSeancePublique && meta.firstChapterLabel) {
+ videoTitle = meta.firstChapterLabel;
+ }
+ const s = score(agenda, agendaTs, sameOrg, videoTitle, meta.epoch, meta.organes);
+ if (!best || s > best.score) {
+ best = {
+ id: c.id,
+ hash: c.hash,
+ pageUrl: c.pageUrl,
+ epoch: meta.epoch,
+ vtitle: videoTitle,
+ score: s,
+ vorgane: meta.organes[0],
+ };
  }
  }
- let videoTitle = c.title;
- if (c.isSeancePublique && meta.firstChapterLabel) {
- videoTitle = meta.firstChapterLabel;
+ if (!best) {
+ if (!options["silent"])
+ console.log(`[miss] ${agenda.uid} No candidate found for this reunion`);
+ return;
  }
- const s = score(agenda, agendaTs, sameOrg, videoTitle, meta.epoch, meta.organes);
- if (!best || s > best.score) {
- best = {
- id: c.id,
- hash: c.hash,
- pageUrl: c.pageUrl,
- epoch: meta.epoch,
- vtitle: videoTitle,
- score: s,
- vorgane: meta.organes[0],
- };
+ accepted = best.score >= MATCH_THRESHOLD;
+ if (accepted)
+ STATS.accepted++;
+ if (!options["silent"]) {
+ console.log(`[pick] ${agenda.uid} score=${best.score.toFixed(2)}
+ agenda title="${agenda.titre ?? ""}" agenda organe="${agenda.organe ?? ""}" agenda heure=${agenda.startTime}
+ best title="${best.vtitle ?? ""}" best organe="${best.vorgane ?? ""}"
+ accepted=${accepted}`);
  }
+ // ==== 3) Write metadata + NVS of the best candidate (always) ====
+ const bestDt = best?.epoch ? epochToParisDateTime(best.epoch) : null;
+ const metadata = {
+ reunionUid,
+ session,
+ accepted,
+ threshold: MATCH_THRESHOLD,
+ agenda: {
+ date: agenda.date,
+ startTime: agenda.startTime,
+ titre: agenda.titre,
+ organe: agenda.organe ?? undefined,
+ uid: agenda.uid,
+ },
+ best: {
+ id: best.id,
+ hash: best.hash,
+ pageUrl: best.pageUrl,
+ epoch: best.epoch ?? null,
+ date: bestDt?.date ?? null,
+ startTime: bestDt?.startTime ?? null,
+ title: best.vtitle ?? null,
+ score: best.score,
+ },
+ };
+ await writeIfChanged(path.join(baseDir, "metadata.json"), JSON.stringify(metadata, null, 2));
+ const dataUrl = `${SENAT_DATAS_ROOT}/${best.id}_${best.hash}/content/data.nvs`;
+ const finalUrl = `${SENAT_DATAS_ROOT}/${best.id}_${best.hash}/content/finalplayer.nvs`;
+ const dataTxt = await fetchText(dataUrl);
+ const finalTxt = await fetchText(finalUrl);
+ if (dataTxt)
+ await fsp.writeFile(path.join(baseDir, "data.nvs"), dataTxt, "utf-8");
+ if (finalTxt)
+ await fsp.writeFile(path.join(baseDir, "finalplayer.nvs"), finalTxt, "utf-8");
+ if (dataTxt && finalTxt)
+ master = buildSenatVodMasterM3u8FromNvs(dataTxt, finalTxt);
  }
- if (!best) {
- if (!options["silent"])
- console.log(`[miss] ${agenda.uid} No candidate found for this reunion`);
- return;
- }
- const accepted = best.score >= MATCH_THRESHOLD;
- if (accepted)
- STATS.accepted++;
- if (!options["silent"]) {
- console.log(`[pick] ${agenda.uid} score=${best.score.toFixed(2)}
- agenda title="${agenda.titre ?? ""}" agenda organe="${agenda.organe ?? ""}" agenda heure=${agenda.startTime}
- best title="${best.vtitle ?? ""}" best organe="${best.vorgane ?? ""}"
- accepted=${accepted}`);
+ else {
+ // Skipped download, but need to read data.nvs for urlVideo
+ try {
+ const dataTxt = await fsp.readFile(path.join(baseDir, "data.nvs"), "utf-8");
+ const finalTxt = await fsp.readFile(path.join(baseDir, "finalplayer.nvs"), "utf-8");
+ master = buildSenatVodMasterM3u8FromNvs(dataTxt, finalTxt);
+ }
+ catch { }
  }
- // ==== 3) Write metadata + NVS of the best candidate (always) ====
- const bestDt = best?.epoch ? epochToParisDateTime(best.epoch) : null;
- const metadata = {
- reunionUid,
- session,
- accepted,
- threshold: MATCH_THRESHOLD,
- agenda: {
- date: agenda.date,
- startTime: agenda.startTime,
- titre: agenda.titre,
- organe: agenda.organe ?? undefined,
- uid: agenda.uid,
- },
- best: {
- id: best.id,
- hash: best.hash,
- pageUrl: best.pageUrl,
- epoch: best.epoch ?? null,
- date: bestDt?.date ?? null,
- startTime: bestDt?.startTime ?? null,
- title: best.vtitle ?? null,
- score: best.score,
- },
- };
- await writeIfChanged(path.join(baseDir, "metadata.json"), JSON.stringify(metadata, null, 2));
- const dataUrl = `${SENAT_DATAS_ROOT}/${best.id}_${best.hash}/content/data.nvs`;
- const finalUrl = `${SENAT_DATAS_ROOT}/${best.id}_${best.hash}/content/finalplayer.nvs`;
- const dataTxt = await fetchText(dataUrl);
- const finalTxt = await fetchText(finalUrl);
- if (dataTxt)
- await fsp.writeFile(path.join(baseDir, "data.nvs"), dataTxt, "utf-8");
- if (finalTxt)
- await fsp.writeFile(path.join(baseDir, "finalplayer.nvs"), finalTxt, "utf-8");
- let master = null;
- if (dataTxt && finalTxt)
- master = buildSenatVodMasterM3u8FromNvs(dataTxt, finalTxt);
  // ==== 4) Update agenda file (only if accepted + m3u8) ====
- if (accepted && master) {
+ if ((accepted || skipDownload) && master) {
  const agendaJsonPath = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, String(session), `${agenda.uid}.json`);
  if (await fs.pathExists(agendaJsonPath)) {
  const raw = await fsp.readFile(agendaJsonPath, "utf-8");
@@ -48,13 +48,12 @@ export declare const commonOptions: ({
  name: string;
  type: StringConstructor;
  } | {
- defaultValue: number;
+ alias: string;
  help: string;
  name: string;
- type: NumberConstructor;
+ type: BooleanConstructor;
  } | {
- alias: string;
  help: string;
  name: string;
- type: BooleanConstructor;
+ type: NumberConstructor;
  })[];
@@ -35,4 +35,11 @@ export const onlyRecentOption = {
  name: "only-recent",
  type: Number,
  };
- export const commonOptions = [categoriesOption, dataDirDefaultOption, fromSessionOption, silentOption, verboseOption];
+ export const commonOptions = [
+ categoriesOption,
+ dataDirDefaultOption,
+ fromSessionOption,
+ silentOption,
+ verboseOption,
+ onlyRecentOption,
+ ];
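
The `onlyRecentOption` added to `commonOptions` here is the `--only-recent` flag consumed in `processGroupedReunion` above; the cutoff computation there treats its value as a number of days. A sketch of how it could be read, assuming the usual `npm run ... -- --flag value` invocation shown in the README (the import path and the value 30 are illustrative):

```js
import commandLineArgs from "command-line-args"
import { commonOptions } from "./scripts/options.js" // hypothetical path

// e.g. `npm run data:retrieve_videos ../senat-data -- --only-recent 30`
const options = commandLineArgs(commonOptions, { partial: true })
if (options["only-recent"]) {
  // Reunions older than the cutoff are skipped when their .nvs files already exist.
  const cutoff = Date.now() - options["only-recent"] * 24 * 3600 * 1000
  console.log(`Cutoff: ${new Date(cutoff).toISOString()}`)
}
```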
package/package.json CHANGED
@@ -1,101 +1,101 @@
- {
- "name": "@tricoteuses/senat",
- "version": "2.18.11",
- "description": "Handle French Sénat's open data",
- "keywords": [
- "France",
- "open data",
- "Parliament",
- "Sénat"
- ],
- "author": "Emmanuel Raviart <emmanuel@raviart.com>",
- "bugs": {
- "url": "https://git.tricoteuses.fr/logiciels/tricoteuses-senat/issues"
- },
- "homepage": "https://tricoteuses.fr/",
- "license": "AGPL-3.0-or-later",
- "repository": {
- "type": "git",
- "url": "https://git.tricoteuses.fr/logiciels/tricoteuses-senat.git"
- },
- "type": "module",
- "engines": {
- "node": ">=22"
- },
- "files": [
- "lib"
- ],
- "exports": {
- ".": {
- "import": "./lib/index.js",
- "types": "./lib/index.d.ts"
- },
- "./loaders": {
- "import": "./lib/loaders.js",
- "types": "./lib/loaders.d.ts"
- },
- "./package.json": "./package.json"
- },
- "publishConfig": {
- "access": "public"
- },
- "scripts": {
- "build": "tsc",
- "build:types": "tsc --emitDeclarationOnly",
- "data:convert_data": "tsx src/scripts/convert_data.ts",
- "data:download": "tsx src/scripts/data-download.ts",
- "data:generate_schemas": "tsx src/scripts/retrieve_open_data.ts --schema",
- "data:retrieve_agenda": "cross-env TZ='Etc/UTC' tsx src/scripts/retrieve_agenda.ts",
- "data:retrieve_cr_seance": "tsx src/scripts/retrieve_cr_seance.ts",
- "data:retrieve_cr_commission": "tsx src/scripts/retrieve_cr_commission.ts",
- "data:retrieve_documents": "tsx src/scripts/retrieve_documents.ts",
- "data:retrieve_open_data": "tsx src/scripts/retrieve_open_data.ts --all",
- "data:retrieve_senateurs_photos": "tsx src/scripts/retrieve_senateurs_photos.ts --fetch",
- "data:retrieve_videos": "tsx src/scripts/retrieve_videos.ts",
- "data:parse_textes_lois": "tsx src/scripts/parse_textes.ts",
- "prepare": "npm run build",
- "prepublishOnly": "npm run build",
- "prettier": "prettier --write 'src/**/*.ts' 'tests/**/*.test.ts'",
- "test:iter_load": "tsx src/scripts/test_iter_load.ts",
- "type-check": "tsc --noEmit",
- "type-check:watch": "npm run type-check -- --watch"
- },
- "dependencies": {
- "@biryani/core": "^0.2.1",
- "cheerio": "^1.1.2",
- "command-line-args": "^5.1.1",
- "dotenv": "^8.2.0",
- "fs-extra": "^9.1.0",
- "jsdom": "^26.0.0",
- "kysely": "^0.27.4",
- "luxon": "^3.7.2",
- "node-stream-zip": "^1.8.2",
- "pg": "^8.13.1",
- "pg-cursor": "^2.12.1",
- "p-limit": "^7.2.0",
- "slug": "^11.0.0",
- "tsx": "^4.20.6",
- "windows-1252": "^1.0.0"
- },
- "devDependencies": {
- "@typed-code/schemats": "^5.0.1",
- "@types/cheerio": "^1.0.0",
- "@types/command-line-args": "^5.0.0",
- "@types/fs-extra": "^9.0.7",
- "@types/jsdom": "^21.1.7",
- "@types/luxon": "^3.7.1",
- "@types/node": "^20.17.6",
- "@types/pg": "^8.15.5",
- "@types/pg-cursor": "^2.7.2",
- "@types/slug": "^5.0.9",
- "@typescript-eslint/eslint-plugin": "^8.46.0",
- "@typescript-eslint/parser": "^8.46.0",
- "cross-env": "^10.1.0",
- "eslint": "^8.57.1",
- "iconv-lite": "^0.7.0",
- "kysely-codegen": "^0.19.0",
- "prettier": "^3.5.3",
- "tslib": "^2.1.0",
- "typescript": "^5.9.3"
- }
- }
+ {
+ "name": "@tricoteuses/senat",
+ "version": "2.18.13",
+ "description": "Handle French Sénat's open data",
+ "keywords": [
+ "France",
+ "open data",
+ "Parliament",
+ "Sénat"
+ ],
+ "author": "Emmanuel Raviart <emmanuel@raviart.com>",
+ "bugs": {
+ "url": "https://git.tricoteuses.fr/logiciels/tricoteuses-senat/issues"
+ },
+ "homepage": "https://tricoteuses.fr/",
+ "license": "AGPL-3.0-or-later",
+ "repository": {
+ "type": "git",
+ "url": "https://git.tricoteuses.fr/logiciels/tricoteuses-senat.git"
+ },
+ "type": "module",
+ "engines": {
+ "node": ">=22"
+ },
+ "files": [
+ "lib"
+ ],
+ "exports": {
+ ".": {
+ "import": "./lib/index.js",
+ "types": "./lib/index.d.ts"
+ },
+ "./loaders": {
+ "import": "./lib/loaders.js",
+ "types": "./lib/loaders.d.ts"
+ },
+ "./package.json": "./package.json"
+ },
+ "publishConfig": {
+ "access": "public"
+ },
+ "scripts": {
+ "build": "tsc",
+ "build:types": "tsc --emitDeclarationOnly",
+ "data:convert_data": "tsx src/scripts/convert_data.ts",
+ "data:download": "tsx src/scripts/data-download.ts",
+ "data:generate_schemas": "tsx src/scripts/retrieve_open_data.ts --schema",
+ "data:retrieve_agenda": "cross-env TZ='Etc/UTC' tsx src/scripts/retrieve_agenda.ts",
+ "data:retrieve_cr_seance": "tsx src/scripts/retrieve_cr_seance.ts",
+ "data:retrieve_cr_commission": "tsx src/scripts/retrieve_cr_commission.ts",
+ "data:retrieve_documents": "tsx src/scripts/retrieve_documents.ts",
+ "data:retrieve_open_data": "tsx src/scripts/retrieve_open_data.ts --all",
+ "data:retrieve_senateurs_photos": "tsx src/scripts/retrieve_senateurs_photos.ts --fetch",
+ "data:retrieve_videos": "tsx src/scripts/retrieve_videos.ts",
+ "data:parse_textes_lois": "tsx src/scripts/parse_textes.ts",
+ "prepare": "npm run build",
+ "prepublishOnly": "npm run build",
+ "prettier": "prettier --write 'src/**/*.ts' 'tests/**/*.test.ts'",
+ "test:iter_load": "tsx src/scripts/test_iter_load.ts",
+ "type-check": "tsc --noEmit",
+ "type-check:watch": "npm run type-check -- --watch"
+ },
+ "dependencies": {
+ "@biryani/core": "^0.2.1",
+ "cheerio": "^1.1.2",
+ "command-line-args": "^5.1.1",
+ "dotenv": "^8.2.0",
+ "fs-extra": "^9.1.0",
+ "jsdom": "^26.0.0",
+ "kysely": "^0.27.4",
+ "luxon": "^3.7.2",
+ "node-stream-zip": "^1.8.2",
+ "pg": "^8.13.1",
+ "pg-cursor": "^2.12.1",
+ "p-limit": "^7.2.0",
+ "slug": "^11.0.0",
+ "tsx": "^4.20.6",
+ "windows-1252": "^1.0.0"
+ },
+ "devDependencies": {
+ "@typed-code/schemats": "^5.0.1",
+ "@types/cheerio": "^1.0.0",
+ "@types/command-line-args": "^5.0.0",
+ "@types/fs-extra": "^9.0.7",
+ "@types/jsdom": "^21.1.7",
+ "@types/luxon": "^3.7.1",
+ "@types/node": "^20.17.6",
+ "@types/pg": "^8.15.5",
+ "@types/pg-cursor": "^2.7.2",
+ "@types/slug": "^5.0.9",
+ "@typescript-eslint/eslint-plugin": "^8.46.0",
+ "@typescript-eslint/parser": "^8.46.0",
+ "cross-env": "^10.1.0",
+ "eslint": "^8.57.1",
+ "iconv-lite": "^0.7.0",
+ "kysely-codegen": "^0.19.0",
+ "prettier": "^3.5.3",
+ "tslib": "^2.1.0",
+ "typescript": "^5.9.3"
+ }
+ }