@tricoteuses/senat 2.16.2 → 2.16.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ /**
2
+ * This file was generated by kysely-codegen.
3
+ * Please do not edit it manually.
4
+ */
5
+ export {};
@@ -9,7 +9,7 @@ import { createActesLegislatifs } from "../model/dosleg";
9
9
  import { UNDEFINED_SESSION } from "../types/sessions";
10
10
  import { getSessionFromDate, getSessionFromSignet } from "./datautil";
11
11
  import { commonOptions } from "./shared/cli_helpers";
12
- import { ensureAndClearDir } from "./shared/util";
12
+ import { ensureAndClearDir, ensureAndClearDirSync } from "./shared/util";
13
13
  const optionsDefinitions = [...commonOptions];
14
14
  const options = commandLineArgs(optionsDefinitions);
15
15
  const SENAT_TEXTE_XML_BASE_URL = "https://www.senat.fr/akomantoso/";
@@ -47,7 +47,7 @@ async function convertDatasetAmeli(dataDir, options) {
47
47
  console.log(`Converting database ${dataset.database} data into files…`);
48
48
  }
49
49
  const ameliReorganizedRootDir = path.join(dataDir, dataset.database);
50
- ensureAndClearDir(ameliReorganizedRootDir);
50
+ ensureAndClearDirSync(ameliReorganizedRootDir);
51
51
  for await (const amendement of findAllAmendements(options["fromSession"])) {
52
52
  if (options["verbose"]) {
53
53
  console.log(`Converting ${amendement["numero"]} file…`);
@@ -88,18 +88,20 @@ async function enrichDebat(debat, auteurs) {
88
88
  const enrichedDebat = { ...debat };
89
89
  for (const section of enrichedDebat.sections) {
90
90
  for (const intervention of section.interventions) {
91
+ ;
91
92
  intervention.auteur = findAuteur(intervention["auteur_code"], auteurs);
92
93
  }
93
94
  }
94
95
  for (const section of enrichedDebat.sections_divers) {
95
96
  for (const intervention of section.interventions) {
97
+ ;
96
98
  intervention.auteur = findAuteur(intervention["auteur_code"], auteurs);
97
99
  }
98
100
  }
99
101
  return enrichedDebat;
100
102
  }
101
103
  function findAuteur(auteurCode, auteurs) {
102
- return auteurs.find(auteur => auteur.code === auteurCode);
104
+ return auteurs.find((auteur) => auteur.code === auteurCode);
103
105
  }
104
106
  async function convertDatasetDosLeg(dataDir, options) {
105
107
  const dataset = datasets.dosleg;
@@ -0,0 +1,6 @@
1
+ /**
2
+ * Needs to be run after retrieve_agenda.ts !
3
+ * - downloads the ZIP of comptes-rendus des débats (CRI) from data.senat.fr
4
+ * - extracts XML files, distributes them by session/year
5
+ */
6
+ export declare function retrieveCriXmlDump(dataDir: string, options?: Record<string, any>): Promise<void>;
@@ -0,0 +1,274 @@
1
+ /**
2
+ * Needs to be run after retrieve_agenda.ts !
3
+ * - downloads the ZIP of comptes-rendus des débats (CRI) from data.senat.fr
4
+ * - extracts XML files, distributes them by session/year
5
+ */
6
+ import assert from "assert";
7
+ import commandLineArgs from "command-line-args";
8
+ import fs from "fs-extra";
9
+ import path from "path";
10
+ import StreamZip from "node-stream-zip";
11
+ import * as cheerio from "cheerio";
12
+ import { AGENDA_FOLDER, COMPTES_RENDUS_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER, } from "../loaders";
13
+ import { commonOptions } from "./shared/cli_helpers";
14
+ import { deriveTitreObjetFromSommaire, parseCompteRenduSlotFromFile, parseYYYYMMDD, sessionStartYearFromDate } from "../model/compte_rendu";
15
+ import { makeGroupUid } from "../utils/reunion_grouping";
16
+ import { getSessionsFromStart } from "../types/sessions";
17
+ import { ensureAndClearDir, fetchWithRetry } from "./shared/util";
18
+ import { computeIntervalsBySlot } from "../utils/cr_spliting";
19
+ const optionsDefinitions = [
20
+ ...commonOptions,
21
+ {
22
+ help: "parse and convert comptes-rendus des débats into JSON",
23
+ name: "parseDebats",
24
+ type: Boolean,
25
+ }
26
+ ];
27
+ const options = commandLineArgs(optionsDefinitions);
28
+ const CRI_ZIP_URL = "https://data.senat.fr/data/debats/cri.zip";
29
+ const SLOT_ORDER = ["MATIN", "APRES-MIDI", "SOIR"];
30
+ class CompteRenduError extends Error {
31
+ constructor(message, url) {
32
+ super(`An error occurred while retrieving ${url}: ${message}`);
33
+ }
34
+ }
35
+ function pickFirstSlotOfDay(slots) {
36
+ for (const s of SLOT_ORDER)
37
+ if (slots.includes(s))
38
+ return s;
39
+ return null;
40
+ }
41
+ function loadAgendaSPSlotsForDate(dataDir, yyyymmdd, session) {
42
+ const dirPath = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, session.toString());
43
+ if (!fs.existsSync(dirPath)) {
44
+ console.warn(`[AGENDA] Directory not found for session ${session} → ${dirPath}`);
45
+ return null;
46
+ }
47
+ const pattern = new RegExp(`^RUSN${yyyymmdd}IDS-(MATIN|APRES-MIDI|SOIR)\\.json$`);
48
+ const ALLOWED_SLOTS = new Set(["MATIN", "APRES-MIDI", "SOIR"]);
49
+ try {
50
+ const files = fs.readdirSync(dirPath);
51
+ const matched = files.filter((f) => pattern.test(f));
52
+ if (matched.length === 0) {
53
+ return null;
54
+ }
55
+ const found = new Set();
56
+ for (const name of matched) {
57
+ const m = name.match(pattern);
58
+ const raw = (m?.[1] ?? "");
59
+ if (ALLOWED_SLOTS.has(raw))
60
+ found.add(raw);
61
+ }
62
+ const slots = Array.from(found);
63
+ if (slots.length === 0) {
64
+ return null;
65
+ }
66
+ return { filePath: dirPath, slots };
67
+ }
68
+ catch {
69
+ return null;
70
+ }
71
+ }
72
+ async function downloadCriZip(zipPath) {
73
+ if (!options["silent"])
74
+ console.log(`Downloading CRI zip ${CRI_ZIP_URL}…`);
75
+ const response = await fetchWithRetry(CRI_ZIP_URL);
76
+ if (!response.ok) {
77
+ if (response.status === 404) {
78
+ console.warn(`CRI zip ${CRI_ZIP_URL} not found`);
79
+ return;
80
+ }
81
+ throw new CompteRenduError(String(response.status), CRI_ZIP_URL);
82
+ }
83
+ const buf = Buffer.from(await response.arrayBuffer());
84
+ await fs.writeFile(zipPath, buf);
85
+ if (!options["silent"]) {
86
+ const mb = (buf.length / (1024 * 1024)).toFixed(1);
87
+ console.log(`[CRI] Downloaded ${mb} MB → ${zipPath}`);
88
+ }
89
+ }
90
+ async function extractAndDistributeXmlBySession(zipPath, originalRoot) {
91
+ const zip = new StreamZip.async({ file: zipPath });
92
+ const entries = await zip.entries();
93
+ let count = 0;
94
+ for (const entryName of Object.keys(entries)) {
95
+ if (!entryName.toLowerCase().endsWith(".xml"))
96
+ continue;
97
+ // ex: d20231005.xml
98
+ const base = path.basename(entryName);
99
+ const m = base.match(/^d(\d{8})\.xml$/i);
100
+ if (!m)
101
+ continue;
102
+ const yyyymmdd = m[1];
103
+ const dt = parseYYYYMMDD(yyyymmdd);
104
+ if (!dt)
105
+ continue;
106
+ const session = sessionStartYearFromDate(dt);
107
+ const destDir = path.join(originalRoot, String(session));
108
+ await fs.ensureDir(destDir);
109
+ const outPath = path.join(destDir, base);
110
+ await zip.extract(entryName, outPath);
111
+ count++;
112
+ }
113
+ await zip.close();
114
+ return count;
115
+ }
116
+ export async function retrieveCriXmlDump(dataDir, options = {}) {
117
+ const root = path.join(dataDir, COMPTES_RENDUS_FOLDER);
118
+ ensureAndClearDir(root);
119
+ const originalRoot = path.join(root, DATA_ORIGINAL_FOLDER);
120
+ fs.ensureDirSync(originalRoot);
121
+ const transformedRoot = path.join(root, DATA_TRANSFORMED_FOLDER);
122
+ if (options["parseDebats"])
123
+ fs.ensureDirSync(transformedRoot);
124
+ const sessions = getSessionsFromStart(options["fromSession"]);
125
+ // 1) Download global ZIP + distribute by session
126
+ const zipPath = path.join(dataDir, "cri.zip");
127
+ console.log("[CRI] Downloading global CRI zip…");
128
+ await downloadCriZip(zipPath);
129
+ console.log("[CRI] Extracting + distributing XMLs by session…");
130
+ for (const session of sessions) {
131
+ const dir = path.join(originalRoot, String(session));
132
+ if (await fs.pathExists(dir)) {
133
+ for (const f of await fs.readdir(dir))
134
+ if (/\.xml$/i.test(f))
135
+ await fs.remove(path.join(dir, f));
136
+ }
137
+ }
138
+ const n = await extractAndDistributeXmlBySession(zipPath, originalRoot);
139
+ if (n === 0) {
140
+ console.warn("[CRI] No XML extracted. Archive empty or layout changed?");
141
+ }
142
+ else {
143
+ console.log(`[CRI] Distributed ${n} XML file(s) into session folders.`);
144
+ }
145
+ if (!options["parseDebats"]) {
146
+ console.log("[CRI] parseDebats not requested → done.");
147
+ return;
148
+ }
149
+ for (const session of sessions) {
150
+ const originalSessionDir = path.join(originalRoot, String(session));
151
+ if (!(await fs.pathExists(originalSessionDir))) {
152
+ continue;
153
+ }
154
+ const xmlFiles = (await fs.readdir(originalSessionDir))
155
+ .filter((f) => /^d\d{8}\.xml$/i.test(f))
156
+ .sort();
157
+ const transformedSessionDir = path.join(transformedRoot, String(session));
158
+ if (options["parseDebats"])
159
+ await fs.ensureDir(transformedSessionDir);
160
+ for (const f of xmlFiles) {
161
+ const yyyymmdd = f.slice(1, 9);
162
+ const xmlPath = path.join(originalSessionDir, f);
163
+ // 1) Deduce slot(s) from agenda if it exists
164
+ const agendaInfo = loadAgendaSPSlotsForDate(dataDir, yyyymmdd, session);
165
+ const firstSlotOfDay = pickFirstSlotOfDay(agendaInfo?.slots ?? []);
166
+ // 2) Detect slots from CRI content
167
+ let slotsInCri = [];
168
+ try {
169
+ const raw = await fs.readFile(xmlPath, "utf8");
170
+ const $ = cheerio.load(raw, { xml: false });
171
+ const order = $("body *").toArray();
172
+ const idx = new Map(order.map((el, i) => [el, i]));
173
+ const intervals = computeIntervalsBySlot($, idx, firstSlotOfDay ?? undefined);
174
+ const uniq = new Set();
175
+ for (const iv of intervals)
176
+ if (iv.slot && iv.slot !== "UNKNOWN")
177
+ uniq.add(iv.slot);
178
+ slotsInCri = Array.from(uniq);
179
+ }
180
+ catch (e) {
181
+ console.warn(`[CRI] [${session}] Cannot read/parse ${f}:`, e);
182
+ continue;
183
+ }
184
+ if (slotsInCri.length === 0) {
185
+ slotsInCri = [firstSlotOfDay ?? "MATIN"];
186
+ }
187
+ // 3) Parse & write each slot
188
+ for (const slot of slotsInCri) {
189
+ const outName = `CRSSN${yyyymmdd}-${slot}.json`;
190
+ const cr = await parseCompteRenduSlotFromFile(xmlPath, slot, firstSlotOfDay ?? slot);
191
+ if (!cr) {
192
+ console.warn(`[CRI] [${session}] Empty or no points for ${yyyymmdd} (${slot}) → skip`);
193
+ continue;
194
+ }
195
+ const outDir = transformedSessionDir;
196
+ await fs.ensureDir(outDir);
197
+ const outPath = path.join(outDir, outName);
198
+ await fs.writeJSON(outPath, cr, { spaces: 2 });
199
+ try {
200
+ await linkCriSlotIntoAgendaGrouped(dataDir, yyyymmdd, slot, cr.uid, cr, session);
201
+ }
202
+ catch (e) {
203
+ console.warn(`[AGENDA] [${session}] Could not link CR into grouped for ${yyyymmdd} ${slot}:`, e);
204
+ }
205
+ }
206
+ }
207
+ }
208
+ }
209
+ async function main() {
210
+ const dataDir = options["dataDir"];
211
+ assert(dataDir, "Missing argument: data directory");
212
+ console.time("CRI processing time");
213
+ await retrieveCriXmlDump(dataDir, options);
214
+ if (!options["silent"]) {
215
+ console.timeEnd("CRI processing time");
216
+ }
217
+ }
218
+ main()
219
+ .then(() => process.exit(0))
220
+ .catch((error) => {
221
+ console.error(error);
222
+ process.exit(1);
223
+ });
224
+ async function linkCriSlotIntoAgendaGrouped(dataDir, yyyymmdd, slot, crUid, cr, session) {
225
+ const groupedDir = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, session.toString());
226
+ fs.ensureDirSync(groupedDir);
227
+ const groupedPath = path.join(groupedDir, 'RUSN' + yyyymmdd + 'IDS-' + slot + '.json');
228
+ let groups = [];
229
+ if (fs.existsSync(groupedPath)) {
230
+ try {
231
+ groups = JSON.parse(fs.readFileSync(groupedPath, "utf8"));
232
+ if (!Array.isArray(groups))
233
+ groups = [];
234
+ }
235
+ catch (e) {
236
+ console.warn(`[AGENDA] unreadable grouped JSON → ${groupedPath} (${e}) → recreating`);
237
+ groups = [];
238
+ }
239
+ }
240
+ // find existing group with same slot
241
+ const sameSlot = groups.filter(g => g?.slot === slot);
242
+ let target = null;
243
+ if (sameSlot.length > 1) {
244
+ console.warn(`[AGENDA] multiple groups for ${yyyymmdd} ${slot} in ${groupedPath} → linking the first`);
245
+ }
246
+ target = sameSlot[0] ?? null;
247
+ const dateISO = `${yyyymmdd.slice(0, 4)}-${yyyymmdd.slice(4, 6)}-${yyyymmdd.slice(6, 8)}`;
248
+ const sommaire = cr?.metadonnees?.sommaire;
249
+ const { titre: dTitre, objet: dObjet } = deriveTitreObjetFromSommaire(sommaire, slot);
250
+ if (!target) {
251
+ const newGroup = {
252
+ uid: makeGroupUid(dateISO, slot),
253
+ chambre: "SN",
254
+ date: dateISO,
255
+ slot,
256
+ type: "Séance publique",
257
+ startTime: null,
258
+ endTime: null,
259
+ captationVideo: false,
260
+ titre: dTitre,
261
+ objet: dObjet || "",
262
+ events: [],
263
+ compteRenduRefUid: crUid,
264
+ };
265
+ groups.push(newGroup);
266
+ }
267
+ else {
268
+ target.compteRenduRefUid = crUid;
269
+ }
270
+ await fs.writeJSON(groupedPath, groups, { spaces: 2 });
271
+ if (!options["silent"]) {
272
+ console.log(`[AGENDA] Linked CR ${crUid} → ${path.basename(groupedPath)} [${slot}]`);
273
+ }
274
+ }
@@ -281,9 +281,9 @@ async function retrieveCommissionCRs(options = {}) {
281
281
  const MAX_TIME_DELTA_MIN = 120;
282
282
  const ORGANE_GATE = 0.55;
283
283
  const TITLE_GATE = 0.2;
284
- const W_ORG = 0.5;
285
- const W_TIM = 0.2;
286
- const W_TIT = 0.3;
284
+ const W_ORG = 0.4;
285
+ const W_TIM = 0.4;
286
+ const W_TIT = 0.2;
287
287
  for (let sIdx = 0; sIdx < sections.length; sIdx++) {
288
288
  const sec = sections[sIdx];
289
289
  let best = null;
@@ -292,7 +292,7 @@ async function retrieveCommissionCRs(options = {}) {
292
292
  const scored = hits
293
293
  .map((h) => {
294
294
  const sOrg = organeSimilarity(h, commissionKey); // 0..1
295
- const sTim = timeProximityScore(h, day.openTime ?? null, MAX_TIME_DELTA_MIN); // 0..1
295
+ const sTim = timeProximityScore(h, sec.time ?? day.openTime ?? null, MAX_TIME_DELTA_MIN); // 0..1
296
296
  const sTit = titleSimilarity(h, sec.title); // 0..1
297
297
  const total = W_ORG * sOrg + W_TIM * sTim + W_TIT * sTit;
298
298
  return { h, sOrg, sTim, sTit, total };
@@ -1,13 +1,11 @@
1
1
  import assert from "assert";
2
- import { execSync } from "child_process";
2
+ import { execSync, spawn } from "child_process";
3
3
  import commandLineArgs from "command-line-args";
4
4
  import fs from "fs-extra";
5
- // import fetch from "node-fetch"
6
5
  import path from "path";
7
6
  import StreamZip from "node-stream-zip";
8
- import readline from "readline";
9
7
  import windows1252 from "windows-1252";
10
- import { pipeline } from "stream";
8
+ import { pipeline, Transform } from "stream";
11
9
  import { promisify } from "util";
12
10
  import config from "../config";
13
11
  import { getChosenDatasets, getEnabledDatasets } from "../datasets";
@@ -69,60 +67,87 @@ async function downloadFile(url, dest) {
69
67
  }
70
68
  /**
71
69
  * Copy a dataset database to the main Senat database (overwriting its contents).
70
+ * Optimized to combine encoding repair and schema transformation in a single pass.
72
71
  */
73
72
  async function copyToSenat(dataset, dataDir, options) {
74
73
  if (!options["silent"]) {
75
74
  console.log(`Copying ${dataset.database} to Senat database...`);
76
75
  }
77
76
  const sqlFilePath = path.join(dataDir, `${dataset.database}.sql`);
78
- const schemaDumpFile = path.join(dataDir, `${dataset.database}_schema_dump.sql`);
79
- // Write the header and then stream the rest of the SQL file
80
- const schemaSqlWriter = fs.createWriteStream(schemaDumpFile, { encoding: "utf8" });
81
- // Add CREATE SCHEMA statement at the top
82
- schemaSqlWriter.write(`CREATE SCHEMA IF NOT EXISTS ${dataset.database};\n`);
83
- const lineReader = readline.createInterface({
84
- input: fs.createReadStream(sqlFilePath, { encoding: "utf8" }),
85
- crlfDelay: Infinity,
86
- });
87
- for await (const line of lineReader) {
88
- let newLine = line;
89
- // Replace 'public' schema outside single-quoted strings
90
- function replacePublicOutsideStrings(line, schema) {
91
- const parts = line.split(/(')/);
92
- let inString = false;
93
- for (let i = 0; i < parts.length; i++) {
94
- if (parts[i] === "'") {
95
- inString = !inString;
96
- }
97
- else if (!inString) {
98
- // Only replace outside of strings, including before comma
99
- parts[i] = parts[i].replace(/\bpublic\b(?=(\s*\.|\s*[,;]|\s|$))/g, schema);
100
- }
77
+ // Helper function to replace 'public' schema outside single-quoted strings
78
+ function replacePublicOutsideStrings(line, schema) {
79
+ const parts = line.split(/(')/);
80
+ let inString = false;
81
+ for (let i = 0; i < parts.length; i++) {
82
+ if (parts[i] === "'") {
83
+ inString = !inString;
84
+ }
85
+ else if (!inString) {
86
+ parts[i] = parts[i].replace(/\bpublic\b(?=(\s*\.|\s*[,;]|\s|$))/g, schema);
101
87
  }
102
- return parts.join('');
103
88
  }
104
- newLine = replacePublicOutsideStrings(line, dataset.database);
105
- // Replace SET client_encoding to UTF8
106
- newLine = newLine.replace(/SET client_encoding = 'LATIN1';/i, "SET client_encoding = 'UTF8';");
107
- schemaSqlWriter.write(newLine + "\n");
89
+ return parts.join('');
108
90
  }
109
- schemaSqlWriter.end();
110
- await new Promise((resolve, reject) => {
111
- schemaSqlWriter.on("finish", () => {
112
- try {
113
- execSync(`${options["sudo"] ? `sudo -u ${options["sudo"]} ` : ""}psql --quiet -d senat -f ${schemaDumpFile}`, {
114
- env: process.env,
115
- encoding: "utf-8",
116
- stdio: ["ignore", "ignore", "pipe"],
117
- });
91
+ // Spawn psql process
92
+ const psqlArgs = options["sudo"]
93
+ ? ["-u", options["sudo"], "psql", "--quiet", "-d", "senat"]
94
+ : ["--quiet", "-d", "senat"];
95
+ const psql = spawn(options["sudo"] ? "sudo" : "psql", psqlArgs, {
96
+ stdio: ["pipe", "ignore", "pipe"],
97
+ env: process.env,
98
+ });
99
+ psql.stdin.write(`CREATE SCHEMA IF NOT EXISTS ${dataset.database};\n`);
100
+ let buffer = '';
101
+ const combinedTransform = new Transform({
102
+ transform(chunk, encoding, callback) {
103
+ // Encoding repair if needed (decode from latin1 and fix Windows-1252 characters)
104
+ let data = dataset.repairEncoding
105
+ ? chunk.toString('latin1').replace(badWindows1252CharacterRegex, (match) => windows1252.decode(match, { mode: "fatal" }))
106
+ : chunk.toString();
107
+ buffer += data;
108
+ const lines = buffer.split('\n');
109
+ buffer = lines.pop() || '';
110
+ let processedData = '';
111
+ for (const line of lines) {
112
+ let newLine = replacePublicOutsideStrings(line, dataset.database);
113
+ newLine = newLine.replace(/SET client_encoding = 'LATIN1';/i, "SET client_encoding = 'UTF8';");
114
+ processedData += newLine + '\n';
115
+ }
116
+ callback(null, processedData);
117
+ },
118
+ flush(callback) {
119
+ // Process any remaining data in buffer
120
+ if (buffer) {
121
+ let newLine = replacePublicOutsideStrings(buffer, dataset.database);
122
+ newLine = newLine.replace(/SET client_encoding = 'LATIN1';/i, "SET client_encoding = 'UTF8';");
123
+ callback(null, newLine);
124
+ }
125
+ else {
126
+ callback();
127
+ }
128
+ }
129
+ });
130
+ let stderrData = '';
131
+ psql.stderr.on('data', (data) => {
132
+ stderrData += data.toString();
133
+ });
134
+ const pipelinePromise = streamPipeline(fs.createReadStream(sqlFilePath, {
135
+ encoding: dataset.repairEncoding ? undefined : "utf8",
136
+ highWaterMark: 4 * 1024 * 1024
137
+ }), combinedTransform, psql.stdin);
138
+ await pipelinePromise;
139
+ return new Promise((resolve, reject) => {
140
+ psql.on("close", (code) => {
141
+ if (code === 0) {
142
+ resolve();
118
143
  }
119
- finally {
120
- try { }
121
- catch { }
144
+ else {
145
+ if (!options["silent"] && stderrData) {
146
+ console.error(`psql stderr: ${stderrData}`);
147
+ }
148
+ reject(new Error(`psql exited with code ${code}`));
122
149
  }
123
- resolve();
124
150
  });
125
- schemaSqlWriter.on("error", reject);
126
151
  });
127
152
  }
128
153
  async function retrieveDataset(dataDir, dataset) {
@@ -176,31 +201,9 @@ async function retrieveDataset(dataDir, dataset) {
176
201
  dataset.repairZip(dataset, dataDir);
177
202
  }
178
203
  }
179
- if ((options["all"] || options["repairEncoding"]) && dataset.repairEncoding) {
180
- if (!options["silent"]) {
181
- console.log(`Repairing Windows CP1252 encoding in ${dataset.title}: ${sqlFilename}…`);
182
- }
183
- const repairedSqlFilePath = sqlFilePath + ".repaired";
184
- const repairedSqlWriter = fs.createWriteStream(repairedSqlFilePath, {
185
- encoding: "utf8",
186
- });
187
- // Read the file as latin1 (ISO-8859-1/CP1252) and write as UTF-8
188
- const lineReader = readline.createInterface({
189
- input: fs.createReadStream(sqlFilePath, { encoding: "latin1" }),
190
- crlfDelay: Infinity,
191
- });
192
- for await (const line of lineReader) {
193
- // Optionally repair Windows-1252 control characters
194
- let repairedLine = line.replace(badWindows1252CharacterRegex, (match) => windows1252.decode(match, { mode: "fatal" }));
195
- repairedSqlWriter.write(repairedLine + "\n");
196
- }
197
- repairedSqlWriter.end();
198
- await fs.move(repairedSqlFilePath, sqlFilePath, { overwrite: true });
199
- }
204
+ // Encoding repair is now handled in copyToSenat for better performance (single-pass processing)
205
+ // The separate encoding repair step has been removed
200
206
  if (options["all"] || options["import"] || options["schema"]) {
201
- if (!options["silent"]) {
202
- console.log(`Importing ${dataset.title}: ${sqlFilename}…`);
203
- }
204
207
  await copyToSenat(dataset, dataDir, options);
205
208
  // Create indexes programmatically after import
206
209
  if (dataset.indexes) {
@@ -270,7 +273,7 @@ async function retrieveOpenData() {
270
273
  process.env["PGUSER"] &&
271
274
  process.env["PGPASSWORD"], "Missing database configuration: environment variables PGHOST, PGPORT, PGUSER and PGPASSWORD or TRICOTEUSES_SENAT_DB_* in .env file");
272
275
  console.time("data extraction time");
273
- execSync(`${options["sudo"] ? `sudo -u ${options["sudo"]} ` : ""}psql --quiet -c "DROP DATABASE IF EXISTS senat"`, {
276
+ execSync(`${options["sudo"] ? `sudo -u ${options["sudo"]} ` : ""}psql --quiet -c "DROP DATABASE IF EXISTS senat;"`, {
274
277
  cwd: dataDir,
275
278
  env: process.env,
276
279
  encoding: "utf-8",
@@ -9,14 +9,13 @@ import { getSessionsFromStart } from "../types/sessions";
9
9
  import { commonOptions } from "./shared/cli_helpers";
10
10
  import { decodeHtmlEntities } from "../model/util";
11
11
  // ===================== Constants =====================
12
- const MATCH_THRESHOLD = 0.56;
12
+ const MATCH_THRESHOLD = 0.5;
13
13
  const MAX_CANDIDATES = 15;
14
14
  const MAX_PAGES = 3;
15
15
  const STATS = { total: 0, accepted: 0 };
16
16
  const VIDEOS_ROOT_FOLDER = "videos";
17
17
  const SENAT_VIDEOS_SEARCH_AJAX = "https://videos.senat.fr/senat_videos_search.php";
18
18
  const SENAT_DATAS_ROOT = "https://videos.senat.fr/Datas/senat";
19
- const SENAT_VOD_HOST = "https://vodsenat.akamaized.net";
20
19
  // ===================== CLI =====================
21
20
  const optionsDefinitions = [...commonOptions];
22
21
  const options = commandLineArgs(optionsDefinitions);
@@ -219,9 +218,9 @@ function score(agenda, agendaTs, videoTitle, videoEpoch, videoOrgane) {
219
218
  }
220
219
  const orgScore = videoOrgane && agenda.organe ? dice(agenda.organe, videoOrgane) : 0;
221
220
  if (orgScore === 0 && agenda.organe === "Séance publique") {
222
- return 0.5 * titleScore + 0.5 * timeScore;
221
+ return 0.3 * titleScore + 0.7 * timeScore;
223
222
  }
224
- return 0.4 * titleScore + 0.3 * timeScore + orgScore * 0.3;
223
+ return 0.2 * titleScore + 0.4 * timeScore + orgScore * 0.4;
225
224
  }
226
225
  /**
227
226
  * Build search strategies for senat's videos
@@ -1,3 +1,4 @@
1
1
  export declare function isOptionEmptyOrHasValue(option: string, value: string): boolean;
2
+ export declare function ensureAndClearDirSync(dir: string): void;
2
3
  export declare function ensureAndClearDir(path: string): void;
3
4
  export declare function fetchWithRetry(url: string, retries?: number, backoff?: number): Promise<Response>;
@@ -1,7 +1,14 @@
1
1
  import fs from "fs-extra";
2
+ import path from "path";
2
3
  export function isOptionEmptyOrHasValue(option, value) {
3
4
  return !option || option.length === 0 || option.includes(value);
4
5
  }
6
+ export function ensureAndClearDirSync(dir) {
7
+ fs.ensureDirSync(dir);
8
+ for (const name of fs.readdirSync(dir)) {
9
+ fs.rmSync(path.join(dir, name), { recursive: true, force: true });
10
+ }
11
+ }
5
12
  export function ensureAndClearDir(path) {
6
13
  if (!fs.existsSync(path)) {
7
14
  fs.mkdirSync(path, { recursive: true });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tricoteuses/senat",
3
- "version": "2.16.2",
3
+ "version": "2.16.4",
4
4
  "description": "Handle French Sénat's open data",
5
5
  "keywords": [
6
6
  "France",