@tricoteuses/senat 2.16.3 → 2.16.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/model/compte_rendu.d.ts +9 -0
- package/lib/model/compte_rendu.js +325 -0
- package/lib/model/debats.d.ts +26 -2
- package/lib/model/debats.js +65 -57
- package/lib/model/dosleg.d.ts +0 -13
- package/lib/model/dosleg.js +0 -13
- package/lib/model/index.d.ts +1 -1
- package/lib/model/index.js +1 -1
- package/lib/raw_types/db.d.ts +11389 -0
- package/lib/raw_types/db.js +5 -0
- package/lib/scripts/convert_data.js +50 -56
- package/lib/scripts/retrieve_comptes_rendus.d.ts +6 -0
- package/lib/scripts/retrieve_comptes_rendus.js +274 -0
- package/lib/scripts/retrieve_open_data.js +75 -76
- package/package.json +2 -1
package/lib/scripts/convert_data.js
CHANGED

@@ -2,14 +2,15 @@ import assert from "assert";
 import commandLineArgs from "command-line-args";
 import fs from "fs-extra";
 import path from "path";
+import pLimit from "p-limit";
 import { datasets, EnabledDatasets, getEnabledDatasets } from "../datasets";
 import { DATA_ORIGINAL_FOLDER, DOCUMENT_METADATA_FILE, DOSLEG_DOSSIERS_FOLDER, SCRUTINS_FOLDER, RAPPORT_FOLDER, SENS_CIRCONSCRIPTIONS_FOLDER, SENS_ORGANISMES_FOLDER, SENS_SENATEURS_FOLDER, TEXTE_FOLDER, } from "../loaders";
-import { findAllAmendements, findAllCirconscriptions, findAllDebats, findAllDossiers, findAllScrutins, findAllOrganismes, findAllQuestions, findAllSens,
+import { findAllAmendements, findAllCirconscriptions, findAllDebats, findAllDossiers, findAllScrutins, findAllOrganismes, findAllQuestions, findAllSens, findSenatRapportUrls, findSenatTexteUrls, } from "../model";
 import { createActesLegislatifs } from "../model/dosleg";
 import { UNDEFINED_SESSION } from "../types/sessions";
 import { getSessionFromDate, getSessionFromSignet } from "./datautil";
 import { commonOptions } from "./shared/cli_helpers";
-import { ensureAndClearDir
+import { ensureAndClearDir } from "./shared/util";
 const optionsDefinitions = [...commonOptions];
 const options = commandLineArgs(optionsDefinitions);
 const SENAT_TEXTE_XML_BASE_URL = "https://www.senat.fr/akomantoso/";
@@ -47,19 +48,24 @@ async function convertDatasetAmeli(dataDir, options) {
         console.log(`Converting database ${dataset.database} data into files…`);
     }
     const ameliReorganizedRootDir = path.join(dataDir, dataset.database);
-
+    await fs.ensureDir(ameliReorganizedRootDir);
+    const limit = pLimit(10);
+    const tasks = [];
     for await (const amendement of findAllAmendements(options["fromSession"])) {
-        … (10 removed lines whose content is not shown in this diff view)
+        tasks.push(limit(async () => {
+            if (options["verbose"]) {
+                console.log(`Converting ${amendement["numero"]} file…`);
+            }
+            const session = String(amendement["session"]) || UNDEFINED_SESSION;
+            const signetDossierLegislatif = amendement["signet_dossier_legislatif"] ||
+                `${amendement["nature_texte"]}-${amendement["numero_texte"]}`.toLowerCase();
+            const ameliReorganizedDir = path.join(ameliReorganizedRootDir, String(session), signetDossierLegislatif);
+            await fs.ensureDir(ameliReorganizedDir);
+            const amendementFileName = `${amendement["numero"]}.json`;
+            await fs.writeJSON(path.join(ameliReorganizedDir, amendementFileName), amendement, { spaces: 2 });
+        }));
     }
+    await Promise.all(tasks);
 }
 async function convertDatasetDebats(dataDir, options) {
     const dataset = datasets.debats;
@@ -68,40 +74,19 @@ async function convertDatasetDebats(dataDir, options) {
     }
     const debatsReorganizedRootDir = path.join(dataDir, dataset.database);
     ensureAndClearDir(debatsReorganizedRootDir);
-    const allAuteurs = await findAuteurs();
     for await (const debat of findAllDebats()) {
         if (options["verbose"]) {
             console.log(`Converting ${debat.id} file…`);
         }
-        const
-        const session = getSessionFromDate(enrichedDebat.date_seance);
+        const session = getSessionFromDate(debat.date_seance);
         if (options["fromSession"] && session < options["fromSession"]) {
             continue;
         }
         const debatsReorganizedDir = path.join(debatsReorganizedRootDir, String(session));
         fs.ensureDirSync(debatsReorganizedDir);
-        const debatFileName = `${
-        fs.writeJSONSync(path.join(debatsReorganizedDir, debatFileName),
-    }
-}
-async function enrichDebat(debat, auteurs) {
-    const enrichedDebat = { ...debat };
-    for (const section of enrichedDebat.sections) {
-        for (const intervention of section.interventions) {
-            ;
-            intervention.auteur = findAuteur(intervention["auteur_code"], auteurs);
-        }
-    }
-    for (const section of enrichedDebat.sections_divers) {
-        for (const intervention of section.interventions) {
-            ;
-            intervention.auteur = findAuteur(intervention["auteur_code"], auteurs);
-        }
+        const debatFileName = `${debat.id}.json`;
+        fs.writeJSONSync(path.join(debatsReorganizedDir, debatFileName), debat, { spaces: 2 });
     }
-    return enrichedDebat;
-}
-function findAuteur(auteurCode, auteurs) {
-    return auteurs.find((auteur) => auteur.code === auteurCode);
 }
 async function convertDatasetDosLeg(dataDir, options) {
     const dataset = datasets.dosleg;
@@ -141,18 +126,22 @@ async function convertDatasetScrutins(dataDir, options) {
     }
     const scrutinsReorganizedDir = path.join(dataDir, SCRUTINS_FOLDER);
     ensureAndClearDir(scrutinsReorganizedDir);
+    const limit = pLimit(10);
+    const tasks = [];
     for await (const scrutin of findAllScrutins(options["fromSession"])) {
-        … (11 removed lines whose content is not shown in this diff view)
+        tasks.push(limit(async () => {
+            if (options["verbose"]) {
+                console.log(`Converting ${scrutin["numero"]} file…`);
+            }
+            let scrutinReorganizedDir = path.join(scrutinsReorganizedDir, String(UNDEFINED_SESSION));
+            const session = scrutin["session"] || UNDEFINED_SESSION;
+            scrutinReorganizedDir = path.join(scrutinsReorganizedDir, String(session));
+            await fs.ensureDir(scrutinReorganizedDir);
+            const scrutinFileName = `${scrutin["numero"]}.json`;
+            await fs.writeJSON(path.join(scrutinReorganizedDir, scrutinFileName), scrutin, {
+                spaces: 2,
+            });
+        }));
     }
 }
 async function convertDatasetQuestions(dataDir) {
@@ -162,16 +151,21 @@ async function convertDatasetQuestions(dataDir) {
     }
     const questionsReorganizedRootDir = path.join(dataDir, dataset.database);
     ensureAndClearDir(questionsReorganizedRootDir);
+    const limit = pLimit(10);
+    const tasks = [];
     for await (const question of findAllQuestions()) {
-        … (8 removed lines whose content is not shown in this diff view)
+        tasks.push(limit(async () => {
+            if (options["verbose"]) {
+                console.log(`Converting ${question["reference"]} file…`);
+            }
+            const legislature = question["legislature"] ? question["legislature"] : 0;
+            const questionReorganizedDir = path.join(questionsReorganizedRootDir, String(legislature));
+            await fs.ensureDir(questionReorganizedDir);
+            const questionFileName = `${question["reference"]}.json`;
+            await fs.writeJSON(path.join(questionReorganizedDir, questionFileName), question, { spaces: 2 });
+        }));
     }
+    await Promise.all(tasks);
 }
 async function convertTexteUrls(dataDir) {
     const textesDir = path.join(dataDir, TEXTE_FOLDER);
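The converters above all adopt the same bounded-concurrency pattern: queue one async file write per record through a p-limit gate, then wait on the whole batch. A minimal, self-contained sketch of that pattern (the only assumption is `npm install p-limit`; the record values are illustrative, not from the package):

    import pLimit from "p-limit";

    const limit = pLimit(10); // at most 10 tasks run concurrently
    const tasks = [];
    for (const record of [{ numero: "1" }, { numero: "2" }, { numero: "3" }]) {
        tasks.push(limit(async () => {
            // the real converters call fs.writeJSON(...) here
            await new Promise((resolve) => setTimeout(resolve, 10));
            console.log(`wrote ${record.numero}.json`);
        }));
    }
    await Promise.all(tasks); // resolves once every queued task has finished

Note that in the hunks above, convertDatasetAmeli and convertDatasetQuestions end with `await Promise.all(tasks)`, while the convertDatasetScrutins hunk closes its function without one.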
package/lib/scripts/retrieve_comptes_rendus.d.ts
ADDED

@@ -0,0 +1,6 @@
+/**
+ * Needs to be run after retrieve_agenda.ts !
+ * - downloads the ZIP of comptes-rendus des débats (CRI) from data.senat.fr
+ * - extracts XML files, distributes them by session/year
+ */
+export declare function retrieveCriXmlDump(dataDir: string, options?: Record<string, any>): Promise<void>;
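The implementation behind this declaration follows. Because the module calls its own main() at load time, it is meant to be run as a standalone CLI step rather than imported. A hypothetical invocation, assuming the flags exposed by the package's commonOptions match the option keys the code reads (dataDir, fromSession, silent, verbose):

    node lib/scripts/retrieve_comptes_rendus.js --dataDir ./data --parseDebats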
package/lib/scripts/retrieve_comptes_rendus.js
ADDED

@@ -0,0 +1,274 @@
+/**
+ * Needs to be run after retrieve_agenda.ts !
+ * - downloads the ZIP of comptes-rendus des débats (CRI) from data.senat.fr
+ * - extracts XML files, distributes them by session/year
+ */
+import assert from "assert";
+import commandLineArgs from "command-line-args";
+import fs from "fs-extra";
+import path from "path";
+import StreamZip from "node-stream-zip";
+import * as cheerio from "cheerio";
+import { AGENDA_FOLDER, COMPTES_RENDUS_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER, } from "../loaders";
+import { commonOptions } from "./shared/cli_helpers";
+import { deriveTitreObjetFromSommaire, parseCompteRenduSlotFromFile, parseYYYYMMDD, sessionStartYearFromDate } from "../model/compte_rendu";
+import { makeGroupUid } from "../utils/reunion_grouping";
+import { getSessionsFromStart } from "../types/sessions";
+import { ensureAndClearDir, fetchWithRetry } from "./shared/util";
+import { computeIntervalsBySlot } from "../utils/cr_spliting";
+const optionsDefinitions = [
+    ...commonOptions,
+    {
+        help: "parse and convert comptes-rendus des débats into JSON",
+        name: "parseDebats",
+        type: Boolean,
+    }
+];
+const options = commandLineArgs(optionsDefinitions);
+const CRI_ZIP_URL = "https://data.senat.fr/data/debats/cri.zip";
+const SLOT_ORDER = ["MATIN", "APRES-MIDI", "SOIR"];
+class CompteRenduError extends Error {
+    constructor(message, url) {
+        super(`An error occurred while retrieving ${url}: ${message}`);
+    }
+}
+function pickFirstSlotOfDay(slots) {
+    for (const s of SLOT_ORDER)
+        if (slots.includes(s))
+            return s;
+    return null;
+}
+function loadAgendaSPSlotsForDate(dataDir, yyyymmdd, session) {
+    const dirPath = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, session.toString());
+    if (!fs.existsSync(dirPath)) {
+        console.warn(`[AGENDA] Directory not found for session ${session} → ${dirPath}`);
+        return null;
+    }
+    const pattern = new RegExp(`^RUSN${yyyymmdd}IDS-(MATIN|APRES-MIDI|SOIR)\\.json$`);
+    const ALLOWED_SLOTS = new Set(["MATIN", "APRES-MIDI", "SOIR"]);
+    try {
+        const files = fs.readdirSync(dirPath);
+        const matched = files.filter((f) => pattern.test(f));
+        if (matched.length === 0) {
+            return null;
+        }
+        const found = new Set();
+        for (const name of matched) {
+            const m = name.match(pattern);
+            const raw = (m?.[1] ?? "");
+            if (ALLOWED_SLOTS.has(raw))
+                found.add(raw);
+        }
+        const slots = Array.from(found);
+        if (slots.length === 0) {
+            return null;
+        }
+        return { filePath: dirPath, slots };
+    }
+    catch {
+        return null;
+    }
+}
+async function downloadCriZip(zipPath) {
+    if (!options["silent"])
+        console.log(`Downloading CRI zip ${CRI_ZIP_URL}…`);
+    const response = await fetchWithRetry(CRI_ZIP_URL);
+    if (!response.ok) {
+        if (response.status === 404) {
+            console.warn(`CRI zip ${CRI_ZIP_URL} not found`);
+            return;
+        }
+        throw new CompteRenduError(String(response.status), CRI_ZIP_URL);
+    }
+    const buf = Buffer.from(await response.arrayBuffer());
+    await fs.writeFile(zipPath, buf);
+    if (!options["silent"]) {
+        const mb = (buf.length / (1024 * 1024)).toFixed(1);
+        console.log(`[CRI] Downloaded ${mb} MB → ${zipPath}`);
+    }
+}
+async function extractAndDistributeXmlBySession(zipPath, originalRoot) {
+    const zip = new StreamZip.async({ file: zipPath });
+    const entries = await zip.entries();
+    let count = 0;
+    for (const entryName of Object.keys(entries)) {
+        if (!entryName.toLowerCase().endsWith(".xml"))
+            continue;
+        // ex: d20231005.xml
+        const base = path.basename(entryName);
+        const m = base.match(/^d(\d{8})\.xml$/i);
+        if (!m)
+            continue;
+        const yyyymmdd = m[1];
+        const dt = parseYYYYMMDD(yyyymmdd);
+        if (!dt)
+            continue;
+        const session = sessionStartYearFromDate(dt);
+        const destDir = path.join(originalRoot, String(session));
+        await fs.ensureDir(destDir);
+        const outPath = path.join(destDir, base);
+        await zip.extract(entryName, outPath);
+        count++;
+    }
+    await zip.close();
+    return count;
+}
+export async function retrieveCriXmlDump(dataDir, options = {}) {
+    const root = path.join(dataDir, COMPTES_RENDUS_FOLDER);
+    ensureAndClearDir(root);
+    const originalRoot = path.join(root, DATA_ORIGINAL_FOLDER);
+    fs.ensureDirSync(originalRoot);
+    const transformedRoot = path.join(root, DATA_TRANSFORMED_FOLDER);
+    if (options["parseDebats"])
+        fs.ensureDirSync(transformedRoot);
+    const sessions = getSessionsFromStart(options["fromSession"]);
+    // 1) Download the global ZIP and distribute it by session
+    const zipPath = path.join(dataDir, "cri.zip");
+    console.log("[CRI] Downloading global CRI zip…");
+    await downloadCriZip(zipPath);
+    console.log("[CRI] Extracting + distributing XMLs by session…");
+    for (const session of sessions) {
+        const dir = path.join(originalRoot, String(session));
+        if (await fs.pathExists(dir)) {
+            for (const f of await fs.readdir(dir))
+                if (/\.xml$/i.test(f))
+                    await fs.remove(path.join(dir, f));
+        }
+    }
+    const n = await extractAndDistributeXmlBySession(zipPath, originalRoot);
+    if (n === 0) {
+        console.warn("[CRI] No XML extracted. Archive empty or layout changed?");
+    }
+    else {
+        console.log(`[CRI] Distributed ${n} XML file(s) into session folders.`);
+    }
+    if (!options["parseDebats"]) {
+        console.log("[CRI] parseDebats not requested → done.");
+        return;
+    }
+    for (const session of sessions) {
+        const originalSessionDir = path.join(originalRoot, String(session));
+        if (!(await fs.pathExists(originalSessionDir))) {
+            continue;
+        }
+        const xmlFiles = (await fs.readdir(originalSessionDir))
+            .filter((f) => /^d\d{8}\.xml$/i.test(f))
+            .sort();
+        const transformedSessionDir = path.join(transformedRoot, String(session));
+        if (options["parseDebats"])
+            await fs.ensureDir(transformedSessionDir);
+        for (const f of xmlFiles) {
+            const yyyymmdd = f.slice(1, 9);
+            const xmlPath = path.join(originalSessionDir, f);
+            // 1) Deduce slot(s) from agenda if it exists
+            const agendaInfo = loadAgendaSPSlotsForDate(dataDir, yyyymmdd, session);
+            const firstSlotOfDay = pickFirstSlotOfDay(agendaInfo?.slots ?? []);
+            // 2) Detect slots from CRI content
+            let slotsInCri = [];
+            try {
+                const raw = await fs.readFile(xmlPath, "utf8");
+                const $ = cheerio.load(raw, { xml: false });
+                const order = $("body *").toArray();
+                const idx = new Map(order.map((el, i) => [el, i]));
+                const intervals = computeIntervalsBySlot($, idx, firstSlotOfDay ?? undefined);
+                const uniq = new Set();
+                for (const iv of intervals)
+                    if (iv.slot && iv.slot !== "UNKNOWN")
+                        uniq.add(iv.slot);
+                slotsInCri = Array.from(uniq);
+            }
+            catch (e) {
+                console.warn(`[CRI] [${session}] Cannot read/parse ${f}:`, e);
+                continue;
+            }
+            if (slotsInCri.length === 0) {
+                slotsInCri = [firstSlotOfDay ?? "MATIN"];
+            }
+            // 3) Parse & write each slot
+            for (const slot of slotsInCri) {
+                const outName = `CRSSN${yyyymmdd}-${slot}.json`;
+                const cr = await parseCompteRenduSlotFromFile(xmlPath, slot, firstSlotOfDay ?? slot);
+                if (!cr) {
+                    console.warn(`[CRI] [${session}] Empty or no points for ${yyyymmdd} (${slot}) → skip`);
+                    continue;
+                }
+                const outDir = transformedSessionDir;
+                await fs.ensureDir(outDir);
+                const outPath = path.join(outDir, outName);
+                await fs.writeJSON(outPath, cr, { spaces: 2 });
+                try {
+                    await linkCriSlotIntoAgendaGrouped(dataDir, yyyymmdd, slot, cr.uid, cr, session);
+                }
+                catch (e) {
+                    console.warn(`[AGENDA] [${session}] Could not link CR into grouped for ${yyyymmdd} ${slot}:`, e);
+                }
+            }
+        }
+    }
+}
+async function main() {
+    const dataDir = options["dataDir"];
+    assert(dataDir, "Missing argument: data directory");
+    console.time("CRI processing time");
+    await retrieveCriXmlDump(dataDir, options);
+    if (!options["silent"]) {
+        console.timeEnd("CRI processing time");
+    }
+}
+main()
+    .then(() => process.exit(0))
+    .catch((error) => {
+        console.error(error);
+        process.exit(1);
+    });
+async function linkCriSlotIntoAgendaGrouped(dataDir, yyyymmdd, slot, crUid, cr, session) {
+    const groupedDir = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, session.toString());
+    fs.ensureDirSync(groupedDir);
+    const groupedPath = path.join(groupedDir, 'RUSN' + yyyymmdd + 'IDS-' + slot + '.json');
+    let groups = [];
+    if (fs.existsSync(groupedPath)) {
+        try {
+            groups = JSON.parse(fs.readFileSync(groupedPath, "utf8"));
+            if (!Array.isArray(groups))
+                groups = [];
+        }
+        catch (e) {
+            console.warn(`[AGENDA] unreadable grouped JSON → ${groupedPath} (${e}) → recreating`);
+            groups = [];
+        }
+    }
+    // find existing group with same slot
+    const sameSlot = groups.filter(g => g?.slot === slot);
+    let target = null;
+    if (sameSlot.length > 1) {
+        console.warn(`[AGENDA] multiple groups for ${yyyymmdd} ${slot} in ${groupedPath} → linking the first`);
+    }
+    target = sameSlot[0] ?? null;
+    const dateISO = `${yyyymmdd.slice(0, 4)}-${yyyymmdd.slice(4, 6)}-${yyyymmdd.slice(6, 8)}`;
+    const sommaire = cr?.metadonnees?.sommaire;
+    const { titre: dTitre, objet: dObjet } = deriveTitreObjetFromSommaire(sommaire, slot);
+    if (!target) {
+        const newGroup = {
+            uid: makeGroupUid(dateISO, slot),
+            chambre: "SN",
+            date: dateISO,
+            slot,
+            type: "Séance publique",
+            startTime: null,
+            endTime: null,
+            captationVideo: false,
+            titre: dTitre,
+            objet: dObjet || "",
+            events: [],
+            compteRenduRefUid: crUid,
+        };
+        groups.push(newGroup);
+    }
+    else {
+        target.compteRenduRefUid = crUid;
+    }
+    await fs.writeJSON(groupedPath, groups, { spaces: 2 });
+    if (!options["silent"]) {
+        console.log(`[AGENDA] Linked CR ${crUid} → ${path.basename(groupedPath)} [${slot}]`);
+    }
+}
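For orientation, linkCriSlotIntoAgendaGrouped upserts one entry per slot into the day's RUSN<yyyymmdd>IDS-<slot>.json agenda file. A sketch of the entry it creates when no group exists yet, taken from the newGroup literal above (the uid comes from makeGroupUid and the titre from deriveTitreObjetFromSommaire, so placeholders stand in for values this diff does not show):

    const exampleGroup = {
        uid: makeGroupUid("2023-10-05", "MATIN"), // exact format defined in ../utils/reunion_grouping
        chambre: "SN",
        date: "2023-10-05",
        slot: "MATIN",
        type: "Séance publique",
        startTime: null,
        endTime: null,
        captationVideo: false,
        titre: "…",  // derived from the compte rendu's sommaire
        objet: "",
        events: [],
        compteRenduRefUid: "…", // cr.uid of the parsed compte rendu
    };

When a group for the slot already exists, only its compteRenduRefUid is set.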
package/lib/scripts/retrieve_open_data.js
CHANGED

@@ -1,13 +1,11 @@
 import assert from "assert";
-import { execSync } from "child_process";
+import { execSync, spawn } from "child_process";
 import commandLineArgs from "command-line-args";
 import fs from "fs-extra";
-// import fetch from "node-fetch"
 import path from "path";
 import StreamZip from "node-stream-zip";
-import readline from "readline";
 import windows1252 from "windows-1252";
-import { pipeline } from "stream";
+import { pipeline, Transform } from "stream";
 import { promisify } from "util";
 import config from "../config";
 import { getChosenDatasets, getEnabledDatasets } from "../datasets";
@@ -69,60 +67,88 @@ async function downloadFile(url, dest) {
 }
 /**
  * Copy a dataset database to the main Senat database (overwriting its contents).
+ * Optimized to combine encoding repair and schema transformation in a single pass.
  */
 async function copyToSenat(dataset, dataDir, options) {
     if (!options["silent"]) {
         console.log(`Copying ${dataset.database} to Senat database...`);
     }
     const sqlFilePath = path.join(dataDir, `${dataset.database}.sql`);
-    … (10 removed lines whose content is not shown in this diff view)
-    let newLine = line;
-    // Replace 'public' schema outside single-quoted strings
-    function replacePublicOutsideStrings(line, schema) {
-        const parts = line.split(/(')/);
-        let inString = false;
-        for (let i = 0; i < parts.length; i++) {
-            if (parts[i] === "'") {
-                inString = !inString;
-            }
-            else if (!inString) {
-                // Only replace outside of strings, including before comma
-                parts[i] = parts[i].replace(/\bpublic\b(?=(\s*\.|\s*[,;]|\s|$))/g, schema);
-            }
+    // Helper function to replace 'public' schema outside single-quoted strings
+    function replacePublicOutsideStrings(line, schema) {
+        const parts = line.split(/(')/);
+        let inString = false;
+        for (let i = 0; i < parts.length; i++) {
+            if (parts[i] === "'") {
+                inString = !inString;
+            }
+            else if (!inString) {
+                parts[i] = parts[i].replace(/\bpublic\b(?=(\s*\.|\s*[,;]|\s|$))/g, schema);
+            }
         }
-        return parts.join('');
     }
-
-    // Replace SET client_encoding to UTF8
-    newLine = newLine.replace(/SET client_encoding = 'LATIN1';/i, "SET client_encoding = 'UTF8';");
-    schemaSqlWriter.write(newLine + "\n");
+        return parts.join('');
     }
-    … (9 removed lines whose content is not shown in this diff view)
+    // Spawn psql process
+    const psqlArgs = options["sudo"]
+        ? ["-u", options["sudo"], "psql", "--quiet", "-d", "senat"]
+        : ["--quiet", "-d", "senat"];
+    const psql = spawn(options["sudo"] ? "sudo" : "psql", psqlArgs, {
+        stdio: ["pipe", "ignore", "pipe"],
+        env: process.env,
+    });
+    psql.stdin.write(`DROP SCHEMA IF EXISTS ${dataset.database} CASCADE;\n`);
+    psql.stdin.write(`CREATE SCHEMA IF NOT EXISTS ${dataset.database};\n`);
+    let buffer = '';
+    const combinedTransform = new Transform({
+        transform(chunk, encoding, callback) {
+            // Encoding repair if needed (decode from latin1 and fix Windows-1252 characters)
+            let data = dataset.repairEncoding
+                ? chunk.toString('latin1').replace(badWindows1252CharacterRegex, (match) => windows1252.decode(match, { mode: "fatal" }))
+                : chunk.toString();
+            buffer += data;
+            const lines = buffer.split('\n');
+            buffer = lines.pop() || '';
+            let processedData = '';
+            for (const line of lines) {
+                let newLine = replacePublicOutsideStrings(line, dataset.database);
+                newLine = newLine.replace(/SET client_encoding = 'LATIN1';/i, "SET client_encoding = 'UTF8';");
+                processedData += newLine + '\n';
+            }
+            callback(null, processedData);
+        },
+        flush(callback) {
+            // Process any remaining data in buffer
+            if (buffer) {
+                let newLine = replacePublicOutsideStrings(buffer, dataset.database);
+                newLine = newLine.replace(/SET client_encoding = 'LATIN1';/i, "SET client_encoding = 'UTF8';");
+                callback(null, newLine);
+            }
+            else {
+                callback();
+            }
+        }
+    });
+    let stderrData = '';
+    psql.stderr.on('data', (data) => {
+        stderrData += data.toString();
+    });
+    const pipelinePromise = streamPipeline(fs.createReadStream(sqlFilePath, {
+        encoding: dataset.repairEncoding ? undefined : "utf8",
+        highWaterMark: 4 * 1024 * 1024
+    }), combinedTransform, psql.stdin);
+    await pipelinePromise;
+    return new Promise((resolve, reject) => {
+        psql.on("close", (code) => {
+            if (code === 0) {
+                resolve();
             }
-    … (3 removed lines whose content is not shown in this diff view)
+            else {
+                if (!options["silent"] && stderrData) {
+                    console.error(`psql stderr: ${stderrData}`);
+                }
+                reject(new Error(`psql exited with code ${code}`));
             }
-            resolve();
         });
-        schemaSqlWriter.on("error", reject);
     });
 }
 async function retrieveDataset(dataDir, dataset) {
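The rewrite above streams the SQL dump straight into psql's stdin, repairing encoding and rewriting the schema on the fly instead of writing a repaired copy to disk first. A self-contained sketch of the line-buffering Transform idea it relies on (Node built-ins only; the Windows-1252 repair branch is omitted and the sample SQL is invented for illustration):

    import { Transform, Readable, Writable } from "stream";
    import { pipeline } from "stream/promises";

    // Same helper as in the diff: rename the `public` schema outside single-quoted strings.
    function replacePublicOutsideStrings(line, schema) {
        const parts = line.split(/(')/);
        let inString = false;
        for (let i = 0; i < parts.length; i++) {
            if (parts[i] === "'") inString = !inString;
            else if (!inString)
                parts[i] = parts[i].replace(/\bpublic\b(?=(\s*\.|\s*[,;]|\s|$))/g, schema);
        }
        return parts.join("");
    }

    function makeRewriter(schema) {
        let buffer = "";
        const fixLine = (line) => replacePublicOutsideStrings(line, schema)
            .replace(/SET client_encoding = 'LATIN1';/i, "SET client_encoding = 'UTF8';");
        return new Transform({
            transform(chunk, _enc, callback) {
                buffer += chunk.toString();
                const lines = buffer.split("\n");
                buffer = lines.pop() || ""; // hold back the trailing partial line
                callback(null, lines.map((l) => fixLine(l) + "\n").join(""));
            },
            flush(callback) {
                if (buffer) this.push(fixLine(buffer)); // emit the final unterminated line
                callback();
            },
        });
    }

    let out = "";
    const sink = new Writable({ write(chunk, _enc, cb) { out += chunk; cb(); } });
    const sql = "SET client_encoding = 'LATIN1';\nCREATE TABLE public.amd (note text DEFAULT 'public');\n";
    await pipeline(Readable.from([sql]), makeRewriter("ameli"), sink);
    console.log(out);
    // → SET client_encoding = 'UTF8';
    // → CREATE TABLE ameli.amd (note text DEFAULT 'public');

Buffering up to the last newline is what keeps the per-line regexes safe: a chunk boundary can fall mid-line, so the fragment after the final newline is held back and prepended to the next chunk.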
@@ -176,31 +202,9 @@ async function retrieveDataset(dataDir, dataset) {
         dataset.repairZip(dataset, dataDir);
     }
 }
-    … (2 removed lines whose content is not shown in this diff view)
-        console.log(`Repairing Windows CP1252 encoding in ${dataset.title}: ${sqlFilename}…`);
-    }
-    const repairedSqlFilePath = sqlFilePath + ".repaired";
-    const repairedSqlWriter = fs.createWriteStream(repairedSqlFilePath, {
-        encoding: "utf8",
-    });
-    // Read the file as latin1 (ISO-8859-1/CP1252) and write as UTF-8
-    const lineReader = readline.createInterface({
-        input: fs.createReadStream(sqlFilePath, { encoding: "latin1" }),
-        crlfDelay: Infinity,
-    });
-    for await (const line of lineReader) {
-        // Optionally repair Windows-1252 control characters
-        let repairedLine = line.replace(badWindows1252CharacterRegex, (match) => windows1252.decode(match, { mode: "fatal" }));
-        repairedSqlWriter.write(repairedLine + "\n");
-    }
-    repairedSqlWriter.end();
-    await fs.move(repairedSqlFilePath, sqlFilePath, { overwrite: true });
-}
+    // Encoding repair is now handled in copyToSenat for better performance (single-pass processing)
+    // The separate encoding repair step has been removed
 if (options["all"] || options["import"] || options["schema"]) {
-    if (!options["silent"]) {
-        console.log(`Importing ${dataset.title}: ${sqlFilename}…`);
-    }
     await copyToSenat(dataset, dataDir, options);
     // Create indexes programmatically after import
     if (dataset.indexes) {
@@ -270,12 +274,7 @@ async function retrieveOpenData() {
     process.env["PGUSER"] &&
         process.env["PGPASSWORD"], "Missing database configuration: environment variables PGHOST, PGPORT, PGUSER and PGPASSWORD or TRICOTEUSES_SENAT_DB_* in .env file");
     console.time("data extraction time");
-    execSync(`${options["sudo"] ? `sudo -u ${options["sudo"]} ` : ""}psql --quiet -c "
-        cwd: dataDir,
-        env: process.env,
-        encoding: "utf-8",
-    });
-    execSync(`${options["sudo"] ? `sudo -u ${options["sudo"]} ` : ""}psql --quiet -c "CREATE DATABASE senat WITH OWNER opendata"`, {
+    execSync(`${options["sudo"] ? `sudo -u ${options["sudo"]} ` : ""}psql --quiet -c "CREATE DATABASE senat WITH OWNER opendata" || true`, {
        cwd: dataDir,
        env: process.env,
        encoding: "utf-8",
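Appending `|| true` makes the CREATE DATABASE step idempotent: when the senat database already exists, psql exits non-zero, and without the fallback execSync would throw and abort the whole run.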
package/package.json
CHANGED

@@ -1,6 +1,6 @@
 {
     "name": "@tricoteuses/senat",
-    "version": "2.16.3",
+    "version": "2.16.5",
     "description": "Handle French Sénat's open data",
     "keywords": [
         "France",

@@ -72,6 +72,7 @@
     "node-stream-zip": "^1.8.2",
     "pg": "^8.13.1",
     "pg-cursor": "^2.12.1",
+    "p-limit": "^7.2.0",
     "slug": "^11.0.0",
     "tsx": "^4.20.6",
     "windows-1252": "^1.0.0"