@tricoteuses/senat 2.20.20 → 2.20.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -43,7 +43,6 @@ npm run data:download ../senat-data
43
43
 
44
44
  - `npm run data:download <dir>`: Download, convert data to JSON
45
45
  - `npm run data:retrieve_documents <dir>`: Retrieval of textes and rapports from Sénat's website
46
- - `npm run data:parse_textes_lois <dir>`: Parse textes (requires xml files)
47
46
  - `npm run data:retrieve_agenda <dir>`: Retrieval of agenda from Sénat's website
48
47
  - `npm run data:retrieve_cr_seance <dir>`: Retrieval of comptes-rendus de séance from Sénat's data
49
48
  - `npm run data:retrieve_cr_commission <dir>`: Retrieval of comptes-rendus de commissions from Sénat's website
package/lib/loaders.d.ts CHANGED
@@ -13,11 +13,11 @@ export declare const COMPTES_RENDUS_FOLDER = "seances";
13
13
  export declare const COMMISSION_FOLDER = "commissions";
14
14
  export declare const DOSLEG_DOSSIERS_FOLDER = "dossiers";
15
15
  export declare const SCRUTINS_FOLDER = "scrutins";
16
- export declare const RAPPORT_FOLDER = "rap";
17
16
  export declare const SENS_CIRCONSCRIPTIONS_FOLDER = "circonscriptions";
18
17
  export declare const SENS_ORGANISMES_FOLDER = "organismes";
19
18
  export declare const SENS_SENATEURS_FOLDER = "senateurs";
20
19
  export declare const TEXTE_FOLDER = "leg";
20
+ export declare const RAPPORT_FOLDER = "rap";
21
21
  export declare const DATA_ORIGINAL_FOLDER = "original";
22
22
  export declare const DATA_TRANSFORMED_FOLDER = "transformed";
23
23
  export declare const DOCUMENT_METADATA_FILE = "metadata.json";
@@ -25,6 +25,7 @@ export type IterItem<T> = {
25
25
  item: T;
26
26
  filePathFromDataset?: string;
27
27
  legislature?: number;
28
+ gitStatus?: "A" | "M" | "D" | "R" | "C" | "T" | "U";
28
29
  };
29
30
  export interface TexteMetadata {
30
31
  name: string;
package/lib/loaders.js CHANGED
@@ -1,6 +1,7 @@
1
1
  import fsex from "fs-extra";
2
2
  import fs from "fs";
3
3
  import path from "path";
4
+ import * as git from "./git";
4
5
  import { datasets } from "./datasets";
5
6
  import { UNDEFINED_SESSION } from "./types/sessions";
6
7
  export { EnabledDatasets } from "./datasets";
@@ -9,11 +10,11 @@ export const COMPTES_RENDUS_FOLDER = "seances";
9
10
  export const COMMISSION_FOLDER = "commissions";
10
11
  export const DOSLEG_DOSSIERS_FOLDER = "dossiers";
11
12
  export const SCRUTINS_FOLDER = "scrutins";
12
- export const RAPPORT_FOLDER = "rap";
13
13
  export const SENS_CIRCONSCRIPTIONS_FOLDER = "circonscriptions";
14
14
  export const SENS_ORGANISMES_FOLDER = "organismes";
15
15
  export const SENS_SENATEURS_FOLDER = "senateurs";
16
16
  export const TEXTE_FOLDER = "leg";
17
+ export const RAPPORT_FOLDER = "rap";
17
18
  export const DATA_ORIGINAL_FOLDER = "original";
18
19
  export const DATA_TRANSFORMED_FOLDER = "transformed";
19
20
  export const DOCUMENT_METADATA_FILE = "metadata.json";
@@ -30,7 +31,7 @@ export function* iterFilePaths(dirPath) {
30
31
  }
31
32
  }
32
33
  }
33
- function* iterLoadSenatItems(dataDir, dataName, legislatureOrSession, subDir, { log = false } = {}) {
34
+ function* iterLoadSenatItems(dataDir, dataName, legislatureOrSession, subDir, { log = false, sinceCommit } = {}) {
34
35
  let itemsDir = path.join(dataDir, dataName);
35
36
  if (subDir) {
36
37
  itemsDir = path.join(itemsDir, subDir);
@@ -38,9 +39,26 @@ function* iterLoadSenatItems(dataDir, dataName, legislatureOrSession, subDir, {
38
39
  if (legislatureOrSession) {
39
40
  itemsDir = path.join(itemsDir, String(legislatureOrSession));
40
41
  }
42
+ // Get changed files if sinceCommit is specified (excluding deleted files)
43
+ const changedFiles = sinceCommit
44
+ ? git.getChangedFilesSinceCommit(itemsDir, sinceCommit, {
45
+ diffFilter: "AMR", // Added, Modified, Renamed
46
+ })
47
+ : null;
48
+ if (log && sinceCommit) {
49
+ console.log(`Filtering files changed since commit ${sinceCommit} in ${itemsDir}`);
50
+ console.log(`Found ${changedFiles?.size || 0} changed files (AMR)`);
51
+ }
41
52
  for (const filePath of iterFilePaths(itemsDir)) {
53
+ const relativePath = path.relative(path.join(dataDir, dataName), filePath);
54
+ const gitStatus = changedFiles?.get(relativePath);
55
+ // Filter by changed files if sinceCommit is specified
56
+ if (changedFiles && !gitStatus) {
57
+ // Skip files not in the change set
58
+ continue;
59
+ }
42
60
  if (log) {
43
- console.log(`Loading file: ${filePath}…`);
61
+ console.log(`Loading file: ${filePath}…${gitStatus ? ` (${gitStatus})` : ""}`);
44
62
  }
45
63
  let item;
46
64
  try {
@@ -56,8 +74,35 @@ function* iterLoadSenatItems(dataDir, dataName, legislatureOrSession, subDir, {
56
74
  item,
57
75
  filePathFromDataset,
58
76
  legislature: legislatureOrSession,
77
+ ...(gitStatus && { gitStatus }), // Include gitStatus
59
78
  };
60
79
  }
80
+ // Yield deleted files at the end if sinceCommit is specified
81
+ if (sinceCommit) {
82
+ const deletedFiles = git.getChangedFilesSinceCommit(itemsDir, sinceCommit, {
83
+ diffFilter: "D", // Deleted
84
+ });
85
+ if (log) {
86
+ console.log(`Found ${deletedFiles.size || 0} deleted files (D)`);
87
+ }
88
+ for (const [relativePath, status] of deletedFiles.entries()) {
89
+ const deletedFilePath = path.join(itemsDir, relativePath);
90
+ if (log) {
91
+ console.log(`Deleted file: ${deletedFilePath}`);
92
+ }
93
+ // Extract UID from filename (remove extension) for the placeholder item
94
+ const fileExtension = path.extname(relativePath) || ".json"; // Assuming files use an extension like .json
95
+ const filename = path.basename(relativePath, fileExtension);
96
+ const fakeItem = { uid: filename }; // Placeholder item using uid constraint
97
+ const filePathFromDataset = deletedFilePath.substring(deletedFilePath.indexOf(dataName) + dataName.length);
98
+ yield {
99
+ item: fakeItem,
100
+ filePathFromDataset,
101
+ legislature: legislatureOrSession,
102
+ gitStatus: status,
103
+ };
104
+ }
105
+ }
61
106
  }
62
107
  export function* iterLoadSenatAmendements(dataDir, session, options = {}) {
63
108
  for (const amendementItem of iterLoadSenatItems(dataDir, datasets.ameli.database, session, undefined, options)) {
@@ -249,7 +249,7 @@ export function createActesLegislatifs(dossier) {
249
249
  code_acte: `${codeParent}-DEBATS-SEANCE`,
250
250
  date: lectureAss["dates_seances"][0]?.["date"],
251
251
  id: lectureAss["id"],
252
- numero: lectureAss["numero"]
252
+ numero: lectureAss["numero"],
253
253
  });
254
254
  }
255
255
  const { textes, rapports, ...lectureAssWithoutTextes } = lectureAss;
@@ -0,0 +1,7 @@
1
+ import { ExposeDesMotifs, FlatTexte } from "../types/texte";
2
+ export declare function transformTexte(document: Document): FlatTexte | null;
3
+ export declare function transformExposeDesMotifs(document: Document): ExposeDesMotifs | null;
4
+ export declare function parseTexte(texteXml: string): FlatTexte | null;
5
+ export declare function parseTexteFromFile(xmlFilePath: string): Promise<FlatTexte | null>;
6
+ export declare function parseExposeDesMotifs(exposeDesMotifsHtml: string): ExposeDesMotifs | null;
7
+ export declare function parseExposeDesMotifsFromFile(htmlFilePath: string): Promise<ExposeDesMotifs | null>;
@@ -0,0 +1,228 @@
1
+ import { JSDOM } from "jsdom";
2
+ import { AKN_IDENTIFICATION_STRUCTURE_REGEXP, AKN_WORKFLOW_IDENTIFICATION_STRUCTURE_REGEXP } from "../scripts/datautil";
3
+ import { DivisionType, } from "../types/texte";
4
+ function buildWorklow(metaElement) {
5
+ const stepElements = metaElement.querySelectorAll("workflow step");
6
+ const steps = [];
7
+ for (const stepElement of stepElements) {
8
+ const identification = stepElement.getAttribute("href") ?? "";
9
+ const identificationParts = AKN_WORKFLOW_IDENTIFICATION_STRUCTURE_REGEXP.exec(identification)?.groups;
10
+ steps.push({
11
+ eId: stepElement.getAttribute("eId"),
12
+ date: stepElement.getAttribute("date") ? new Date(stepElement.getAttribute("date") ?? "") : null,
13
+ type: identificationParts?.["type"] || null,
14
+ session: identificationParts?.["session"] || null,
15
+ numero: identificationParts?.["numTexte"] || null,
16
+ version: identificationParts?.["version"] ? identificationParts["version"] : null,
17
+ outcome: stepElement.getAttribute("outcome"),
18
+ });
19
+ }
20
+ return steps;
21
+ }
22
+ function buildDivision(node, index) {
23
+ const eId = node.getAttribute("eId");
24
+ const tag = node.nodeName;
25
+ const level = DivisionType[tag];
26
+ const titleNode = node.querySelector("num");
27
+ const subtitleNode = node.querySelector("heading");
28
+ const headings = [
29
+ ...(titleNode
30
+ ? [
31
+ {
32
+ text: titleNode.textContent?.trim() ?? null,
33
+ html: titleNode.innerHTML?.trim() ?? null,
34
+ },
35
+ ]
36
+ : []),
37
+ ...(subtitleNode
38
+ ? [
39
+ {
40
+ text: subtitleNode.textContent?.trim() ?? null,
41
+ html: subtitleNode.innerHTML?.trim() ?? null,
42
+ },
43
+ ]
44
+ : []),
45
+ ];
46
+ const division = {
47
+ index,
48
+ eId,
49
+ tag,
50
+ level,
51
+ headings,
52
+ };
53
+ if (tag === "article") {
54
+ ;
55
+ division.alineas = [];
56
+ }
57
+ return division;
58
+ }
59
+ function buildAlinea(contentNode, alineaNode) {
60
+ const eId = alineaNode.getAttribute("eId");
61
+ const heading = {
62
+ text: alineaNode.querySelector("num")?.textContent ?? null,
63
+ };
64
+ const pastille = alineaNode.getAttribute("data:pastille") ?? null;
65
+ return {
66
+ eId,
67
+ heading,
68
+ text: contentNode.textContent?.trim() ?? null,
69
+ html: contentNode.innerHTML?.trim() ?? null,
70
+ pastille,
71
+ };
72
+ }
73
+ function buildEmptyArticle(index) {
74
+ return {
75
+ index: index,
76
+ eId: "",
77
+ tag: "article",
78
+ level: DivisionType["article"],
79
+ headings: [],
80
+ alineas: [],
81
+ };
82
+ }
83
+ function flattenTexte(texteContentRoot) {
84
+ const divisions = [];
85
+ let divisionIndex = 0;
86
+ const iter = (node) => {
87
+ if (node.nodeName === "content") {
88
+ return;
89
+ }
90
+ switch (node.nodeName) {
91
+ case "tome":
92
+ case "part":
93
+ case "book":
94
+ case "title":
95
+ case "subtitle":
96
+ case "chapter":
97
+ case "section":
98
+ case "subsection":
99
+ case "paragraph":
100
+ case "article":
101
+ divisions.push(buildDivision(node, divisionIndex++));
102
+ break;
103
+ }
104
+ if (node.nodeName === "alinea") {
105
+ Array.from(node.childNodes)
106
+ // Find direct content children programmatically
107
+ // because `:scope` selector does not work
108
+ // https://github.com/jsdom/jsdom/issues/2998
109
+ .filter((alineaChildNode) => alineaChildNode.nodeName === "content")
110
+ .forEach((alineaContentNode) => {
111
+ // Hypothesis: alineas should always be enclosed in articles
112
+ let lastArticle = divisions.findLast((division) => division.tag === "article");
113
+ if (!lastArticle) {
114
+ lastArticle = buildEmptyArticle(divisionIndex++);
115
+ divisions.push(lastArticle);
116
+ }
117
+ lastArticle.alineas.push(buildAlinea(alineaContentNode, node));
118
+ });
119
+ }
120
+ if (node.hasChildNodes()) {
121
+ node.childNodes.forEach((childNode) => iter(childNode));
122
+ }
123
+ };
124
+ iter(texteContentRoot);
125
+ return divisions;
126
+ }
127
+ export function transformTexte(document) {
128
+ const metaElement = document.querySelector("meta");
129
+ const preambleElement = document.querySelector("preamble");
130
+ const identification = metaElement?.querySelector("FRBRExpression FRBRuri")?.getAttribute("value") ?? "";
131
+ const identificationParts = AKN_IDENTIFICATION_STRUCTURE_REGEXP.exec(identification)?.groups;
132
+ const bodyElement = document.querySelector("body");
133
+ const sessionYears = identificationParts?.["session"]?.split("-") || null;
134
+ const datePresentation = metaElement?.querySelector("FRBRdate[name='#presentation']")?.getAttribute("date");
135
+ const dateDepot = metaElement?.querySelector("FRBRdate[name='#depot']")?.getAttribute("date");
136
+ const datePublicationXml = metaElement?.querySelector("FRBRdate[name='#publication-xml']")?.getAttribute("date");
137
+ return {
138
+ titre: preambleElement?.querySelector("docTitle")?.textContent || null,
139
+ titreCourt: metaElement?.querySelector("FRBRalias[name='intitule-court']")?.getAttribute("value") || null,
140
+ signetDossier: metaElement?.querySelector("FRBRalias[name='signet-dossier-legislatif-senat']")?.getAttribute("value") || null,
141
+ urlDossierSenat: metaElement?.querySelector("FRBRalias[name='url-senat']")?.getAttribute("value") || null,
142
+ urlDossierAssemblee: metaElement?.querySelector("FRBRalias[name='url-AN']")?.getAttribute("value") || null,
143
+ type: identificationParts?.["type"] || null,
144
+ session: sessionYears && sessionYears.length > 0 ? sessionYears[0] : null,
145
+ numero: identificationParts?.["numTexte"] ? parseInt(identificationParts["numTexte"]) : null,
146
+ datePresentation: datePresentation ? new Date(datePresentation) : null,
147
+ dateDepot: dateDepot ? new Date(dateDepot) : null,
148
+ datePublicationXml: datePublicationXml ? new Date(datePublicationXml) : null,
149
+ version: identificationParts?.["version"] ? identificationParts["version"] : null,
150
+ workflow: metaElement ? buildWorklow(metaElement) : [],
151
+ divisions: bodyElement ? flattenTexte(bodyElement) : [],
152
+ };
153
+ }
154
+ export function transformExposeDesMotifs(document) {
155
+ const sectionElements = document.querySelectorAll("section");
156
+ const exposeDesMotifsRegexp = new RegExp("EXPOS.{1,2}[\\n\\s]DES[\\n\\s]MOTIFS");
157
+ for (const sectionElement of sectionElements) {
158
+ const firstParagraph = sectionElement.querySelector("p:first-of-type");
159
+ const secondParagraph = sectionElement.querySelector("p:nth-of-type(2)");
160
+ if (!firstParagraph) {
161
+ continue;
162
+ }
163
+ const firstParagraphContent = firstParagraph.textContent;
164
+ const secondParagraphContent = secondParagraph?.textContent;
165
+ if (!firstParagraphContent || !exposeDesMotifsRegexp.test(firstParagraphContent.toUpperCase())) {
166
+ if (!secondParagraphContent || !exposeDesMotifsRegexp.test(secondParagraphContent.toUpperCase())) {
167
+ continue;
168
+ }
169
+ else {
170
+ secondParagraph.remove();
171
+ }
172
+ }
173
+ firstParagraph.remove();
174
+ return {
175
+ text: sectionElement.textContent?.trim() ?? null,
176
+ html: sectionElement.innerHTML?.trim() ?? null,
177
+ };
178
+ }
179
+ return null;
180
+ }
181
+ export function parseTexte(texteXml) {
182
+ try {
183
+ const { document } = new JSDOM(texteXml, {
184
+ contentType: "text/xml",
185
+ }).window;
186
+ return transformTexte(document);
187
+ }
188
+ catch (error) {
189
+ console.error(`Could not parse texte with error ${error}`);
190
+ }
191
+ return null;
192
+ }
193
+ // Prevent from memory leak
194
+ // https://github.com/jsdom/jsdom/issues/2583#issuecomment-559520814
195
+ export async function parseTexteFromFile(xmlFilePath) {
196
+ try {
197
+ const { document } = (await JSDOM.fromFile(xmlFilePath, { contentType: "text/xml" })).window;
198
+ return transformTexte(document);
199
+ }
200
+ catch (error) {
201
+ console.error(`Could not parse texte with error ${error}`);
202
+ }
203
+ return null;
204
+ }
205
+ export function parseExposeDesMotifs(exposeDesMotifsHtml) {
206
+ try {
207
+ const { document } = new JSDOM(exposeDesMotifsHtml, {
208
+ contentType: "text/html",
209
+ }).window;
210
+ return transformExposeDesMotifs(document);
211
+ }
212
+ catch (error) {
213
+ console.error(`Could not parse exposé des motifs with error ${error}`);
214
+ }
215
+ return null;
216
+ }
217
+ // Prevent from memory leak
218
+ // https://github.com/jsdom/jsdom/issues/2583#issuecomment-559520814
219
+ export async function parseExposeDesMotifsFromFile(htmlFilePath) {
220
+ try {
221
+ const { document } = (await JSDOM.fromFile(htmlFilePath, { contentType: "text/html" })).window;
222
+ return transformExposeDesMotifs(document);
223
+ }
224
+ catch (error) {
225
+ console.error(`Could not parse exposé des motifs with error ${error}`);
226
+ }
227
+ return null;
228
+ }
@@ -5,8 +5,9 @@ import path from "path";
5
5
  import pLimit from "p-limit";
6
6
  import * as git from "../git";
7
7
  import { datasets, EnabledDatasets, getEnabledDatasets } from "../datasets";
8
- import { DATA_ORIGINAL_FOLDER, DOCUMENT_METADATA_FILE, DOSLEG_DOSSIERS_FOLDER, SCRUTINS_FOLDER, RAPPORT_FOLDER, SENS_CIRCONSCRIPTIONS_FOLDER, SENS_ORGANISMES_FOLDER, SENS_SENATEURS_FOLDER, TEXTE_FOLDER, } from "../loaders";
8
+ import { DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER, DOCUMENT_METADATA_FILE, DOSLEG_DOSSIERS_FOLDER, SCRUTINS_FOLDER, RAPPORT_FOLDER, SENS_CIRCONSCRIPTIONS_FOLDER, SENS_ORGANISMES_FOLDER, SENS_SENATEURS_FOLDER, TEXTE_FOLDER, } from "../loaders";
9
9
  import { findAllAmendements, findAllCirconscriptions, findAllDebats, findAllDossiers, findAllScrutins, findAllOrganismes, findAllQuestions, findAllSens, findSenatRapportUrls, findSenatTexteUrls, } from "../model";
10
+ import { processRapport, processTexte } from "./retrieve_documents";
10
11
  import { createActesLegislatifs } from "../model/dosleg";
11
12
  import { UNDEFINED_SESSION } from "../types/sessions";
12
13
  import { getSessionFromDate, getSessionFromSignet } from "./datautil";
@@ -73,7 +74,7 @@ async function convertData() {
73
74
  }
74
75
  if (enabledDatasets & EnabledDatasets.Questions) {
75
76
  try {
76
- await convertDatasetQuestions(dataDir);
77
+ await convertDatasetQuestions(dataDir, options);
77
78
  const questionsDir = path.join(dataDir, datasets.questions.database);
78
79
  exitCode = commitGit(questionsDir, options, exitCode);
79
80
  }
@@ -83,7 +84,7 @@ async function convertData() {
83
84
  }
84
85
  if (enabledDatasets & EnabledDatasets.Sens) {
85
86
  try {
86
- await convertDatasetSens(dataDir);
87
+ await convertDatasetSens(dataDir, options);
87
88
  const sensDir = path.join(dataDir, datasets.sens.database);
88
89
  exitCode = commitGit(sensDir, options, exitCode);
89
90
  }
@@ -102,7 +103,9 @@ async function convertDatasetAmeli(dataDir, options) {
102
103
  console.log(`Converting database ${dataset.database} data into files…`);
103
104
  }
104
105
  const ameliReorganizedRootDir = path.join(dataDir, dataset.database);
105
- await fs.ensureDir(ameliReorganizedRootDir);
106
+ if (!options.keepDir) {
107
+ ensureAndClearDir(ameliReorganizedRootDir);
108
+ }
106
109
  for await (const amendement of findAllAmendements(options["fromSession"])) {
107
110
  if (options["verbose"]) {
108
111
  console.log(`Converting ${amendement["numero"]} file…`);
@@ -110,11 +113,9 @@ async function convertDatasetAmeli(dataDir, options) {
110
113
  const session = String(amendement["session"]) || UNDEFINED_SESSION;
111
114
  const signetDossierLegislatif = amendement["signet_dossier_legislatif"] ||
112
115
  `${amendement["nature_texte"]}-${amendement["numero_texte"]}`.toLowerCase();
113
- const ameliReorganizedDir = path.join(ameliReorganizedRootDir, String(session), signetDossierLegislatif);
114
- await fs.ensureDir(ameliReorganizedDir);
115
116
  const amendementFileName = `${amendement["numero"]}.json`;
116
- const filePath = path.join(ameliReorganizedDir, amendementFileName);
117
- await fs.writeJSON(filePath, amendement, { spaces: 2 });
117
+ const filePath = path.join(ameliReorganizedRootDir, String(session), signetDossierLegislatif, amendementFileName);
118
+ await fs.outputJSON(filePath, amendement, { spaces: 2 });
118
119
  }
119
120
  }
120
121
  async function convertDatasetDebats(dataDir, options) {
@@ -123,7 +124,9 @@ async function convertDatasetDebats(dataDir, options) {
123
124
  console.log(`Converting database ${dataset.database} data into files…`);
124
125
  }
125
126
  const debatsReorganizedRootDir = path.join(dataDir, dataset.database);
126
- ensureAndClearDir(debatsReorganizedRootDir);
127
+ if (!options.keepDir) {
128
+ ensureAndClearDir(debatsReorganizedRootDir);
129
+ }
127
130
  for await (const debat of findAllDebats()) {
128
131
  if (options["verbose"]) {
129
132
  console.log(`Converting ${debat.id} file…`);
@@ -132,11 +135,9 @@ async function convertDatasetDebats(dataDir, options) {
132
135
  if (options["fromSession"] && session < options["fromSession"]) {
133
136
  continue;
134
137
  }
135
- const debatsReorganizedDir = path.join(debatsReorganizedRootDir, String(session));
136
- await fs.ensureDir(debatsReorganizedDir);
137
138
  const debatFileName = `${debat.id}.json`;
138
- const filePath = path.join(debatsReorganizedDir, debatFileName);
139
- await fs.writeJSON(filePath, debat, { spaces: 2 });
139
+ const filePath = path.join(debatsReorganizedRootDir, String(session), debatFileName);
140
+ await fs.outputJSON(filePath, debat, { spaces: 2 });
140
141
  }
141
142
  }
142
143
  async function convertDatasetDosLeg(dataDir, options) {
@@ -146,8 +147,10 @@ async function convertDatasetDosLeg(dataDir, options) {
146
147
  }
147
148
  const doslegReorganizedRootDir = path.join(dataDir, dataset.database);
148
149
  const dossiersReorganizedDir = path.join(doslegReorganizedRootDir, DOSLEG_DOSSIERS_FOLDER);
149
- ensureAndClearDir(doslegReorganizedRootDir);
150
- ensureAndClearDir(dossiersReorganizedDir);
150
+ if (!options.keepDir) {
151
+ ensureAndClearDir(doslegReorganizedRootDir);
152
+ ensureAndClearDir(dossiersReorganizedDir);
153
+ }
151
154
  for await (const loi of findAllDossiers()) {
152
155
  if (options["verbose"]) {
153
156
  console.log(`Converting ${loi["signet"]} file…`);
@@ -158,16 +161,14 @@ async function convertDatasetDosLeg(dataDir, options) {
158
161
  continue;
159
162
  }
160
163
  loiReorganizedDir = path.join(dossiersReorganizedDir, String(session));
161
- await fs.ensureDir(loiReorganizedDir);
162
164
  // Ajout des actes législatifs au dossier
163
165
  const actesLegislatifs = createActesLegislatifs(loi);
164
166
  const loiWithActes = { ...loi, actes_legislatifs: actesLegislatifs };
165
167
  const dossierFile = `${loi["signet"]}.json`;
166
- const filePath = path.join(loiReorganizedDir, dossierFile);
167
- await fs.writeJSON(filePath, loiWithActes, { spaces: 2 });
168
+ await fs.outputJSON(path.join(loiReorganizedDir, dossierFile), loiWithActes, { spaces: 2 });
168
169
  }
169
- await convertTexteUrls(dataDir);
170
- await convertRapportUrls(dataDir);
170
+ await convertTexteUrls(dataDir, options);
171
+ await convertRapportUrls(dataDir, options);
171
172
  }
172
173
  async function convertDatasetScrutins(dataDir, options) {
173
174
  const dataset = datasets.dosleg;
@@ -175,7 +176,9 @@ async function convertDatasetScrutins(dataDir, options) {
175
176
  console.log(`Converting database scrutins (${dataset.database}) data into files…`);
176
177
  }
177
178
  const scrutinsReorganizedDir = path.join(dataDir, SCRUTINS_FOLDER);
178
- ensureAndClearDir(scrutinsReorganizedDir);
179
+ if (!options.keepDir) {
180
+ ensureAndClearDir(scrutinsReorganizedDir);
181
+ }
179
182
  for await (const scrutin of findAllScrutins(options["fromSession"])) {
180
183
  if (options["verbose"]) {
181
184
  console.log(`Converting ${scrutin["numero"]} file…`);
@@ -183,20 +186,21 @@ async function convertDatasetScrutins(dataDir, options) {
183
186
  let scrutinReorganizedDir = path.join(scrutinsReorganizedDir, String(UNDEFINED_SESSION));
184
187
  const session = scrutin["session"] || UNDEFINED_SESSION;
185
188
  scrutinReorganizedDir = path.join(scrutinsReorganizedDir, String(session));
186
- await fs.ensureDir(scrutinReorganizedDir);
187
189
  const scrutinFileName = `${scrutin["numero"]}.json`;
188
- await fs.writeJSON(path.join(scrutinReorganizedDir, scrutinFileName), scrutin, {
190
+ await fs.outputJSON(path.join(scrutinReorganizedDir, scrutinFileName), scrutin, {
189
191
  spaces: 2,
190
192
  });
191
193
  }
192
194
  }
193
- async function convertDatasetQuestions(dataDir) {
195
+ async function convertDatasetQuestions(dataDir, options) {
194
196
  const dataset = datasets.questions;
195
197
  if (!options["silent"]) {
196
198
  console.log(`Converting database ${dataset.database} data into files…`);
197
199
  }
198
200
  const questionsReorganizedRootDir = path.join(dataDir, dataset.database);
199
- ensureAndClearDir(questionsReorganizedRootDir);
201
+ if (!options.keepDir) {
202
+ ensureAndClearDir(questionsReorganizedRootDir);
203
+ }
200
204
  const limit = pLimit(10);
201
205
  const tasks = [];
202
206
  for await (const question of findAllQuestions()) {
@@ -205,22 +209,27 @@ async function convertDatasetQuestions(dataDir) {
205
209
  console.log(`Converting ${question["reference"]} file…`);
206
210
  }
207
211
  const legislature = question["legislature"] ? question["legislature"] : 0;
208
- const questionReorganizedDir = path.join(questionsReorganizedRootDir, String(legislature));
209
- await fs.ensureDir(questionReorganizedDir);
210
212
  const questionFileName = `${question["reference"]}.json`;
211
- await fs.writeJSON(path.join(questionReorganizedDir, questionFileName), question, { spaces: 2 });
213
+ await fs.outputJSON(path.join(questionsReorganizedRootDir, String(legislature), questionFileName), question, {
214
+ spaces: 2,
215
+ });
212
216
  }));
213
217
  }
214
218
  await Promise.all(tasks);
215
219
  }
216
- async function convertTexteUrls(dataDir) {
217
- const textesDir = path.join(dataDir, TEXTE_FOLDER);
218
- fs.ensureDirSync(textesDir);
219
- const originalTextesDir = path.join(textesDir, DATA_ORIGINAL_FOLDER);
220
+ async function convertTexteUrls(dataDir, options) {
221
+ const originalTextesDir = path.join(dataDir, TEXTE_FOLDER, DATA_ORIGINAL_FOLDER);
222
+ const transformedTextesDir = path.join(dataDir, TEXTE_FOLDER, DATA_TRANSFORMED_FOLDER);
223
+ if (!options["silent"]) {
224
+ console.log(`Converting database textes data into files…`);
225
+ }
220
226
  for await (const texte of findSenatTexteUrls()) {
227
+ const session = texte.session ?? UNDEFINED_SESSION;
228
+ if (options["fromSession"] && session < options["fromSession"]) {
229
+ continue;
230
+ }
221
231
  const texteName = path.parse(texte.url).name;
222
- const texteDir = path.join(originalTextesDir, `${texte.session ?? UNDEFINED_SESSION}`, texteName);
223
- fs.ensureDirSync(texteDir);
232
+ const texteDir = path.join(originalTextesDir, `${session}`, texteName);
224
233
  const metadata = {
225
234
  name: texteName,
226
235
  session: texte.session,
@@ -232,20 +241,27 @@ async function convertTexteUrls(dataDir) {
232
241
  url_html: new URL(`${texteName}.html`, SENAT_TEXTE_BASE_URL),
233
242
  url_pdf: new URL(`${texteName}.pdf`, SENAT_TEXTE_BASE_URL),
234
243
  };
235
- fs.writeJSONSync(path.join(texteDir, DOCUMENT_METADATA_FILE), metadata, {
244
+ fs.outputJSONSync(path.join(texteDir, DOCUMENT_METADATA_FILE), metadata, {
236
245
  spaces: 2,
237
246
  });
247
+ if (options.fetchDocuments) {
248
+ await processTexte(metadata, originalTextesDir, transformedTextesDir, options);
249
+ }
238
250
  }
239
251
  }
240
- async function convertRapportUrls(dataDir) {
241
- const rapportsDir = path.join(dataDir, RAPPORT_FOLDER);
242
- fs.ensureDirSync(rapportsDir);
243
- const originalTextesDir = path.join(rapportsDir, DATA_ORIGINAL_FOLDER);
252
+ async function convertRapportUrls(dataDir, options) {
253
+ const originalRapportsDir = path.join(dataDir, RAPPORT_FOLDER, DATA_ORIGINAL_FOLDER);
254
+ if (!options["silent"]) {
255
+ console.log(`Converting database rapports data into files…`);
256
+ }
244
257
  for await (const rapport of findSenatRapportUrls()) {
258
+ const session = rapport.session ?? UNDEFINED_SESSION;
259
+ if (options["fromSession"] && session < options["fromSession"]) {
260
+ continue;
261
+ }
245
262
  const parsedRapportUrl = path.parse(rapport.url);
246
263
  const rapportName = parsedRapportUrl.name;
247
- const rapportDir = path.join(originalTextesDir, `${rapport.session ?? UNDEFINED_SESSION}`, rapportName);
248
- fs.ensureDirSync(rapportDir);
264
+ const rapportDir = path.join(originalRapportsDir, `${session}`, rapportName);
249
265
  const rapportHtmlUrlBase = `${rapportName}_mono.html`;
250
266
  const rapportHtmlUrl = path.format({
251
267
  dir: parsedRapportUrl.dir,
@@ -263,12 +279,15 @@ async function convertRapportUrls(dataDir) {
263
279
  url_html: new URL(rapportHtmlUrl, SENAT_RAPPORT_BASE_URL),
264
280
  url_pdf: new URL(rapportPdfUrl, SENAT_RAPPORT_BASE_URL),
265
281
  };
266
- fs.writeJSONSync(path.join(rapportDir, DOCUMENT_METADATA_FILE), metadata, {
282
+ fs.outputJSONSync(path.join(rapportDir, DOCUMENT_METADATA_FILE), metadata, {
267
283
  spaces: 2,
268
284
  });
285
+ if (options.fetchDocuments) {
286
+ await processRapport(metadata, originalRapportsDir, options);
287
+ }
269
288
  }
270
289
  }
271
- async function convertDatasetSens(dataDir) {
290
+ async function convertDatasetSens(dataDir, options) {
272
291
  const dataset = datasets.sens;
273
292
  if (!options["silent"]) {
274
293
  console.log(`Converting database ${dataset.database} data into files…`);
@@ -277,16 +296,18 @@ async function convertDatasetSens(dataDir) {
277
296
  const senateursReorganizedDir = path.join(sensReorganizedRootDir, SENS_SENATEURS_FOLDER);
278
297
  const circonscriptionsReorganizedDir = path.join(sensReorganizedRootDir, SENS_CIRCONSCRIPTIONS_FOLDER);
279
298
  const organismesReorganizedDir = path.join(sensReorganizedRootDir, SENS_ORGANISMES_FOLDER);
280
- ensureAndClearDir(sensReorganizedRootDir);
281
- ensureAndClearDir(senateursReorganizedDir);
282
- ensureAndClearDir(circonscriptionsReorganizedDir);
283
- ensureAndClearDir(organismesReorganizedDir);
299
+ if (!options.keepDir) {
300
+ ensureAndClearDir(sensReorganizedRootDir);
301
+ ensureAndClearDir(senateursReorganizedDir);
302
+ ensureAndClearDir(circonscriptionsReorganizedDir);
303
+ ensureAndClearDir(organismesReorganizedDir);
304
+ }
284
305
  for await (const sen of findAllSens()) {
285
306
  if (options["verbose"]) {
286
307
  console.log(`Converting ${sen["matricule"]} file…`);
287
308
  }
288
309
  const senFileName = `${sen["matricule"]}.json`;
289
- fs.writeJSONSync(path.join(senateursReorganizedDir, senFileName), sen, {
310
+ fs.outputJSONSync(path.join(senateursReorganizedDir, senFileName), sen, {
290
311
  spaces: 2,
291
312
  });
292
313
  }
@@ -295,16 +316,18 @@ async function convertDatasetSens(dataDir) {
295
316
  console.log(`Converting ${circonscription["identifiant"]} file…`);
296
317
  }
297
318
  const circonscriptionFileName = `${circonscription["identifiant"]}.json`;
298
- fs.writeJSONSync(path.join(circonscriptionsReorganizedDir, circonscriptionFileName), circonscription, { spaces: 2 });
319
+ fs.outputJSONSync(path.join(circonscriptionsReorganizedDir, circonscriptionFileName), circonscription, {
320
+ spaces: 2,
321
+ });
299
322
  }
300
323
  for await (const organisme of findAllOrganismes()) {
301
324
  if (options["verbose"]) {
302
325
  console.log(`Converting ${organisme["code"]} file…`);
303
326
  }
304
327
  const organismeFileName = `${organisme["code"]}.json`;
305
- const organismeDir = path.join(organismesReorganizedDir, organisme["type_code"]);
306
- fs.ensureDirSync(organismeDir);
307
- fs.writeJSONSync(path.join(organismeDir, organismeFileName), organisme, { spaces: 2 });
328
+ fs.outputJSONSync(path.join(organismesReorganizedDir, organisme["type_code"], organismeFileName), organisme, {
329
+ spaces: 2,
330
+ });
308
331
  }
309
332
  }
310
333
  convertData()
@@ -5,5 +5,8 @@ try {
5
5
  execSync(`tsx src/scripts/convert_data.ts ${args}`, { stdio: "inherit" });
6
6
  }
7
7
  catch (error) {
8
- process.exit(1);
8
+ if (error.status !== 10) {
9
+ console.error("Error during data retrieval:", error);
10
+ process.exit(error.status || 1);
11
+ }
9
12
  }
@@ -1 +1,2 @@
1
- export {};
1
+ export declare function processTexte(texteMetadata: any, originalTextesDir: string, transformedTextesDir: string, options: any): Promise<void>;
2
+ export declare function processRapport(rapportMetadata: any, originalRapportsDir: string, options: any): Promise<void>;
@@ -4,17 +4,12 @@ import fs from "fs-extra";
4
4
  import { DateTime } from "luxon";
5
5
  import path from "path";
6
6
  import { DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER, iterLoadSenatDossiersLegislatifsRapportUrls, iterLoadSenatDossiersLegislatifsTexteUrls, RAPPORT_FOLDER, TEXTE_FOLDER, } from "../loaders";
7
- import { parseExposeDesMotifs, parseTexte, parseTexteFromFile } from "../model/texte";
7
+ import { parseExposeDesMotifs, parseTexte, parseTexteFromFile } from "../parsers/texte";
8
8
  import { getSessionsFromStart, UNDEFINED_SESSION } from "../types/sessions";
9
9
  import { commonOptions } from "./shared/cli_helpers";
10
10
  import { ensureAndClearDir, fetchWithRetry, isOptionEmptyOrHasValue } from "./shared/util";
11
11
  const optionsDefinitions = [
12
12
  ...commonOptions,
13
- {
14
- help: "parse and convert documents into JSON (textes only for now, requires format xml)",
15
- name: "parseDocuments",
16
- type: Boolean,
17
- },
18
13
  {
19
14
  alias: "F",
20
15
  help: "formats of documents to retrieve (xml/html/pdf for textes, html/pdf for rapports); leave empty for all",
@@ -38,205 +33,142 @@ const options = commandLineArgs(optionsDefinitions);
38
33
  const textDecoder = new TextDecoder("utf8");
39
34
  const today = DateTime.now();
40
35
  function isDocumentRecent(documentDate, daysThreshold) {
41
- if (!documentDate) {
36
+ if (!documentDate)
42
37
  return false;
43
- }
44
38
  const docDate = DateTime.fromISO(documentDate);
45
- if (!docDate.isValid) {
46
- return false;
47
- }
48
- const daysDiff = today.diff(docDate, "days").days;
49
- return daysDiff <= daysThreshold;
39
+ return docDate.isValid && today.diff(docDate, "days").days <= daysThreshold;
50
40
  }
51
- async function retrieveTextes(dataDir, sessions) {
52
- const textesDir = path.join(dataDir, TEXTE_FOLDER);
53
- fs.ensureDirSync(textesDir);
54
- const originalTextesDir = path.join(textesDir, DATA_ORIGINAL_FOLDER);
55
- const transformedTextesDir = path.join(textesDir, DATA_TRANSFORMED_FOLDER);
56
- if (options["parseDocuments"]) {
57
- ensureAndClearDir(transformedTextesDir);
41
+ function shouldDownload(filePath, docDate, options) {
42
+ if (options.force)
43
+ return true;
44
+ if (!fs.existsSync(filePath))
45
+ return true;
46
+ if (options.onlyRecent !== undefined) {
47
+ return isDocumentRecent(docDate, options.onlyRecent);
48
+ }
49
+ return false;
50
+ }
51
+ async function downloadDocument(documentUrl, verbose) {
52
+ if (verbose) {
53
+ console.log(`Downloading document ${documentUrl}…`);
58
54
  }
59
- let retrievedTextesCount = 0;
60
- const texteUrlsNotFoundOrError = [];
61
- const texteUrlsParseError = [];
62
- for (const session of sessions) {
63
- for (const { item: texteMetadata } of iterLoadSenatDossiersLegislatifsTexteUrls(dataDir, session)) {
64
- const texteDir = path.join(originalTextesDir, `${texteMetadata.session ?? UNDEFINED_SESSION}`, texteMetadata.name);
65
- fs.ensureDirSync(texteDir);
66
- let exposeDesMotifsContent = null;
67
- if (texteMetadata.url_expose_des_motifs) {
68
- exposeDesMotifsContent = await downloadExposeDesMotifs(texteDir, texteMetadata.name, String(texteMetadata.url_expose_des_motifs));
69
- }
70
- if (isOptionEmptyOrHasValue(options["formats"], "xml")) {
71
- const textePath = path.join(texteDir, `${texteMetadata.name}.xml`);
72
- let texteBuffer = null;
73
- // Check if document should be skipped based on onlyRecent option
74
- const shouldSkip = !options["force"] &&
75
- fs.existsSync(textePath) &&
76
- (options["only-recent"] === undefined || !isDocumentRecent(texteMetadata.date, options["only-recent"]));
77
- if (shouldSkip) {
78
- if (!options["silent"]) {
79
- console.info(`Already downloaded texte ${textePath}…`);
80
- }
81
- }
82
- else {
83
- texteBuffer = await downloadDocument(texteMetadata.url_xml.toString());
84
- if (!texteBuffer) {
85
- texteUrlsNotFoundOrError.push(texteMetadata.url_xml);
86
- continue;
87
- }
88
- fs.writeFileSync(textePath, Buffer.from(texteBuffer));
89
- retrievedTextesCount++;
90
- }
91
- if (options["parseDocuments"]) {
92
- const parsedTexte = await parseDocument(texteMetadata.session, transformedTextesDir, textePath, texteMetadata.name, texteBuffer, exposeDesMotifsContent);
93
- if (!parsedTexte) {
94
- texteUrlsParseError.push(texteMetadata.url_xml);
95
- }
96
- }
97
- }
98
- if (isOptionEmptyOrHasValue(options["formats"], "html")) {
99
- const textePath = path.join(texteDir, `${texteMetadata.name}.html`);
100
- // Check if document should be skipped based on onlyRecent option
101
- const shouldSkip = !options["force"] &&
102
- fs.existsSync(textePath) &&
103
- (options["only-recent"] === undefined || !isDocumentRecent(texteMetadata.date, options["only-recent"]));
104
- if (shouldSkip) {
105
- if (!options["silent"]) {
106
- console.info(`Already downloaded texte ${textePath}…`);
107
- }
108
- }
109
- else {
110
- const texteBuffer = await downloadDocument(texteMetadata.url_html.toString());
111
- if (!texteBuffer) {
112
- texteUrlsNotFoundOrError.push(texteMetadata.url_html);
113
- continue;
114
- }
115
- fs.writeFileSync(textePath, Buffer.from(texteBuffer));
116
- retrievedTextesCount++;
55
+ try {
56
+ const response = await fetchWithRetry(documentUrl);
57
+ if (!response.ok) {
58
+ if (response.status === 404) {
59
+ if (verbose) {
60
+ console.warn(`Document ${documentUrl} not found`);
117
61
  }
118
62
  }
119
- if (isOptionEmptyOrHasValue(options["formats"], "pdf")) {
120
- const textePath = path.join(texteDir, `${texteMetadata.name}.pdf`);
121
- // Check if document should be skipped based on onlyRecent option
122
- const shouldSkip = !options["force"] &&
123
- fs.existsSync(textePath) &&
124
- (options["only-recent"] === undefined || !isDocumentRecent(texteMetadata.date, options["only-recent"]));
125
- if (shouldSkip) {
126
- if (!options["silent"]) {
127
- console.info(`Already downloaded texte ${textePath}…`);
128
- }
129
- }
130
- else {
131
- const texteBuffer = await downloadDocument(texteMetadata.url_pdf.toString());
132
- if (!texteBuffer) {
133
- texteUrlsNotFoundOrError.push(texteMetadata.url_pdf);
134
- continue;
135
- }
136
- fs.writeFileSync(textePath, Buffer.from(texteBuffer));
137
- retrievedTextesCount++;
63
+ else {
64
+ if (verbose) {
65
+ console.error(`An error occurred while retrieving document ${documentUrl}: ${response.status}`);
138
66
  }
139
67
  }
68
+ return null;
140
69
  }
70
+ return response.arrayBuffer();
141
71
  }
142
- if (options["verbose"]) {
143
- console.log(`${retrievedTextesCount} textes retrieved`);
144
- console.log(`${texteUrlsNotFoundOrError.length} textes failed to be retrieved with URLs ${texteUrlsNotFoundOrError.join(", ")}`);
145
- if (options["parseDocuments"]) {
146
- console.log(`${texteUrlsParseError.length} textes failed to be parsed with URLs ${texteUrlsParseError.join(", ")}`);
147
- }
72
+ catch (error) {
73
+ console.error(error.message);
74
+ return null;
148
75
  }
149
76
  }
150
- async function retrieveRapports(dataDir, sessions) {
151
- const rapportsDir = path.join(dataDir, RAPPORT_FOLDER);
152
- fs.ensureDirSync(rapportsDir);
153
- const originalRapportsDir = path.join(rapportsDir, DATA_ORIGINAL_FOLDER);
154
- let retrievedRapportsCount = 0;
155
- const rapportUrlsNotFoundOrError = [];
156
- for (const session of sessions) {
157
- for (const { item: rapportMetadata } of iterLoadSenatDossiersLegislatifsRapportUrls(dataDir, session)) {
158
- const rapportDir = path.join(originalRapportsDir, `${rapportMetadata.session ?? UNDEFINED_SESSION}`, rapportMetadata.name);
159
- fs.ensureDirSync(rapportDir);
160
- if (isOptionEmptyOrHasValue(options["formats"], "html")) {
161
- const rapportPath = path.join(rapportDir, `${rapportMetadata.name}.html`);
162
- // Check if document should be skipped based on onlyRecent option
163
- const shouldSkip = !options["force"] &&
164
- fs.existsSync(rapportPath) &&
165
- (options["only-recent"] === undefined || !isDocumentRecent(rapportMetadata.date, options["only-recent"]));
166
- if (shouldSkip) {
167
- if (!options["silent"]) {
168
- console.info(`Already downloaded rapport ${rapportPath}…`);
169
- }
170
- continue;
171
- }
172
- const rapportBuffer = await downloadDocument(rapportMetadata.url_html.toString());
173
- if (!rapportBuffer) {
174
- rapportUrlsNotFoundOrError.push(rapportMetadata.url_html);
175
- continue;
176
- }
177
- fs.writeFileSync(rapportPath, Buffer.from(rapportBuffer));
178
- retrievedRapportsCount++;
179
- }
180
- if (isOptionEmptyOrHasValue(options["formats"], "pdf")) {
181
- const rapportPath = path.join(rapportDir, `${rapportMetadata.name}.pdf`);
182
- // Check if document should be skipped based on onlyRecent option
183
- const shouldSkip = !options["force"] &&
184
- fs.existsSync(rapportPath) &&
185
- (options["only-recent"] === undefined || !isDocumentRecent(rapportMetadata.date, options["only-recent"]));
186
- if (shouldSkip) {
187
- if (!options["silent"]) {
188
- console.info(`Already downloaded rapport ${rapportPath}…`);
189
- }
190
- continue;
191
- }
192
- const rapportBuffer = await downloadDocument(rapportMetadata.url_pdf.toString());
193
- if (!rapportBuffer) {
194
- rapportUrlsNotFoundOrError.push(rapportMetadata.url_pdf);
195
- continue;
196
- }
197
- fs.writeFileSync(rapportPath, Buffer.from(rapportBuffer));
198
- retrievedRapportsCount++;
77
+ async function processDocument(url, destPath, docDate, options) {
78
+ if (!shouldDownload(destPath, docDate, options)) {
79
+ if (options.verbose)
80
+ console.info(`Already downloaded ${destPath}…`);
81
+ return { success: true, skipped: true, buffer: null };
82
+ }
83
+ const arrayBuffer = await downloadDocument(url, options.verbose);
84
+ if (!arrayBuffer) {
85
+ return { success: false, skipped: false, buffer: null };
86
+ }
87
+ const buffer = Buffer.from(arrayBuffer);
88
+ await fs.outputFile(destPath, buffer);
89
+ return { success: true, skipped: false, buffer };
90
+ }
91
+ export async function processTexte(texteMetadata, originalTextesDir, transformedTextesDir, options) {
92
+ const texteDir = path.join(originalTextesDir, `${texteMetadata.session ?? UNDEFINED_SESSION}`, texteMetadata.name);
93
+ let exposeDesMotifsContent = null;
94
+ if (texteMetadata.url_expose_des_motifs) {
95
+ const exposePath = path.join(texteDir, `${texteMetadata.name}-expose.html`);
96
+ const res = await processDocument(texteMetadata.url_expose_des_motifs.toString(), exposePath, texteMetadata.date, options);
97
+ if (res.buffer) {
98
+ exposeDesMotifsContent = res.buffer;
99
+ }
100
+ else if (res.skipped && options.parseDocuments) {
101
+ if (await fs.pathExists(exposePath)) {
102
+ exposeDesMotifsContent = await fs.readFile(exposePath);
199
103
  }
200
104
  }
201
105
  }
202
- if (options["verbose"]) {
203
- console.log(`${retrievedRapportsCount} rapports retrieved`);
204
- console.log(`${rapportUrlsNotFoundOrError.length} rapports failed with URLs ${rapportUrlsNotFoundOrError.join(", ")}`);
106
+ const formats = [
107
+ { type: "xml", url: texteMetadata.url_xml, isParseTarget: true },
108
+ { type: "html", url: texteMetadata.url_html, isParseTarget: false },
109
+ { type: "pdf", url: texteMetadata.url_pdf, isParseTarget: false },
110
+ ];
111
+ for (const format of formats) {
112
+ if (!isOptionEmptyOrHasValue(options.formats, format.type))
113
+ continue;
114
+ const destPath = path.join(texteDir, `${texteMetadata.name}.${format.type}`);
115
+ const result = await processDocument(format.url.toString(), destPath, texteMetadata.date, options);
116
+ // Specific logic: Parsing (Only applies to XML)
117
+ if (format.isParseTarget && options.parseDocuments) {
118
+ await parseDocument(texteMetadata.session, transformedTextesDir, destPath, texteMetadata.name, result.buffer, exposeDesMotifsContent, options);
119
+ }
205
120
  }
206
121
  }
207
- async function downloadExposeDesMotifs(texteDir, texteName, url) {
208
- const content = await downloadDocument(url);
209
- if (!content) {
210
- return null;
122
+ export async function processRapport(rapportMetadata, originalRapportsDir, options) {
123
+ const rapportDir = path.join(originalRapportsDir, `${rapportMetadata.session ?? UNDEFINED_SESSION}`, rapportMetadata.name);
124
+ const formats = [
125
+ { type: "html", url: rapportMetadata.url_html },
126
+ { type: "pdf", url: rapportMetadata.url_pdf },
127
+ ];
128
+ for (const format of formats) {
129
+ if (!isOptionEmptyOrHasValue(options["formats"], format.type))
130
+ continue;
131
+ const destPath = path.join(rapportDir, `${rapportMetadata.name}.${format.type}`);
132
+ await processDocument(format.url.toString(), destPath, rapportMetadata.date, options);
211
133
  }
212
- const exposeDesMotifsPath = path.join(texteDir, `${texteName}-expose.html`);
213
- fs.writeFileSync(exposeDesMotifsPath, Buffer.from(content));
214
- return content;
215
134
  }
216
- async function downloadDocument(documentUrl) {
217
- if (!options["silent"]) {
218
- console.log(`Downloading document ${documentUrl}…`);
135
+ async function retrieveTextes(dataDir, sessions) {
136
+ const originalTextesDir = path.join(dataDir, TEXTE_FOLDER, DATA_ORIGINAL_FOLDER);
137
+ const transformedTextesDir = path.join(dataDir, TEXTE_FOLDER, DATA_TRANSFORMED_FOLDER);
138
+ if (options["parseDocuments"]) {
139
+ ensureAndClearDir(transformedTextesDir);
219
140
  }
220
- try {
221
- const response = await fetchWithRetry(documentUrl);
222
- if (!response.ok) {
223
- if (response.status === 404) {
224
- console.warn(`Texte ${documentUrl} not found`);
225
- }
226
- else {
227
- console.error(`An error occurred while retrieving document ${documentUrl}: ${response.status}`);
228
- }
229
- return null;
141
+ const dlOptions = {
142
+ force: options["force"],
143
+ silent: options["silent"],
144
+ verbose: options["verbose"],
145
+ onlyRecent: options["only-recent"],
146
+ formats: options["formats"],
147
+ parseDocuments: options["parseDocuments"],
148
+ };
149
+ for (const session of sessions) {
150
+ for (const { item: texteMetadata } of iterLoadSenatDossiersLegislatifsTexteUrls(dataDir, session)) {
151
+ await processTexte(texteMetadata, originalTextesDir, transformedTextesDir, dlOptions);
230
152
  }
231
- return response.arrayBuffer();
232
153
  }
233
- catch (error) {
234
- console.error(error.message);
235
- return null;
154
+ }
155
+ async function retrieveRapports(dataDir, sessions) {
156
+ const originalRapportsDir = path.join(dataDir, RAPPORT_FOLDER, DATA_ORIGINAL_FOLDER);
157
+ const dlOptions = {
158
+ force: options["force"],
159
+ silent: options["silent"],
160
+ verbose: options["verbose"],
161
+ onlyRecent: options["only-recent"],
162
+ formats: options["formats"],
163
+ };
164
+ for (const session of sessions) {
165
+ for (const { item: rapportMetadata } of iterLoadSenatDossiersLegislatifsRapportUrls(dataDir, session)) {
166
+ await processRapport(rapportMetadata, originalRapportsDir, dlOptions);
167
+ }
236
168
  }
237
169
  }
238
- async function parseDocument(session, transformedTextesDir, textePath, texteName, texteBuffer, exposeDesMotifs = null) {
239
- if (!options["silent"]) {
170
+ async function parseDocument(session, transformedTextesDir, textePath, texteName, texteBuffer, exposeDesMotifs = null, options = {}) {
171
+ if (options.verbose) {
240
172
  console.log(`Parsing texte ${textePath}…`);
241
173
  }
242
174
  let parsedTexte;
@@ -247,19 +179,17 @@ async function parseDocument(session, transformedTextesDir, textePath, texteName
247
179
  else {
248
180
  parsedTexte = await parseTexteFromFile(textePath);
249
181
  }
250
- if (!parsedTexte) {
182
+ if (!parsedTexte)
251
183
  return null;
252
- }
253
184
  if (exposeDesMotifs) {
254
- if (!options["silent"]) {
185
+ if (options.verbose) {
255
186
  console.log("Parsing exposé des motifs…");
256
187
  }
257
188
  const exposeDesMotifsHtml = textDecoder.decode(exposeDesMotifs);
258
189
  parsedTexte.exposeDesMotifs = parseExposeDesMotifs(exposeDesMotifsHtml);
259
190
  }
260
191
  const transformedTexteDir = path.join(transformedTextesDir, `${session ?? UNDEFINED_SESSION}`, texteName);
261
- fs.ensureDirSync(transformedTexteDir);
262
- fs.writeJSONSync(path.join(transformedTexteDir, `${texteName}.json`), parsedTexte, { spaces: 2 });
192
+ await fs.outputJSON(path.join(transformedTexteDir, `${texteName}.json`), parsedTexte, { spaces: 2 });
263
193
  return parsedTexte;
264
194
  }
265
195
  async function main() {
@@ -277,9 +207,11 @@ async function main() {
277
207
  console.timeEnd("documents processing time");
278
208
  }
279
209
  }
280
- main()
281
- .then(() => process.exit(0))
282
- .catch((error) => {
283
- console.log(error);
284
- process.exit(1);
285
- });
210
+ if (process.argv[1].endsWith("retrieve_documents.ts")) {
211
+ main()
212
+ .then(() => process.exit(0))
213
+ .catch((error) => {
214
+ console.log(error);
215
+ process.exit(1);
216
+ });
217
+ }
@@ -8,7 +8,7 @@ import * as cheerio from "cheerio";
8
8
  import { AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, iterLoadSenatAgendas } from "../loaders";
9
9
  import { getSessionsFromStart } from "../types/sessions";
10
10
  import { commonOptions } from "./shared/cli_helpers";
11
- import { getFirstInterventionStartTimecode } from "../utils/nvs-timecode";
11
+ import { getAgendaSegmentTimecodes } from "../utils/nvs-timecode";
12
12
  import { decodeHtmlEntities } from "../utils/string_cleaning";
13
13
  import { dice, normalize, scoreVideo } from "../utils/scoring";
14
14
  import { epochToParisDateTime, toFRDate, toTargetEpoch } from "../utils/date";
@@ -379,16 +379,22 @@ async function processGroupedReunion(agenda, session, dataDir) {
379
379
  console.warn(e);
380
380
  }
381
381
  }
382
- let timecodeDebutVideo = null;
383
- if (dataTxt && finalTxt) {
384
- timecodeDebutVideo = getFirstInterventionStartTimecode(dataTxt, finalTxt);
385
- if (timecodeDebutVideo === null) {
386
- console.warn(`[warn] Cannot retrieve start video timecode from reunion` + reunionUid);
387
- }
388
- }
389
382
  // ==== 4) Update agenda file (only if accepted + m3u8) ====
390
383
  if ((accepted || skipDownload) && master) {
391
384
  const agendaJsonPath = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, String(session), `${agenda.uid}.json`);
385
+ let timecodeDebutVideo = null;
386
+ let timecodeFinVideo = null;
387
+ if (dataTxt && finalTxt) {
388
+ const agendaKey = agenda.titre || agenda.objet || "";
389
+ const seg = getAgendaSegmentTimecodes(dataTxt, finalTxt, agendaKey);
390
+ if (!seg) {
391
+ console.warn(`[warn] Cannot retrieve agenda segment timecodes from reunion ${reunionUid}`);
392
+ }
393
+ else {
394
+ timecodeDebutVideo = seg.start;
395
+ timecodeFinVideo = seg.end;
396
+ }
397
+ }
392
398
  if (await fs.pathExists(agendaJsonPath)) {
393
399
  const raw = await fsp.readFile(agendaJsonPath, "utf-8");
394
400
  let obj;
@@ -403,6 +409,7 @@ async function processGroupedReunion(agenda, session, dataDir) {
403
409
  const next = { ...obj, urlVideo: master };
404
410
  if (timecodeDebutVideo != null) {
405
411
  next.timecodeDebutVideo = timecodeDebutVideo;
412
+ next.timecodeFinVideo = timecodeFinVideo;
406
413
  }
407
414
  await writeIfChanged(agendaJsonPath, JSON.stringify(next, null, 2));
408
415
  if (!options["silent"]) {
@@ -64,6 +64,16 @@ export declare const pullOption: {
64
64
  name: string;
65
65
  type: BooleanConstructor;
66
66
  };
67
+ export declare const fetchDocumentsOption: {
68
+ help: string;
69
+ name: string;
70
+ type: BooleanConstructor;
71
+ };
72
+ export declare const parseDocumentsOption: {
73
+ help: string;
74
+ name: string;
75
+ type: BooleanConstructor;
76
+ };
67
77
  export declare const commonOptions: ({
68
78
  defaultOption: boolean;
69
79
  help: string;
@@ -64,6 +64,16 @@ export const pullOption = {
64
64
  name: "pull",
65
65
  type: Boolean,
66
66
  };
67
+ export const fetchDocumentsOption = {
68
+ help: "download documents",
69
+ name: "fetchDocuments",
70
+ type: Boolean,
71
+ };
72
+ export const parseDocumentsOption = {
73
+ help: "parse documents",
74
+ name: "parseDocuments",
75
+ type: Boolean,
76
+ };
67
77
  export const commonOptions = [
68
78
  categoriesOption,
69
79
  dataDirDefaultOption,
@@ -76,4 +86,6 @@ export const commonOptions = [
76
86
  commitOption,
77
87
  remoteOption,
78
88
  pullOption,
89
+ fetchDocumentsOption,
90
+ parseDocumentsOption,
79
91
  ];
@@ -1,29 +1,18 @@
1
- import { iterLoadSenatScrutins } from "../loaders";
1
+ import { iterLoadSenatAmendements, iterLoadSenatDossiersLegislatifs } from "../loaders";
2
2
  import commandLineArgs from "command-line-args";
3
3
  import { dataDirDefaultOption } from "./shared/cli_helpers";
4
4
  const optionsDefinitions = [dataDirDefaultOption];
5
5
  const options = commandLineArgs(optionsDefinitions);
6
- const noValidation = false;
7
6
  const session = 2024;
8
- const s = new Set();
9
- for (const { item: scrutin } of iterLoadSenatScrutins(options["dataDir"], session, { noValidation: noValidation })) {
10
- s.add(scrutin["lecture_libelle"]);
7
+ const sinceCommit = undefined;
8
+ for (const { item: amendement } of iterLoadSenatAmendements(options["dataDir"], session, {
9
+ log: true,
10
+ sinceCommit: sinceCommit,
11
+ })) {
12
+ console.log(amendement["numero"]);
11
13
  }
12
- console.log(s);
13
- /*
14
- for (const { item: amendement } of iterLoadSenatAmendements(
15
- options["dataDir"],
16
- session,
17
- { noValidation: noValidation },
18
- )) {
19
- console.log(amendement["numero"])
14
+ for (const { item: dossierLegislatif } of iterLoadSenatDossiersLegislatifs(options["dataDir"], session, {
15
+ sinceCommit: sinceCommit,
16
+ })) {
17
+ console.log(dossierLegislatif["numero"]);
20
18
  }
21
-
22
- for (const { item: dossierLegislatif } of iterLoadSenatDossiersLegislatifs(
23
- options["dataDir"],
24
- session,
25
- { noValidation: noValidation },
26
- )) {
27
- console.log(dossierLegislatif["numero"])
28
- }
29
- */
@@ -31,6 +31,7 @@ export interface Reunion {
31
31
  transcriptionRef?: string;
32
32
  urlVideo?: string;
33
33
  timecodeDebutVideo?: number;
34
+ timecodeFinVideo?: number;
34
35
  odj?: ReunionOdj;
35
36
  }
36
37
  export interface ReunionOdjPoint {
@@ -1 +1,7 @@
1
- export declare function getFirstInterventionStartTimecode(dataNvs: string, finalPlayerNvs: string): number | null;
1
+ export declare function getAgendaSegmentTimecodes(dataNvs: string, finalPlayerNvs: string, agendaTitleOrObjet: string): {
2
+ start: number;
3
+ end: number | null;
4
+ chapterId: string;
5
+ nextChapterId: string | null;
6
+ score: number;
7
+ } | null;
@@ -1,4 +1,7 @@
1
1
  import { XMLParser } from "fast-xml-parser";
2
+ import { dice, normalize } from "./scoring";
3
+ import { decodeHtmlEntities } from "./string_cleaning";
4
+ const CHAPTER_MATCH_THRESHOLD = 0.5;
2
5
  const xmlParser = new XMLParser({
3
6
  ignoreAttributes: false,
4
7
  attributeNamePrefix: "@_",
@@ -20,20 +23,57 @@ function getTimecodeForChapterId(finalPlayerNvs, chapterId) {
20
23
  return null;
21
24
  return Math.floor(ms / 1000);
22
25
  }
23
- export function getFirstInterventionStartTimecode(dataNvs, finalPlayerNvs) {
24
- const firstChapterId = getFirstChapterId(dataNvs);
25
- if (!firstChapterId)
26
- return null;
27
- return getTimecodeForChapterId(finalPlayerNvs, firstChapterId);
26
+ function toArray(v) {
27
+ if (!v)
28
+ return [];
29
+ return Array.isArray(v) ? v : [v];
28
30
  }
29
- function getFirstChapterId(dataNvs) {
31
+ function getLevel1Chapters(dataNvs) {
30
32
  const xml = xmlParser.parse(dataNvs);
31
- const rootChapters = xml?.data?.chapters?.chapter;
32
- if (!rootChapters)
33
+ const root = xml?.data?.chapters?.chapter ?? xml?.chapters?.chapter;
34
+ const roots = toArray(root);
35
+ return roots
36
+ .map((ch, i) => {
37
+ const id = ch?.id ?? ch?.["@_id"];
38
+ const labelRaw = ch?.label ?? ch?.["@_label"] ?? "";
39
+ return {
40
+ id: String(id),
41
+ label: decodeHtmlEntities(String(labelRaw)).trim(),
42
+ index: i,
43
+ };
44
+ })
45
+ .filter((c) => c.id && c.label);
46
+ }
47
+ function pickBestLevel1ChapterForAgenda(chapters, agendaTitle) {
48
+ const q = normalize(agendaTitle);
49
+ let best = null;
50
+ for (const ch of chapters) {
51
+ const s = dice(q, ch.label);
52
+ if (!best || s > best.score)
53
+ best = { chapter: ch, score: s };
54
+ }
55
+ if (!best || best.score < CHAPTER_MATCH_THRESHOLD)
56
+ return { chapter: chapters[0], score: 0 };
57
+ return best;
58
+ }
59
+ export function getAgendaSegmentTimecodes(dataNvs, finalPlayerNvs, agendaTitleOrObjet) {
60
+ const l1 = getLevel1Chapters(dataNvs);
61
+ if (!l1.length)
62
+ return null;
63
+ const best = pickBestLevel1ChapterForAgenda(l1, agendaTitleOrObjet);
64
+ if (!best)
33
65
  return null;
34
- const chaptersArray = Array.isArray(rootChapters) ? rootChapters : [rootChapters];
35
- const firstChapter = chaptersArray[0];
36
- if (!firstChapter || !firstChapter["@_id"])
66
+ const chapter = best.chapter;
67
+ const next = l1[chapter.index + 1] ?? null;
68
+ const start = getTimecodeForChapterId(finalPlayerNvs, chapter.id);
69
+ if (start == null)
37
70
  return null;
38
- return String(firstChapter["@_id"]);
71
+ const end = next ? getTimecodeForChapterId(finalPlayerNvs, next.id) : null;
72
+ return {
73
+ start,
74
+ end,
75
+ chapterId: chapter.id,
76
+ nextChapterId: next?.id ?? null,
77
+ score: best.score,
78
+ };
39
79
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tricoteuses/senat",
3
- "version": "2.20.20",
3
+ "version": "2.20.22",
4
4
  "description": "Handle French Sénat's open data",
5
5
  "keywords": [
6
6
  "France",
@@ -52,7 +52,6 @@
52
52
  "data:retrieve_open_data": "tsx src/scripts/retrieve_open_data.ts --all",
53
53
  "data:retrieve_senateurs_photos": "tsx src/scripts/retrieve_senateurs_photos.ts --fetch",
54
54
  "data:retrieve_videos": "tsx src/scripts/retrieve_videos.ts",
55
- "data:parse_textes_lois": "tsx src/scripts/parse_textes.ts",
56
55
  "prepare": "npm run build",
57
56
  "prepublishOnly": "npm run build",
58
57
  "prettier": "prettier --write 'src/**/*.ts' 'tests/**/*.test.ts'",