@tricoteuses/senat 2.20.21 → 2.20.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -1
- package/lib/loaders.d.ts +2 -1
- package/lib/loaders.js +48 -3
- package/lib/model/dosleg.js +1 -1
- package/lib/parsers/texte.d.ts +7 -0
- package/lib/parsers/texte.js +228 -0
- package/lib/scripts/convert_data.js +75 -52
- package/lib/scripts/data-download.js +4 -1
- package/lib/scripts/retrieve_documents.d.ts +2 -1
- package/lib/scripts/retrieve_documents.js +124 -192
- package/lib/scripts/shared/cli_helpers.d.ts +10 -0
- package/lib/scripts/shared/cli_helpers.js +12 -0
- package/lib/scripts/test_iter_load.js +11 -22
- package/package.json +1 -2
package/README.md
CHANGED
@@ -43,7 +43,6 @@ npm run data:download ../senat-data

  - `npm run data:download <dir>`: Download, convert data to JSON
  - `npm run data:retrieve_documents <dir>`: Retrieval of textes and rapports from Sénat's website
- - `npm run data:parse_textes_lois <dir>`: Parse textes (requires xml files)
  - `npm run data:retrieve_agenda <dir>`: Retrieval of agenda from Sénat's website
  - `npm run data:retrieve_cr_seance <dir>`: Retrieval of comptes-rendus de séance from Sénat's data
  - `npm run data:retrieve_cr_commission <dir>`: Retrieval of comptes-rendus de commissions from Sénat's website
package/lib/loaders.d.ts
CHANGED
@@ -13,11 +13,11 @@ export declare const COMPTES_RENDUS_FOLDER = "seances";
  export declare const COMMISSION_FOLDER = "commissions";
  export declare const DOSLEG_DOSSIERS_FOLDER = "dossiers";
  export declare const SCRUTINS_FOLDER = "scrutins";
- export declare const RAPPORT_FOLDER = "rap";
  export declare const SENS_CIRCONSCRIPTIONS_FOLDER = "circonscriptions";
  export declare const SENS_ORGANISMES_FOLDER = "organismes";
  export declare const SENS_SENATEURS_FOLDER = "senateurs";
  export declare const TEXTE_FOLDER = "leg";
+ export declare const RAPPORT_FOLDER = "rap";
  export declare const DATA_ORIGINAL_FOLDER = "original";
  export declare const DATA_TRANSFORMED_FOLDER = "transformed";
  export declare const DOCUMENT_METADATA_FILE = "metadata.json";
@@ -25,6 +25,7 @@ export type IterItem<T> = {
      item: T;
      filePathFromDataset?: string;
      legislature?: number;
+     gitStatus?: "A" | "M" | "D" | "R" | "C" | "T" | "U";
  };
  export interface TexteMetadata {
      name: string;
package/lib/loaders.js
CHANGED
@@ -1,6 +1,7 @@
  import fsex from "fs-extra";
  import fs from "fs";
  import path from "path";
+ import * as git from "./git";
  import { datasets } from "./datasets";
  import { UNDEFINED_SESSION } from "./types/sessions";
  export { EnabledDatasets } from "./datasets";
@@ -9,11 +10,11 @@ export const COMPTES_RENDUS_FOLDER = "seances";
  export const COMMISSION_FOLDER = "commissions";
  export const DOSLEG_DOSSIERS_FOLDER = "dossiers";
  export const SCRUTINS_FOLDER = "scrutins";
- export const RAPPORT_FOLDER = "rap";
  export const SENS_CIRCONSCRIPTIONS_FOLDER = "circonscriptions";
  export const SENS_ORGANISMES_FOLDER = "organismes";
  export const SENS_SENATEURS_FOLDER = "senateurs";
  export const TEXTE_FOLDER = "leg";
+ export const RAPPORT_FOLDER = "rap";
  export const DATA_ORIGINAL_FOLDER = "original";
  export const DATA_TRANSFORMED_FOLDER = "transformed";
  export const DOCUMENT_METADATA_FILE = "metadata.json";
@@ -30,7 +31,7 @@ export function* iterFilePaths(dirPath) {
          }
      }
  }
- function* iterLoadSenatItems(dataDir, dataName, legislatureOrSession, subDir, { log = false } = {}) {
+ function* iterLoadSenatItems(dataDir, dataName, legislatureOrSession, subDir, { log = false, sinceCommit } = {}) {
      let itemsDir = path.join(dataDir, dataName);
      if (subDir) {
          itemsDir = path.join(itemsDir, subDir);
@@ -38,9 +39,26 @@ function* iterLoadSenatItems(dataDir, dataName, legislatureOrSession, subDir, {
      if (legislatureOrSession) {
          itemsDir = path.join(itemsDir, String(legislatureOrSession));
      }
+     // Get changed files if sinceCommit is specified (excluding deleted files)
+     const changedFiles = sinceCommit
+         ? git.getChangedFilesSinceCommit(itemsDir, sinceCommit, {
+             diffFilter: "AMR", // Added, Modified, Renamed
+         })
+         : null;
+     if (log && sinceCommit) {
+         console.log(`Filtering files changed since commit ${sinceCommit} in ${itemsDir}`);
+         console.log(`Found ${changedFiles?.size || 0} changed files (AMR)`);
+     }
      for (const filePath of iterFilePaths(itemsDir)) {
+         const relativePath = path.relative(path.join(dataDir, dataName), filePath);
+         const gitStatus = changedFiles?.get(relativePath);
+         // Filter by changed files if sinceCommit is specified
+         if (changedFiles && !gitStatus) {
+             // Skip files not in the change set
+             continue;
+         }
          if (log) {
-             console.log(`Loading file: ${filePath}
+             console.log(`Loading file: ${filePath}…${gitStatus ? ` (${gitStatus})` : ""}`);
          }
          let item;
          try {
@@ -56,8 +74,35 @@ function* iterLoadSenatItems(dataDir, dataName, legislatureOrSession, subDir, {
              item,
              filePathFromDataset,
              legislature: legislatureOrSession,
+             ...(gitStatus && { gitStatus }), // Include gitStatus
          };
      }
+     // Yield deleted files at the end if sinceCommit is specified
+     if (sinceCommit) {
+         const deletedFiles = git.getChangedFilesSinceCommit(itemsDir, sinceCommit, {
+             diffFilter: "D", // Deleted
+         });
+         if (log) {
+             console.log(`Found ${deletedFiles.size || 0} deleted files (D)`);
+         }
+         for (const [relativePath, status] of deletedFiles.entries()) {
+             const deletedFilePath = path.join(itemsDir, relativePath);
+             if (log) {
+                 console.log(`Deleted file: ${deletedFilePath}`);
+             }
+             // Extract UID from filename (remove extension) for the placeholder item
+             const fileExtension = path.extname(relativePath) || ".json"; // Assuming files use an extension like .json
+             const filename = path.basename(relativePath, fileExtension);
+             const fakeItem = { uid: filename }; // Placeholder item using uid constraint
+             const filePathFromDataset = deletedFilePath.substring(deletedFilePath.indexOf(dataName) + dataName.length);
+             yield {
+                 item: fakeItem,
+                 filePathFromDataset,
+                 legislature: legislatureOrSession,
+                 gitStatus: status,
+             };
+         }
+     }
  }
  export function* iterLoadSenatAmendements(dataDir, session, options = {}) {
      for (const amendementItem of iterLoadSenatItems(dataDir, datasets.ameli.database, session, undefined, options)) {
package/lib/model/dosleg.js
CHANGED
@@ -249,7 +249,7 @@ export function createActesLegislatifs(dossier) {
              code_acte: `${codeParent}-DEBATS-SEANCE`,
              date: lectureAss["dates_seances"][0]?.["date"],
              id: lectureAss["id"],
-             numero: lectureAss["numero"]
+             numero: lectureAss["numero"],
          });
      }
      const { textes, rapports, ...lectureAssWithoutTextes } = lectureAss;
package/lib/parsers/texte.d.ts
ADDED
@@ -0,0 +1,7 @@
+ import { ExposeDesMotifs, FlatTexte } from "../types/texte";
+ export declare function transformTexte(document: Document): FlatTexte | null;
+ export declare function transformExposeDesMotifs(document: Document): ExposeDesMotifs | null;
+ export declare function parseTexte(texteXml: string): FlatTexte | null;
+ export declare function parseTexteFromFile(xmlFilePath: string): Promise<FlatTexte | null>;
+ export declare function parseExposeDesMotifs(exposeDesMotifsHtml: string): ExposeDesMotifs | null;
+ export declare function parseExposeDesMotifsFromFile(htmlFilePath: string): Promise<ExposeDesMotifs | null>;
package/lib/parsers/texte.js
ADDED
@@ -0,0 +1,228 @@
+ import { JSDOM } from "jsdom";
+ import { AKN_IDENTIFICATION_STRUCTURE_REGEXP, AKN_WORKFLOW_IDENTIFICATION_STRUCTURE_REGEXP } from "../scripts/datautil";
+ import { DivisionType, } from "../types/texte";
+ function buildWorklow(metaElement) {
+     const stepElements = metaElement.querySelectorAll("workflow step");
+     const steps = [];
+     for (const stepElement of stepElements) {
+         const identification = stepElement.getAttribute("href") ?? "";
+         const identificationParts = AKN_WORKFLOW_IDENTIFICATION_STRUCTURE_REGEXP.exec(identification)?.groups;
+         steps.push({
+             eId: stepElement.getAttribute("eId"),
+             date: stepElement.getAttribute("date") ? new Date(stepElement.getAttribute("date") ?? "") : null,
+             type: identificationParts?.["type"] || null,
+             session: identificationParts?.["session"] || null,
+             numero: identificationParts?.["numTexte"] || null,
+             version: identificationParts?.["version"] ? identificationParts["version"] : null,
+             outcome: stepElement.getAttribute("outcome"),
+         });
+     }
+     return steps;
+ }
+ function buildDivision(node, index) {
+     const eId = node.getAttribute("eId");
+     const tag = node.nodeName;
+     const level = DivisionType[tag];
+     const titleNode = node.querySelector("num");
+     const subtitleNode = node.querySelector("heading");
+     const headings = [
+         ...(titleNode
+             ? [
+                 {
+                     text: titleNode.textContent?.trim() ?? null,
+                     html: titleNode.innerHTML?.trim() ?? null,
+                 },
+             ]
+             : []),
+         ...(subtitleNode
+             ? [
+                 {
+                     text: subtitleNode.textContent?.trim() ?? null,
+                     html: subtitleNode.innerHTML?.trim() ?? null,
+                 },
+             ]
+             : []),
+     ];
+     const division = {
+         index,
+         eId,
+         tag,
+         level,
+         headings,
+     };
+     if (tag === "article") {
+         ;
+         division.alineas = [];
+     }
+     return division;
+ }
+ function buildAlinea(contentNode, alineaNode) {
+     const eId = alineaNode.getAttribute("eId");
+     const heading = {
+         text: alineaNode.querySelector("num")?.textContent ?? null,
+     };
+     const pastille = alineaNode.getAttribute("data:pastille") ?? null;
+     return {
+         eId,
+         heading,
+         text: contentNode.textContent?.trim() ?? null,
+         html: contentNode.innerHTML?.trim() ?? null,
+         pastille,
+     };
+ }
+ function buildEmptyArticle(index) {
+     return {
+         index: index,
+         eId: "",
+         tag: "article",
+         level: DivisionType["article"],
+         headings: [],
+         alineas: [],
+     };
+ }
+ function flattenTexte(texteContentRoot) {
+     const divisions = [];
+     let divisionIndex = 0;
+     const iter = (node) => {
+         if (node.nodeName === "content") {
+             return;
+         }
+         switch (node.nodeName) {
+             case "tome":
+             case "part":
+             case "book":
+             case "title":
+             case "subtitle":
+             case "chapter":
+             case "section":
+             case "subsection":
+             case "paragraph":
+             case "article":
+                 divisions.push(buildDivision(node, divisionIndex++));
+                 break;
+         }
+         if (node.nodeName === "alinea") {
+             Array.from(node.childNodes)
+                 // Find direct content children programmatically
+                 // because `:scope` selector does not work
+                 // https://github.com/jsdom/jsdom/issues/2998
+                 .filter((alineaChildNode) => alineaChildNode.nodeName === "content")
+                 .forEach((alineaContentNode) => {
+                 // Hypothesis: alineas should always be enclosed in articles
+                 let lastArticle = divisions.findLast((division) => division.tag === "article");
+                 if (!lastArticle) {
+                     lastArticle = buildEmptyArticle(divisionIndex++);
+                     divisions.push(lastArticle);
+                 }
+                 lastArticle.alineas.push(buildAlinea(alineaContentNode, node));
+             });
+         }
+         if (node.hasChildNodes()) {
+             node.childNodes.forEach((childNode) => iter(childNode));
+         }
+     };
+     iter(texteContentRoot);
+     return divisions;
+ }
+ export function transformTexte(document) {
+     const metaElement = document.querySelector("meta");
+     const preambleElement = document.querySelector("preamble");
+     const identification = metaElement?.querySelector("FRBRExpression FRBRuri")?.getAttribute("value") ?? "";
+     const identificationParts = AKN_IDENTIFICATION_STRUCTURE_REGEXP.exec(identification)?.groups;
+     const bodyElement = document.querySelector("body");
+     const sessionYears = identificationParts?.["session"]?.split("-") || null;
+     const datePresentation = metaElement?.querySelector("FRBRdate[name='#presentation']")?.getAttribute("date");
+     const dateDepot = metaElement?.querySelector("FRBRdate[name='#depot']")?.getAttribute("date");
+     const datePublicationXml = metaElement?.querySelector("FRBRdate[name='#publication-xml']")?.getAttribute("date");
+     return {
+         titre: preambleElement?.querySelector("docTitle")?.textContent || null,
+         titreCourt: metaElement?.querySelector("FRBRalias[name='intitule-court']")?.getAttribute("value") || null,
+         signetDossier: metaElement?.querySelector("FRBRalias[name='signet-dossier-legislatif-senat']")?.getAttribute("value") || null,
+         urlDossierSenat: metaElement?.querySelector("FRBRalias[name='url-senat']")?.getAttribute("value") || null,
+         urlDossierAssemblee: metaElement?.querySelector("FRBRalias[name='url-AN']")?.getAttribute("value") || null,
+         type: identificationParts?.["type"] || null,
+         session: sessionYears && sessionYears.length > 0 ? sessionYears[0] : null,
+         numero: identificationParts?.["numTexte"] ? parseInt(identificationParts["numTexte"]) : null,
+         datePresentation: datePresentation ? new Date(datePresentation) : null,
+         dateDepot: dateDepot ? new Date(dateDepot) : null,
+         datePublicationXml: datePublicationXml ? new Date(datePublicationXml) : null,
+         version: identificationParts?.["version"] ? identificationParts["version"] : null,
+         workflow: metaElement ? buildWorklow(metaElement) : [],
+         divisions: bodyElement ? flattenTexte(bodyElement) : [],
+     };
+ }
+ export function transformExposeDesMotifs(document) {
+     const sectionElements = document.querySelectorAll("section");
+     const exposeDesMotifsRegexp = new RegExp("EXPOS.{1,2}[\\n\\s]DES[\\n\\s]MOTIFS");
+     for (const sectionElement of sectionElements) {
+         const firstParagraph = sectionElement.querySelector("p:first-of-type");
+         const secondParagraph = sectionElement.querySelector("p:nth-of-type(2)");
+         if (!firstParagraph) {
+             continue;
+         }
+         const firstParagraphContent = firstParagraph.textContent;
+         const secondParagraphContent = secondParagraph?.textContent;
+         if (!firstParagraphContent || !exposeDesMotifsRegexp.test(firstParagraphContent.toUpperCase())) {
+             if (!secondParagraphContent || !exposeDesMotifsRegexp.test(secondParagraphContent.toUpperCase())) {
+                 continue;
+             }
+             else {
+                 secondParagraph.remove();
+             }
+         }
+         firstParagraph.remove();
+         return {
+             text: sectionElement.textContent?.trim() ?? null,
+             html: sectionElement.innerHTML?.trim() ?? null,
+         };
+     }
+     return null;
+ }
+ export function parseTexte(texteXml) {
+     try {
+         const { document } = new JSDOM(texteXml, {
+             contentType: "text/xml",
+         }).window;
+         return transformTexte(document);
+     }
+     catch (error) {
+         console.error(`Could not parse texte with error ${error}`);
+     }
+     return null;
+ }
+ // Prevent from memory leak
+ // https://github.com/jsdom/jsdom/issues/2583#issuecomment-559520814
+ export async function parseTexteFromFile(xmlFilePath) {
+     try {
+         const { document } = (await JSDOM.fromFile(xmlFilePath, { contentType: "text/xml" })).window;
+         return transformTexte(document);
+     }
+     catch (error) {
+         console.error(`Could not parse texte with error ${error}`);
+     }
+     return null;
+ }
+ export function parseExposeDesMotifs(exposeDesMotifsHtml) {
+     try {
+         const { document } = new JSDOM(exposeDesMotifsHtml, {
+             contentType: "text/html",
+         }).window;
+         return transformExposeDesMotifs(document);
+     }
+     catch (error) {
+         console.error(`Could not parse exposé des motifs with error ${error}`);
+     }
+     return null;
+ }
+ // Prevent from memory leak
+ // https://github.com/jsdom/jsdom/issues/2583#issuecomment-559520814
+ export async function parseExposeDesMotifsFromFile(htmlFilePath) {
+     try {
+         const { document } = (await JSDOM.fromFile(htmlFilePath, { contentType: "text/html" })).window;
+         return transformExposeDesMotifs(document);
+     }
+     catch (error) {
+         console.error(`Could not parse exposé des motifs with error ${error}`);
+     }
+     return null;
+ }
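Usage note on the new parser module above: it exposes string-based and file-based entry points, the file-based variants existing to avoid the jsdom memory leak referenced in the comments, and all of them return null on failure. A hedged sketch of calling it; the import subpath and the XML/HTML file locations are assumptions, while the field names are those produced by transformTexte:

```ts
// Import path assumed; adjust to however the package exposes lib/parsers/texte.
import { parseTexteFromFile, parseExposeDesMotifs } from "@tricoteuses/senat/lib/parsers/texte";
import fs from "fs-extra";

async function main() {
  // Placeholder path to a texte previously retrieved in AKN XML format.
  const texte = await parseTexteFromFile("../senat-data/leg/original/2024/ppl24-123/ppl24-123.xml");
  if (texte === null) {
    throw new Error("Parsing failed (the error was already logged, null was returned)");
  }
  console.log(texte.titreCourt, texte.numero, texte.divisions.length);

  // The exposé des motifs is parsed separately, from its HTML document.
  const html = await fs.readFile("../senat-data/leg/original/2024/ppl24-123/ppl24-123-expose.html", "utf8");
  texte.exposeDesMotifs = parseExposeDesMotifs(html);
}

main().catch(console.error);
```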
package/lib/scripts/convert_data.js
CHANGED
@@ -5,8 +5,9 @@ import path from "path";
  import pLimit from "p-limit";
  import * as git from "../git";
  import { datasets, EnabledDatasets, getEnabledDatasets } from "../datasets";
- import { DATA_ORIGINAL_FOLDER, DOCUMENT_METADATA_FILE, DOSLEG_DOSSIERS_FOLDER, SCRUTINS_FOLDER, RAPPORT_FOLDER, SENS_CIRCONSCRIPTIONS_FOLDER, SENS_ORGANISMES_FOLDER, SENS_SENATEURS_FOLDER, TEXTE_FOLDER, } from "../loaders";
+ import { DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER, DOCUMENT_METADATA_FILE, DOSLEG_DOSSIERS_FOLDER, SCRUTINS_FOLDER, RAPPORT_FOLDER, SENS_CIRCONSCRIPTIONS_FOLDER, SENS_ORGANISMES_FOLDER, SENS_SENATEURS_FOLDER, TEXTE_FOLDER, } from "../loaders";
  import { findAllAmendements, findAllCirconscriptions, findAllDebats, findAllDossiers, findAllScrutins, findAllOrganismes, findAllQuestions, findAllSens, findSenatRapportUrls, findSenatTexteUrls, } from "../model";
+ import { processRapport, processTexte } from "./retrieve_documents";
  import { createActesLegislatifs } from "../model/dosleg";
  import { UNDEFINED_SESSION } from "../types/sessions";
  import { getSessionFromDate, getSessionFromSignet } from "./datautil";
@@ -73,7 +74,7 @@ async function convertData() {
      }
      if (enabledDatasets & EnabledDatasets.Questions) {
          try {
-             await convertDatasetQuestions(dataDir);
+             await convertDatasetQuestions(dataDir, options);
              const questionsDir = path.join(dataDir, datasets.questions.database);
              exitCode = commitGit(questionsDir, options, exitCode);
          }
@@ -83,7 +84,7 @@ async function convertData() {
      }
      if (enabledDatasets & EnabledDatasets.Sens) {
          try {
-             await convertDatasetSens(dataDir);
+             await convertDatasetSens(dataDir, options);
              const sensDir = path.join(dataDir, datasets.sens.database);
              exitCode = commitGit(sensDir, options, exitCode);
          }
@@ -102,7 +103,9 @@ async function convertDatasetAmeli(dataDir, options) {
          console.log(`Converting database ${dataset.database} data into files…`);
      }
      const ameliReorganizedRootDir = path.join(dataDir, dataset.database);
-
+     if (!options.keepDir) {
+         ensureAndClearDir(ameliReorganizedRootDir);
+     }
      for await (const amendement of findAllAmendements(options["fromSession"])) {
          if (options["verbose"]) {
              console.log(`Converting ${amendement["numero"]} file…`);
@@ -110,11 +113,9 @@ async function convertDatasetAmeli(dataDir, options) {
          const session = String(amendement["session"]) || UNDEFINED_SESSION;
          const signetDossierLegislatif = amendement["signet_dossier_legislatif"] ||
              `${amendement["nature_texte"]}-${amendement["numero_texte"]}`.toLowerCase();
-         const ameliReorganizedDir = path.join(ameliReorganizedRootDir, String(session), signetDossierLegislatif);
-         await fs.ensureDir(ameliReorganizedDir);
          const amendementFileName = `${amendement["numero"]}.json`;
-         const filePath = path.join(
-         await fs.
+         const filePath = path.join(ameliReorganizedRootDir, String(session), signetDossierLegislatif, amendementFileName);
+         await fs.outputJSON(filePath, amendement, { spaces: 2 });
      }
  }
  async function convertDatasetDebats(dataDir, options) {
@@ -123,7 +124,9 @@ async function convertDatasetDebats(dataDir, options) {
          console.log(`Converting database ${dataset.database} data into files…`);
      }
      const debatsReorganizedRootDir = path.join(dataDir, dataset.database);
-
+     if (!options.keepDir) {
+         ensureAndClearDir(debatsReorganizedRootDir);
+     }
      for await (const debat of findAllDebats()) {
          if (options["verbose"]) {
              console.log(`Converting ${debat.id} file…`);
@@ -132,11 +135,9 @@ async function convertDatasetDebats(dataDir, options) {
          if (options["fromSession"] && session < options["fromSession"]) {
              continue;
          }
-         const debatsReorganizedDir = path.join(debatsReorganizedRootDir, String(session));
-         await fs.ensureDir(debatsReorganizedDir);
          const debatFileName = `${debat.id}.json`;
-         const filePath = path.join(
-         await fs.
+         const filePath = path.join(debatsReorganizedRootDir, String(session), debatFileName);
+         await fs.outputJSON(filePath, debat, { spaces: 2 });
      }
  }
  async function convertDatasetDosLeg(dataDir, options) {
@@ -146,8 +147,10 @@ async function convertDatasetDosLeg(dataDir, options) {
      }
      const doslegReorganizedRootDir = path.join(dataDir, dataset.database);
      const dossiersReorganizedDir = path.join(doslegReorganizedRootDir, DOSLEG_DOSSIERS_FOLDER);
-
-
+     if (!options.keepDir) {
+         ensureAndClearDir(doslegReorganizedRootDir);
+         ensureAndClearDir(dossiersReorganizedDir);
+     }
      for await (const loi of findAllDossiers()) {
          if (options["verbose"]) {
              console.log(`Converting ${loi["signet"]} file…`);
@@ -158,16 +161,14 @@ async function convertDatasetDosLeg(dataDir, options) {
              continue;
          }
          loiReorganizedDir = path.join(dossiersReorganizedDir, String(session));
-         await fs.ensureDir(loiReorganizedDir);
          // Ajout des actes législatifs au dossier
          const actesLegislatifs = createActesLegislatifs(loi);
          const loiWithActes = { ...loi, actes_legislatifs: actesLegislatifs };
          const dossierFile = `${loi["signet"]}.json`;
-
-         await fs.writeJSON(filePath, loiWithActes, { spaces: 2 });
+         await fs.outputJSON(path.join(loiReorganizedDir, dossierFile), loiWithActes, { spaces: 2 });
      }
-     await convertTexteUrls(dataDir);
-     await convertRapportUrls(dataDir);
+     await convertTexteUrls(dataDir, options);
+     await convertRapportUrls(dataDir, options);
  }
  async function convertDatasetScrutins(dataDir, options) {
      const dataset = datasets.dosleg;
@@ -175,7 +176,9 @@ async function convertDatasetScrutins(dataDir, options) {
          console.log(`Converting database scrutins (${dataset.database}) data into files…`);
      }
      const scrutinsReorganizedDir = path.join(dataDir, SCRUTINS_FOLDER);
-
+     if (!options.keepDir) {
+         ensureAndClearDir(scrutinsReorganizedDir);
+     }
      for await (const scrutin of findAllScrutins(options["fromSession"])) {
          if (options["verbose"]) {
              console.log(`Converting ${scrutin["numero"]} file…`);
@@ -183,20 +186,21 @@ async function convertDatasetScrutins(dataDir, options) {
          let scrutinReorganizedDir = path.join(scrutinsReorganizedDir, String(UNDEFINED_SESSION));
          const session = scrutin["session"] || UNDEFINED_SESSION;
          scrutinReorganizedDir = path.join(scrutinsReorganizedDir, String(session));
-         await fs.ensureDir(scrutinReorganizedDir);
          const scrutinFileName = `${scrutin["numero"]}.json`;
-         await fs.
+         await fs.outputJSON(path.join(scrutinReorganizedDir, scrutinFileName), scrutin, {
              spaces: 2,
          });
      }
  }
- async function convertDatasetQuestions(dataDir) {
+ async function convertDatasetQuestions(dataDir, options) {
      const dataset = datasets.questions;
      if (!options["silent"]) {
          console.log(`Converting database ${dataset.database} data into files…`);
      }
      const questionsReorganizedRootDir = path.join(dataDir, dataset.database);
-
+     if (!options.keepDir) {
+         ensureAndClearDir(questionsReorganizedRootDir);
+     }
      const limit = pLimit(10);
      const tasks = [];
      for await (const question of findAllQuestions()) {
@@ -205,22 +209,27 @@ async function convertDatasetQuestions(dataDir) {
              console.log(`Converting ${question["reference"]} file…`);
          }
          const legislature = question["legislature"] ? question["legislature"] : 0;
-         const questionReorganizedDir = path.join(questionsReorganizedRootDir, String(legislature));
-         await fs.ensureDir(questionReorganizedDir);
          const questionFileName = `${question["reference"]}.json`;
-         await fs.
+         await fs.outputJSON(path.join(questionsReorganizedRootDir, String(legislature), questionFileName), question, {
+             spaces: 2,
+         });
      }));
      }
      await Promise.all(tasks);
  }
- async function convertTexteUrls(dataDir) {
- const
-
-
+ async function convertTexteUrls(dataDir, options) {
+     const originalTextesDir = path.join(dataDir, TEXTE_FOLDER, DATA_ORIGINAL_FOLDER);
+     const transformedTextesDir = path.join(dataDir, TEXTE_FOLDER, DATA_TRANSFORMED_FOLDER);
+     if (!options["silent"]) {
+         console.log(`Converting database textes data into files…`);
+     }
      for await (const texte of findSenatTexteUrls()) {
+         const session = texte.session ?? UNDEFINED_SESSION;
+         if (options["fromSession"] && session < options["fromSession"]) {
+             continue;
+         }
          const texteName = path.parse(texte.url).name;
-         const texteDir = path.join(originalTextesDir, `${
-         fs.ensureDirSync(texteDir);
+         const texteDir = path.join(originalTextesDir, `${session}`, texteName);
          const metadata = {
              name: texteName,
             session: texte.session,
@@ -232,20 +241,27 @@ async function convertTexteUrls(dataDir) {
              url_html: new URL(`${texteName}.html`, SENAT_TEXTE_BASE_URL),
              url_pdf: new URL(`${texteName}.pdf`, SENAT_TEXTE_BASE_URL),
          };
-         fs.
+         fs.outputJSONSync(path.join(texteDir, DOCUMENT_METADATA_FILE), metadata, {
              spaces: 2,
          });
+         if (options.fetchDocuments) {
+             await processTexte(metadata, originalTextesDir, transformedTextesDir, options);
+         }
      }
  }
- async function convertRapportUrls(dataDir) {
- const
-
-
+ async function convertRapportUrls(dataDir, options) {
+     const originalRapportsDir = path.join(dataDir, RAPPORT_FOLDER, DATA_ORIGINAL_FOLDER);
+     if (!options["silent"]) {
+         console.log(`Converting database rapports data into files…`);
+     }
      for await (const rapport of findSenatRapportUrls()) {
+         const session = rapport.session ?? UNDEFINED_SESSION;
+         if (options["fromSession"] && session < options["fromSession"]) {
+             continue;
+         }
          const parsedRapportUrl = path.parse(rapport.url);
          const rapportName = parsedRapportUrl.name;
-         const rapportDir = path.join(
-         fs.ensureDirSync(rapportDir);
+         const rapportDir = path.join(originalRapportsDir, `${session}`, rapportName);
          const rapportHtmlUrlBase = `${rapportName}_mono.html`;
          const rapportHtmlUrl = path.format({
              dir: parsedRapportUrl.dir,
@@ -263,12 +279,15 @@ async function convertRapportUrls(dataDir) {
              url_html: new URL(rapportHtmlUrl, SENAT_RAPPORT_BASE_URL),
              url_pdf: new URL(rapportPdfUrl, SENAT_RAPPORT_BASE_URL),
          };
-         fs.
+         fs.outputJSONSync(path.join(rapportDir, DOCUMENT_METADATA_FILE), metadata, {
              spaces: 2,
          });
+         if (options.fetchDocuments) {
+             await processRapport(metadata, originalRapportsDir, options);
+         }
      }
  }
- async function convertDatasetSens(dataDir) {
+ async function convertDatasetSens(dataDir, options) {
      const dataset = datasets.sens;
      if (!options["silent"]) {
          console.log(`Converting database ${dataset.database} data into files…`);
@@ -277,16 +296,18 @@ async function convertDatasetSens(dataDir) {
      const senateursReorganizedDir = path.join(sensReorganizedRootDir, SENS_SENATEURS_FOLDER);
      const circonscriptionsReorganizedDir = path.join(sensReorganizedRootDir, SENS_CIRCONSCRIPTIONS_FOLDER);
      const organismesReorganizedDir = path.join(sensReorganizedRootDir, SENS_ORGANISMES_FOLDER);
-
-
-
-
+     if (!options.keepDir) {
+         ensureAndClearDir(sensReorganizedRootDir);
+         ensureAndClearDir(senateursReorganizedDir);
+         ensureAndClearDir(circonscriptionsReorganizedDir);
+         ensureAndClearDir(organismesReorganizedDir);
+     }
      for await (const sen of findAllSens()) {
          if (options["verbose"]) {
              console.log(`Converting ${sen["matricule"]} file…`);
          }
          const senFileName = `${sen["matricule"]}.json`;
-         fs.
+         fs.outputJSONSync(path.join(senateursReorganizedDir, senFileName), sen, {
              spaces: 2,
          });
      }
@@ -295,16 +316,18 @@ async function convertDatasetSens(dataDir) {
              console.log(`Converting ${circonscription["identifiant"]} file…`);
          }
          const circonscriptionFileName = `${circonscription["identifiant"]}.json`;
-         fs.
+         fs.outputJSONSync(path.join(circonscriptionsReorganizedDir, circonscriptionFileName), circonscription, {
+             spaces: 2,
+         });
      }
      for await (const organisme of findAllOrganismes()) {
          if (options["verbose"]) {
              console.log(`Converting ${organisme["code"]} file…`);
          }
          const organismeFileName = `${organisme["code"]}.json`;
-
-
-
+         fs.outputJSONSync(path.join(organismesReorganizedDir, organisme["type_code"], organismeFileName), organisme, {
+             spaces: 2,
+         });
      }
  }
  convertData()
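Usage note on the convert_data changes above: every converter now receives `options`, directories are only wiped when `--keepDir` is absent, and `fs.outputJSON` creates per-session folders implicitly. A hedged sketch of that write pattern in isolation; `convertItems` and its parameters are hypothetical helpers, not part of the package:

```ts
import fs from "fs-extra";
import path from "path";

// Hypothetical converter following the pattern above: clear the target directory
// unless keepDir is set, then let outputJSON create parent folders lazily.
async function convertItems<T extends { session?: number }>(
  rootDir: string,
  items: AsyncIterable<T>,
  fileNameOf: (item: T) => string,
  options: { keepDir?: boolean; verbose?: boolean },
): Promise<void> {
  if (!options.keepDir) {
    await fs.emptyDir(rootDir); // stand-in for the package's ensureAndClearDir
  }
  for await (const item of items) {
    const session = item.session ?? 0;
    const filePath = path.join(rootDir, String(session), fileNameOf(item));
    if (options.verbose) console.log(`Converting ${filePath}…`);
    await fs.outputJSON(filePath, item, { spaces: 2 }); // creates missing directories
  }
}
```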
package/lib/scripts/retrieve_documents.d.ts
CHANGED
@@ -1 +1,2 @@
- export
+ export declare function processTexte(texteMetadata: any, originalTextesDir: string, transformedTextesDir: string, options: any): Promise<void>;
+ export declare function processRapport(rapportMetadata: any, originalRapportsDir: string, options: any): Promise<void>;
package/lib/scripts/retrieve_documents.js
CHANGED
@@ -4,17 +4,12 @@ import fs from "fs-extra";
  import { DateTime } from "luxon";
  import path from "path";
  import { DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER, iterLoadSenatDossiersLegislatifsRapportUrls, iterLoadSenatDossiersLegislatifsTexteUrls, RAPPORT_FOLDER, TEXTE_FOLDER, } from "../loaders";
- import { parseExposeDesMotifs, parseTexte, parseTexteFromFile } from "../
+ import { parseExposeDesMotifs, parseTexte, parseTexteFromFile } from "../parsers/texte";
  import { getSessionsFromStart, UNDEFINED_SESSION } from "../types/sessions";
  import { commonOptions } from "./shared/cli_helpers";
  import { ensureAndClearDir, fetchWithRetry, isOptionEmptyOrHasValue } from "./shared/util";
  const optionsDefinitions = [
      ...commonOptions,
-     {
-         help: "parse and convert documents into JSON (textes only for now, requires format xml)",
-         name: "parseDocuments",
-         type: Boolean,
-     },
      {
          alias: "F",
          help: "formats of documents to retrieve (xml/html/pdf for textes, html/pdf for rapports); leave empty for all",
@@ -38,205 +33,142 @@ const options = commandLineArgs(optionsDefinitions);
  const textDecoder = new TextDecoder("utf8");
  const today = DateTime.now();
  function isDocumentRecent(documentDate, daysThreshold) {
-     if (!documentDate)
+     if (!documentDate)
          return false;
-     }
      const docDate = DateTime.fromISO(documentDate);
-
-     return false;
-     }
-     const daysDiff = today.diff(docDate, "days").days;
-     return daysDiff <= daysThreshold;
+     return docDate.isValid && today.diff(docDate, "days").days <= daysThreshold;
  }
-
-
-
-
-
- if (options
-
+ function shouldDownload(filePath, docDate, options) {
+     if (options.force)
+         return true;
+     if (!fs.existsSync(filePath))
+         return true;
+     if (options.onlyRecent !== undefined) {
+         return isDocumentRecent(docDate, options.onlyRecent);
+     }
+     return false;
+ }
+ async function downloadDocument(documentUrl, verbose) {
+     if (verbose) {
+         console.log(`Downloading document ${documentUrl}…`);
      }
-
-
-
-
-
-
- fs.ensureDirSync(texteDir);
- let exposeDesMotifsContent = null;
- if (texteMetadata.url_expose_des_motifs) {
- exposeDesMotifsContent = await downloadExposeDesMotifs(texteDir, texteMetadata.name, String(texteMetadata.url_expose_des_motifs));
- }
- if (isOptionEmptyOrHasValue(options["formats"], "xml")) {
- const textePath = path.join(texteDir, `${texteMetadata.name}.xml`);
- let texteBuffer = null;
- // Check if document should be skipped based on onlyRecent option
- const shouldSkip = !options["force"] &&
- fs.existsSync(textePath) &&
- (options["only-recent"] === undefined || !isDocumentRecent(texteMetadata.date, options["only-recent"]));
- if (shouldSkip) {
- if (!options["silent"]) {
- console.info(`Already downloaded texte ${textePath}…`);
- }
- }
- else {
- texteBuffer = await downloadDocument(texteMetadata.url_xml.toString());
- if (!texteBuffer) {
- texteUrlsNotFoundOrError.push(texteMetadata.url_xml);
- continue;
- }
- fs.writeFileSync(textePath, Buffer.from(texteBuffer));
- retrievedTextesCount++;
- }
- if (options["parseDocuments"]) {
- const parsedTexte = await parseDocument(texteMetadata.session, transformedTextesDir, textePath, texteMetadata.name, texteBuffer, exposeDesMotifsContent);
- if (!parsedTexte) {
- texteUrlsParseError.push(texteMetadata.url_xml);
- }
- }
- }
- if (isOptionEmptyOrHasValue(options["formats"], "html")) {
- const textePath = path.join(texteDir, `${texteMetadata.name}.html`);
- // Check if document should be skipped based on onlyRecent option
- const shouldSkip = !options["force"] &&
- fs.existsSync(textePath) &&
- (options["only-recent"] === undefined || !isDocumentRecent(texteMetadata.date, options["only-recent"]));
- if (shouldSkip) {
- if (!options["silent"]) {
- console.info(`Already downloaded texte ${textePath}…`);
- }
- }
- else {
- const texteBuffer = await downloadDocument(texteMetadata.url_html.toString());
- if (!texteBuffer) {
- texteUrlsNotFoundOrError.push(texteMetadata.url_html);
- continue;
- }
- fs.writeFileSync(textePath, Buffer.from(texteBuffer));
- retrievedTextesCount++;
+     try {
+         const response = await fetchWithRetry(documentUrl);
+         if (!response.ok) {
+             if (response.status === 404) {
+                 if (verbose) {
+                     console.warn(`Document ${documentUrl} not found`);
                  }
              }
-
-
-
- const shouldSkip = !options["force"] &&
- fs.existsSync(textePath) &&
- (options["only-recent"] === undefined || !isDocumentRecent(texteMetadata.date, options["only-recent"]));
- if (shouldSkip) {
- if (!options["silent"]) {
- console.info(`Already downloaded texte ${textePath}…`);
- }
- }
- else {
- const texteBuffer = await downloadDocument(texteMetadata.url_pdf.toString());
- if (!texteBuffer) {
- texteUrlsNotFoundOrError.push(texteMetadata.url_pdf);
- continue;
- }
- fs.writeFileSync(textePath, Buffer.from(texteBuffer));
- retrievedTextesCount++;
+             else {
+                 if (verbose) {
+                     console.error(`An error occurred while retrieving document ${documentUrl}: ${response.status}`);
                  }
              }
+             return null;
          }
+         return response.arrayBuffer();
      }
-
- console.
-
- if (options["parseDocuments"]) {
- console.log(`${texteUrlsParseError.length} textes failed to be parsed with URLs ${texteUrlsParseError.join(", ")}`);
- }
+     catch (error) {
+         console.error(error.message);
+         return null;
      }
  }
- async function
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- }
- fs.writeFileSync(rapportPath, Buffer.from(rapportBuffer));
- retrievedRapportsCount++;
- }
- if (isOptionEmptyOrHasValue(options["formats"], "pdf")) {
- const rapportPath = path.join(rapportDir, `${rapportMetadata.name}.pdf`);
- // Check if document should be skipped based on onlyRecent option
- const shouldSkip = !options["force"] &&
- fs.existsSync(rapportPath) &&
- (options["only-recent"] === undefined || !isDocumentRecent(rapportMetadata.date, options["only-recent"]));
- if (shouldSkip) {
- if (!options["silent"]) {
- console.info(`Already downloaded rapport ${rapportPath}…`);
- }
- continue;
- }
- const rapportBuffer = await downloadDocument(rapportMetadata.url_pdf.toString());
- if (!rapportBuffer) {
- rapportUrlsNotFoundOrError.push(rapportMetadata.url_pdf);
- continue;
- }
- fs.writeFileSync(rapportPath, Buffer.from(rapportBuffer));
- retrievedRapportsCount++;
+ async function processDocument(url, destPath, docDate, options) {
+     if (!shouldDownload(destPath, docDate, options)) {
+         if (options.verbose)
+             console.info(`Already downloaded ${destPath}…`);
+         return { success: true, skipped: true, buffer: null };
+     }
+     const arrayBuffer = await downloadDocument(url, options.verbose);
+     if (!arrayBuffer) {
+         return { success: false, skipped: false, buffer: null };
+     }
+     const buffer = Buffer.from(arrayBuffer);
+     await fs.outputFile(destPath, buffer);
+     return { success: true, skipped: false, buffer };
+ }
+ export async function processTexte(texteMetadata, originalTextesDir, transformedTextesDir, options) {
+     const texteDir = path.join(originalTextesDir, `${texteMetadata.session ?? UNDEFINED_SESSION}`, texteMetadata.name);
+     let exposeDesMotifsContent = null;
+     if (texteMetadata.url_expose_des_motifs) {
+         const exposePath = path.join(texteDir, `${texteMetadata.name}-expose.html`);
+         const res = await processDocument(texteMetadata.url_expose_des_motifs.toString(), exposePath, texteMetadata.date, options);
+         if (res.buffer) {
+             exposeDesMotifsContent = res.buffer;
+         }
+         else if (res.skipped && options.parseDocuments) {
+             if (await fs.pathExists(exposePath)) {
+                 exposeDesMotifsContent = await fs.readFile(exposePath);
             }
         }
     }
-
-
-
+     const formats = [
+         { type: "xml", url: texteMetadata.url_xml, isParseTarget: true },
+         { type: "html", url: texteMetadata.url_html, isParseTarget: false },
+         { type: "pdf", url: texteMetadata.url_pdf, isParseTarget: false },
+     ];
+     for (const format of formats) {
+         if (!isOptionEmptyOrHasValue(options.formats, format.type))
+             continue;
+         const destPath = path.join(texteDir, `${texteMetadata.name}.${format.type}`);
+         const result = await processDocument(format.url.toString(), destPath, texteMetadata.date, options);
+         // Specific logic: Parsing (Only applies to XML)
+         if (format.isParseTarget && options.parseDocuments) {
+             await parseDocument(texteMetadata.session, transformedTextesDir, destPath, texteMetadata.name, result.buffer, exposeDesMotifsContent, options);
+         }
     }
 }
- async function
- const
-
-
+ export async function processRapport(rapportMetadata, originalRapportsDir, options) {
+     const rapportDir = path.join(originalRapportsDir, `${rapportMetadata.session ?? UNDEFINED_SESSION}`, rapportMetadata.name);
+     const formats = [
+         { type: "html", url: rapportMetadata.url_html },
+         { type: "pdf", url: rapportMetadata.url_pdf },
+     ];
+     for (const format of formats) {
+         if (!isOptionEmptyOrHasValue(options["formats"], format.type))
+             continue;
+         const destPath = path.join(rapportDir, `${rapportMetadata.name}.${format.type}`);
+         await processDocument(format.url.toString(), destPath, rapportMetadata.date, options);
     }
- const exposeDesMotifsPath = path.join(texteDir, `${texteName}-expose.html`);
- fs.writeFileSync(exposeDesMotifsPath, Buffer.from(content));
- return content;
 }
- async function
-
-
+ async function retrieveTextes(dataDir, sessions) {
+     const originalTextesDir = path.join(dataDir, TEXTE_FOLDER, DATA_ORIGINAL_FOLDER);
+     const transformedTextesDir = path.join(dataDir, TEXTE_FOLDER, DATA_TRANSFORMED_FOLDER);
+     if (options["parseDocuments"]) {
+         ensureAndClearDir(transformedTextesDir);
     }
-
-
-
-
-
-
-
-
-
-
+     const dlOptions = {
+         force: options["force"],
+         silent: options["silent"],
+         verbose: options["verbose"],
+         onlyRecent: options["only-recent"],
+         formats: options["formats"],
+         parseDocuments: options["parseDocuments"],
+     };
+     for (const session of sessions) {
+         for (const { item: texteMetadata } of iterLoadSenatDossiersLegislatifsTexteUrls(dataDir, session)) {
+             await processTexte(texteMetadata, originalTextesDir, transformedTextesDir, dlOptions);
         }
- return response.arrayBuffer();
     }
-
-
-
+ }
+ async function retrieveRapports(dataDir, sessions) {
+     const originalRapportsDir = path.join(dataDir, RAPPORT_FOLDER, DATA_ORIGINAL_FOLDER);
+     const dlOptions = {
+         force: options["force"],
+         silent: options["silent"],
+         verbose: options["verbose"],
+         onlyRecent: options["only-recent"],
+         formats: options["formats"],
+     };
+     for (const session of sessions) {
+         for (const { item: rapportMetadata } of iterLoadSenatDossiersLegislatifsRapportUrls(dataDir, session)) {
+             await processRapport(rapportMetadata, originalRapportsDir, dlOptions);
+         }
     }
 }
- async function parseDocument(session, transformedTextesDir, textePath, texteName, texteBuffer, exposeDesMotifs = null) {
- if (
+ async function parseDocument(session, transformedTextesDir, textePath, texteName, texteBuffer, exposeDesMotifs = null, options = {}) {
+     if (options.verbose) {
         console.log(`Parsing texte ${textePath}…`);
     }
     let parsedTexte;
@@ -247,19 +179,17 @@ async function parseDocument(session, transformedTextesDir, textePath, texteName
     else {
         parsedTexte = await parseTexteFromFile(textePath);
     }
-     if (!parsedTexte)
+     if (!parsedTexte)
         return null;
-     }
     if (exposeDesMotifs) {
-         if (
+         if (options.verbose) {
             console.log("Parsing exposé des motifs…");
         }
         const exposeDesMotifsHtml = textDecoder.decode(exposeDesMotifs);
         parsedTexte.exposeDesMotifs = parseExposeDesMotifs(exposeDesMotifsHtml);
     }
     const transformedTexteDir = path.join(transformedTextesDir, `${session ?? UNDEFINED_SESSION}`, texteName);
-     fs.
-     fs.writeJSONSync(path.join(transformedTexteDir, `${texteName}.json`), parsedTexte, { spaces: 2 });
+     await fs.outputJSON(path.join(transformedTexteDir, `${texteName}.json`), parsedTexte, { spaces: 2 });
     return parsedTexte;
 }
 async function main() {
@@ -277,9 +207,11 @@ async function main() {
         console.timeEnd("documents processing time");
     }
 }
-
-
-
-
-
-
+ if (process.argv[1].endsWith("retrieve_documents.ts")) {
+     main()
+         .then(() => process.exit(0))
+         .catch((error) => {
+         console.log(error);
+         process.exit(1);
+     });
+ }
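Usage note on the retrieval refactor above: the download decision is centralized in `shouldDownload`/`processDocument`, and `processTexte`/`processRapport` are now exported so convert_data can call them directly. A hedged sketch of invoking the exported helper outside the CLI; the import subpath, the data-directory layout and the metadata location are assumptions for illustration, while the option fields mirror the `dlOptions` object above:

```ts
import path from "path";
import fs from "fs-extra";
// Import subpath assumed; adjust to the package's actual exports.
import { processTexte } from "@tricoteuses/senat/lib/scripts/retrieve_documents";

async function fetchOneTexte(dataDir: string): Promise<void> {
  const originalTextesDir = path.join(dataDir, "leg", "original");
  const transformedTextesDir = path.join(dataDir, "leg", "transformed");
  // metadata.json as written by convertTexteUrls; session and name below are placeholders.
  const metadata = await fs.readJSON(
    path.join(originalTextesDir, "2024", "ppl24-123", "metadata.json"),
  );
  await processTexte(metadata, originalTextesDir, transformedTextesDir, {
    force: false,
    verbose: true,
    formats: ["xml"],     // only fetch the XML rendition
    parseDocuments: true, // also write the parsed JSON under transformed/
  });
}

fetchOneTexte("../senat-data").catch(console.error);
```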
package/lib/scripts/shared/cli_helpers.d.ts
CHANGED
@@ -64,6 +64,16 @@ export declare const pullOption: {
     name: string;
     type: BooleanConstructor;
 };
+ export declare const fetchDocumentsOption: {
+     help: string;
+     name: string;
+     type: BooleanConstructor;
+ };
+ export declare const parseDocumentsOption: {
+     help: string;
+     name: string;
+     type: BooleanConstructor;
+ };
 export declare const commonOptions: ({
     defaultOption: boolean;
     help: string;
package/lib/scripts/shared/cli_helpers.js
CHANGED
@@ -64,6 +64,16 @@ export const pullOption = {
     name: "pull",
     type: Boolean,
 };
+ export const fetchDocumentsOption = {
+     help: "download documents",
+     name: "fetchDocuments",
+     type: Boolean,
+ };
+ export const parseDocumentsOption = {
+     help: "parse documents",
+     name: "parseDocuments",
+     type: Boolean,
+ };
 export const commonOptions = [
     categoriesOption,
     dataDirDefaultOption,
@@ -76,4 +86,6 @@ export const commonOptions = [
     commitOption,
     remoteOption,
     pullOption,
+     fetchDocumentsOption,
+     parseDocumentsOption,
 ];
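Usage note on the shared CLI helpers above: the two new flags are plain option definitions appended to `commonOptions`, so every script that spreads that array now accepts `--fetchDocuments` and `--parseDocuments`. A minimal sketch mirroring the package's own pattern (the log messages are illustrative):

```ts
import commandLineArgs from "command-line-args";
import { commonOptions } from "./shared/cli_helpers";

// Any script spreading commonOptions now understands the two new flags.
const options = commandLineArgs([...commonOptions]);

if (options.fetchDocuments) {
  console.log("Documents will be downloaded during conversion");
}
if (options.parseDocuments) {
  console.log("Retrieved textes will also be parsed to JSON");
}
```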
package/lib/scripts/test_iter_load.js
CHANGED
@@ -1,29 +1,18 @@
- import {
+ import { iterLoadSenatAmendements, iterLoadSenatDossiersLegislatifs } from "../loaders";
 import commandLineArgs from "command-line-args";
 import { dataDirDefaultOption } from "./shared/cli_helpers";
 const optionsDefinitions = [dataDirDefaultOption];
 const options = commandLineArgs(optionsDefinitions);
- const noValidation = false;
 const session = 2024;
- const
- for (const { item:
-
+ const sinceCommit = undefined;
+ for (const { item: amendement } of iterLoadSenatAmendements(options["dataDir"], session, {
+     log: true,
+     sinceCommit: sinceCommit,
+ })) {
+     console.log(amendement["numero"]);
 }
-
-
-
-
- session,
- { noValidation: noValidation },
- )) {
- console.log(amendement["numero"])
+ for (const { item: dossierLegislatif } of iterLoadSenatDossiersLegislatifs(options["dataDir"], session, {
+     sinceCommit: sinceCommit,
+ })) {
+     console.log(dossierLegislatif["numero"]);
 }
-
- for (const { item: dossierLegislatif } of iterLoadSenatDossiersLegislatifs(
- options["dataDir"],
- session,
- { noValidation: noValidation },
- )) {
- console.log(dossierLegislatif["numero"])
- }
- */
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
     "name": "@tricoteuses/senat",
-     "version": "2.20.21",
+     "version": "2.20.22",
     "description": "Handle French Sénat's open data",
     "keywords": [
         "France",
@@ -52,7 +52,6 @@
     "data:retrieve_open_data": "tsx src/scripts/retrieve_open_data.ts --all",
     "data:retrieve_senateurs_photos": "tsx src/scripts/retrieve_senateurs_photos.ts --fetch",
     "data:retrieve_videos": "tsx src/scripts/retrieve_videos.ts",
-     "data:parse_textes_lois": "tsx src/scripts/parse_textes.ts",
     "prepare": "npm run build",
     "prepublishOnly": "npm run build",
     "prettier": "prettier --write 'src/**/*.ts' 'tests/**/*.test.ts'",