@tricoteuses/senat 1.3.2 → 1.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/config.js +4 -6
- package/lib/databases.js +34 -75
- package/lib/datasets.js +20 -28
- package/lib/index.d.ts +5 -0
- package/lib/index.js +4 -43
- package/lib/loaders.js +56 -74
- package/lib/model/ameli.js +8 -11
- package/lib/model/dosleg.js +47 -52
- package/lib/model/index.js +4 -13
- package/lib/model/questions.js +15 -18
- package/lib/model/sens.d.ts +1 -1
- package/lib/model/sens.js +65 -71
- package/lib/model/texte.js +17 -25
- package/lib/model/util.js +13 -21
- package/lib/raw_types/ameli.js +1 -2
- package/lib/raw_types/debats.js +1 -2
- package/lib/raw_types/dosleg.js +1 -2
- package/lib/raw_types/questions.js +1 -2
- package/lib/raw_types/sens.js +1 -2
- package/lib/raw_types_schemats/ameli.js +1 -2
- package/lib/raw_types_schemats/debats.js +1 -2
- package/lib/raw_types_schemats/dosleg.js +1 -2
- package/lib/raw_types_schemats/questions.js +1 -2
- package/lib/raw_types_schemats/sens.js +1 -2
- package/lib/scripts/convert_data.js +78 -83
- package/lib/scripts/datautil.js +9 -13
- package/lib/scripts/parse_textes.js +23 -28
- package/lib/scripts/retrieve_documents.js +56 -61
- package/lib/scripts/retrieve_open_data.js +44 -49
- package/lib/scripts/retrieve_senateurs_photos.js +31 -36
- package/lib/scripts/shared/cli_helpers.js +9 -12
- package/lib/scripts/shared/util.js +7 -15
- package/lib/strings.js +4 -10
- package/lib/types/ameli.js +5 -8
- package/lib/types/debats.js +2 -5
- package/lib/types/dosleg.js +28 -31
- package/lib/types/questions.js +1 -2
- package/lib/types/sens.js +1 -2
- package/lib/types/sessions.js +2 -5
- package/lib/types/texte.js +2 -5
- package/lib/validators/config.js +4 -7
- package/package.json +4 -4
|
@@ -1,132 +1,127 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
const
|
|
13
|
-
const
|
|
14
|
-
const datautil_1 = require("./datautil");
|
|
15
|
-
const cli_helpers_1 = require("./shared/cli_helpers");
|
|
16
|
-
const util_1 = require("./shared/util");
|
|
17
|
-
const optionsDefinitions = [...cli_helpers_1.commonOptions];
|
|
18
|
-
const options = (0, command_line_args_1.default)(optionsDefinitions);
|
|
1
|
+
import assert from "assert";
|
|
2
|
+
import commandLineArgs from "command-line-args";
|
|
3
|
+
import fs from "fs-extra";
|
|
4
|
+
import path from "path";
|
|
5
|
+
import { datasets, EnabledDatasets, getEnabledDatasets } from "../datasets";
|
|
6
|
+
import { DOCUMENT_METADATA_FILE, DOSLEG_DOSSIERS_FOLDER, RAPPORT_FOLDER, SENS_CIRCONSCRIPTIONS_FOLDER, SENS_ORGANISMES_FOLDER, SENS_SENATEURS_FOLDER, TEXTE_FOLDER, TEXTE_ORIGINAL_FOLDER, } from "../loaders";
|
|
7
|
+
import { findAllAmendements, findAllCirconscriptions, findAllLois, findAllOrganismes, findAllQuestions, findAllSens, } from "../model";
|
|
8
|
+
import { findSenatRapportUrls, findSenatTexteUrls } from "../model/dosleg";
|
|
9
|
+
import { formatToFourDigitSession, SIGNET_STRUCTURE_REGEXP, UNDEFINED_SESSION, } from "./datautil";
|
|
10
|
+
import { commonOptions } from "./shared/cli_helpers";
|
|
11
|
+
import { ensureAndClearDir } from "./shared/util";
|
|
12
|
+
const optionsDefinitions = [...commonOptions];
|
|
13
|
+
const options = commandLineArgs(optionsDefinitions);
|
|
19
14
|
const SENAT_TEXTE_XML_BASE_URL = "https://www.senat.fr/akomantoso/";
|
|
20
15
|
const SENAT_TEXTE_BASE_URL = "https://www.senat.fr/leg/";
|
|
21
16
|
const SENAT_EXPOSE_DES_MOTIFS_BASE_URL = "https://www.senat.fr/leg/exposes-des-motifs/";
|
|
22
17
|
const SENAT_RAPPORT_BASE_URL = "https://www.senat.fr/rap/";
|
|
23
18
|
async function convertData() {
|
|
24
|
-
const enabledDatasets =
|
|
19
|
+
const enabledDatasets = getEnabledDatasets(options["categories"]);
|
|
25
20
|
const dataDir = options["dataDir"];
|
|
26
|
-
(
|
|
21
|
+
assert(dataDir, "Missing argument: data directory");
|
|
27
22
|
console.time("data transformation time");
|
|
28
|
-
if (enabledDatasets &
|
|
29
|
-
const dataset =
|
|
23
|
+
if (enabledDatasets & EnabledDatasets.Ameli) {
|
|
24
|
+
const dataset = datasets.ameli;
|
|
30
25
|
if (!options["silent"]) {
|
|
31
26
|
console.log(`Converting database ${dataset.database} data into files…`);
|
|
32
27
|
}
|
|
33
|
-
const ameliReorganizedRootDir =
|
|
34
|
-
|
|
35
|
-
for await (const amendement of
|
|
28
|
+
const ameliReorganizedRootDir = path.join(dataDir, dataset.database);
|
|
29
|
+
ensureAndClearDir(ameliReorganizedRootDir);
|
|
30
|
+
for await (const amendement of findAllAmendements()) {
|
|
36
31
|
if (options["verbose"]) {
|
|
37
32
|
console.log(`Converting ${amendement.numero} file…`);
|
|
38
33
|
}
|
|
39
|
-
const session = String(amendement.session) ||
|
|
34
|
+
const session = String(amendement.session) || UNDEFINED_SESSION;
|
|
40
35
|
const signetDossierLegislatif = amendement.signet_dossier_legislatif ||
|
|
41
36
|
`${amendement.nature_texte}-${amendement.numero_texte}`.toLowerCase();
|
|
42
|
-
const ameliReorganizedDir =
|
|
43
|
-
|
|
37
|
+
const ameliReorganizedDir = path.join(ameliReorganizedRootDir, session, signetDossierLegislatif);
|
|
38
|
+
fs.ensureDirSync(ameliReorganizedDir);
|
|
44
39
|
const amendementFileName = `${amendement.numero}.json`;
|
|
45
|
-
|
|
40
|
+
fs.writeJSONSync(path.join(ameliReorganizedDir, amendementFileName), amendement, { spaces: 2 });
|
|
46
41
|
}
|
|
47
42
|
}
|
|
48
|
-
if (enabledDatasets &
|
|
49
|
-
const dataset =
|
|
43
|
+
if (enabledDatasets & EnabledDatasets.DosLeg) {
|
|
44
|
+
const dataset = datasets.dosleg;
|
|
50
45
|
if (!options["silent"]) {
|
|
51
46
|
console.log(`Converting database ${dataset.database} data into files…`);
|
|
52
47
|
}
|
|
53
|
-
const doslegReorganizedRootDir =
|
|
54
|
-
const dossiersReorganizedDir =
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
for await (const loi of
|
|
48
|
+
const doslegReorganizedRootDir = path.join(dataDir, dataset.database);
|
|
49
|
+
const dossiersReorganizedDir = path.join(doslegReorganizedRootDir, DOSLEG_DOSSIERS_FOLDER);
|
|
50
|
+
ensureAndClearDir(doslegReorganizedRootDir);
|
|
51
|
+
ensureAndClearDir(dossiersReorganizedDir);
|
|
52
|
+
for await (const loi of findAllLois()) {
|
|
58
53
|
if (options["verbose"]) {
|
|
59
54
|
console.log(`Converting ${loi.signet} file…`);
|
|
60
55
|
}
|
|
61
|
-
let loiReorganizedDir =
|
|
62
|
-
const signetParts =
|
|
56
|
+
let loiReorganizedDir = path.join(dossiersReorganizedDir, UNDEFINED_SESSION);
|
|
57
|
+
const signetParts = SIGNET_STRUCTURE_REGEXP.exec(loi.signet)?.groups;
|
|
63
58
|
if (signetParts && "session" in signetParts) {
|
|
64
59
|
const { session } = signetParts;
|
|
65
|
-
const formattedSession =
|
|
66
|
-
loiReorganizedDir =
|
|
60
|
+
const formattedSession = formatToFourDigitSession(session);
|
|
61
|
+
loiReorganizedDir = path.join(dossiersReorganizedDir, formattedSession);
|
|
67
62
|
}
|
|
68
|
-
|
|
63
|
+
fs.ensureDirSync(loiReorganizedDir);
|
|
69
64
|
const loiFileName = `${loi.signet}.json`;
|
|
70
|
-
|
|
65
|
+
fs.writeJSONSync(path.join(loiReorganizedDir, loiFileName), loi, {
|
|
71
66
|
spaces: 2,
|
|
72
67
|
});
|
|
73
68
|
}
|
|
74
69
|
await convertTexteUrls(dataDir);
|
|
75
70
|
await convertRapportUrls(dataDir);
|
|
76
71
|
}
|
|
77
|
-
if (enabledDatasets &
|
|
78
|
-
const dataset =
|
|
72
|
+
if (enabledDatasets & EnabledDatasets.Questions) {
|
|
73
|
+
const dataset = datasets.questions;
|
|
79
74
|
if (!options["silent"]) {
|
|
80
75
|
console.log(`Converting database ${dataset.database} data into files…`);
|
|
81
76
|
}
|
|
82
|
-
const questionsReorganizedRootDir =
|
|
83
|
-
|
|
84
|
-
for await (const question of
|
|
77
|
+
const questionsReorganizedRootDir = path.join(dataDir, dataset.database);
|
|
78
|
+
ensureAndClearDir(questionsReorganizedRootDir);
|
|
79
|
+
for await (const question of findAllQuestions()) {
|
|
85
80
|
if (options["verbose"]) {
|
|
86
81
|
console.log(`Converting ${question.reference} file…`);
|
|
87
82
|
}
|
|
88
83
|
const legislature = question.legislature ? question.legislature : 0;
|
|
89
|
-
const questionReorganizedDir =
|
|
90
|
-
|
|
84
|
+
const questionReorganizedDir = path.join(questionsReorganizedRootDir, String(legislature));
|
|
85
|
+
fs.ensureDirSync(questionReorganizedDir);
|
|
91
86
|
const questionFileName = `${question.reference}.json`;
|
|
92
|
-
|
|
87
|
+
fs.writeJSONSync(path.join(questionReorganizedDir, questionFileName), question, { spaces: 2 });
|
|
93
88
|
}
|
|
94
89
|
}
|
|
95
|
-
if (enabledDatasets &
|
|
96
|
-
const dataset =
|
|
90
|
+
if (enabledDatasets & EnabledDatasets.Sens) {
|
|
91
|
+
const dataset = datasets.sens;
|
|
97
92
|
if (!options["silent"]) {
|
|
98
93
|
console.log(`Converting database ${dataset.database} data into files…`);
|
|
99
94
|
}
|
|
100
|
-
const sensReorganizedRootDir =
|
|
101
|
-
const senateursReorganizedDir =
|
|
102
|
-
const circonscriptionsReorganizedDir =
|
|
103
|
-
const organismesReorganizedDir =
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
for await (const sen of
|
|
95
|
+
const sensReorganizedRootDir = path.join(dataDir, dataset.database);
|
|
96
|
+
const senateursReorganizedDir = path.join(sensReorganizedRootDir, SENS_SENATEURS_FOLDER);
|
|
97
|
+
const circonscriptionsReorganizedDir = path.join(sensReorganizedRootDir, SENS_CIRCONSCRIPTIONS_FOLDER);
|
|
98
|
+
const organismesReorganizedDir = path.join(sensReorganizedRootDir, SENS_ORGANISMES_FOLDER);
|
|
99
|
+
ensureAndClearDir(sensReorganizedRootDir);
|
|
100
|
+
ensureAndClearDir(senateursReorganizedDir);
|
|
101
|
+
ensureAndClearDir(circonscriptionsReorganizedDir);
|
|
102
|
+
ensureAndClearDir(organismesReorganizedDir);
|
|
103
|
+
for await (const sen of findAllSens()) {
|
|
109
104
|
if (options["verbose"]) {
|
|
110
105
|
console.log(`Converting ${sen.matricule} file…`);
|
|
111
106
|
}
|
|
112
107
|
const senFileName = `${sen.matricule}.json`;
|
|
113
|
-
|
|
108
|
+
fs.writeJSONSync(path.join(senateursReorganizedDir, senFileName), sen, {
|
|
114
109
|
spaces: 2,
|
|
115
110
|
});
|
|
116
111
|
}
|
|
117
|
-
for await (const circonscription of
|
|
112
|
+
for await (const circonscription of findAllCirconscriptions()) {
|
|
118
113
|
if (options["verbose"]) {
|
|
119
114
|
console.log(`Converting ${circonscription.identifiant} file…`);
|
|
120
115
|
}
|
|
121
116
|
const circonscriptionFileName = `${circonscription.identifiant}.json`;
|
|
122
|
-
|
|
117
|
+
fs.writeJSONSync(path.join(circonscriptionsReorganizedDir, circonscriptionFileName), circonscription, { spaces: 2 });
|
|
123
118
|
}
|
|
124
|
-
for await (const organisme of
|
|
119
|
+
for await (const organisme of findAllOrganismes()) {
|
|
125
120
|
if (options["verbose"]) {
|
|
126
121
|
console.log(`Converting ${organisme.code} file…`);
|
|
127
122
|
}
|
|
128
123
|
const organismeFileName = `${organisme.code}.json`;
|
|
129
|
-
|
|
124
|
+
fs.writeJSONSync(path.join(organismesReorganizedDir, organismeFileName), organisme, { spaces: 2 });
|
|
130
125
|
}
|
|
131
126
|
}
|
|
132
127
|
if (!options["silent"]) {
|
|
@@ -134,13 +129,13 @@ async function convertData() {
|
|
|
134
129
|
}
|
|
135
130
|
}
|
|
136
131
|
async function convertTexteUrls(dataDir) {
|
|
137
|
-
const textesDir =
|
|
138
|
-
|
|
139
|
-
const originalTextesDir =
|
|
140
|
-
for await (const texte of
|
|
141
|
-
const texteName =
|
|
142
|
-
const texteDir =
|
|
143
|
-
|
|
132
|
+
const textesDir = path.join(dataDir, TEXTE_FOLDER);
|
|
133
|
+
fs.ensureDirSync(textesDir);
|
|
134
|
+
const originalTextesDir = path.join(textesDir, TEXTE_ORIGINAL_FOLDER);
|
|
135
|
+
for await (const texte of findSenatTexteUrls(options["sessions"])) {
|
|
136
|
+
const texteName = path.parse(texte.url).name;
|
|
137
|
+
const texteDir = path.join(originalTextesDir, `${texte.session ?? UNDEFINED_SESSION}`, texteName);
|
|
138
|
+
fs.ensureDirSync(texteDir);
|
|
144
139
|
const metadata = {
|
|
145
140
|
name: texteName,
|
|
146
141
|
session: texte.session,
|
|
@@ -151,26 +146,26 @@ async function convertTexteUrls(dataDir) {
|
|
|
151
146
|
url_html: new URL(`${texteName}.html`, SENAT_TEXTE_BASE_URL),
|
|
152
147
|
url_pdf: new URL(`${texteName}.pdf`, SENAT_TEXTE_BASE_URL),
|
|
153
148
|
};
|
|
154
|
-
|
|
149
|
+
fs.writeJSONSync(path.join(texteDir, DOCUMENT_METADATA_FILE), metadata, {
|
|
155
150
|
spaces: 2,
|
|
156
151
|
});
|
|
157
152
|
}
|
|
158
153
|
}
|
|
159
154
|
async function convertRapportUrls(dataDir) {
|
|
160
|
-
const rapportsDir =
|
|
161
|
-
|
|
162
|
-
for await (const rapport of
|
|
163
|
-
const parsedRapportUrl =
|
|
155
|
+
const rapportsDir = path.join(dataDir, RAPPORT_FOLDER);
|
|
156
|
+
fs.ensureDirSync(rapportsDir);
|
|
157
|
+
for await (const rapport of findSenatRapportUrls(options["sessions"])) {
|
|
158
|
+
const parsedRapportUrl = path.parse(rapport.url);
|
|
164
159
|
const rapportName = parsedRapportUrl.name;
|
|
165
|
-
const rapportDir =
|
|
166
|
-
|
|
160
|
+
const rapportDir = path.join(rapportsDir, `${rapport.session ?? UNDEFINED_SESSION}`, rapportName);
|
|
161
|
+
fs.ensureDirSync(rapportDir);
|
|
167
162
|
const rapportHtmlUrlBase = `${rapportName}_mono.html`;
|
|
168
|
-
const rapportHtmlUrl =
|
|
163
|
+
const rapportHtmlUrl = path.format({
|
|
169
164
|
dir: parsedRapportUrl.dir,
|
|
170
165
|
base: rapportHtmlUrlBase,
|
|
171
166
|
});
|
|
172
167
|
const rapportPdfUrlBase = `${rapportName}1.pdf`;
|
|
173
|
-
const rapportPdfUrl =
|
|
168
|
+
const rapportPdfUrl = path.format({
|
|
174
169
|
dir: parsedRapportUrl.dir,
|
|
175
170
|
base: rapportPdfUrlBase,
|
|
176
171
|
});
|
|
@@ -180,7 +175,7 @@ async function convertRapportUrls(dataDir) {
|
|
|
180
175
|
url_html: new URL(rapportHtmlUrl, SENAT_RAPPORT_BASE_URL),
|
|
181
176
|
url_pdf: new URL(rapportPdfUrl, SENAT_RAPPORT_BASE_URL),
|
|
182
177
|
};
|
|
183
|
-
|
|
178
|
+
fs.writeJSONSync(path.join(rapportDir, DOCUMENT_METADATA_FILE), metadata, {
|
|
184
179
|
spaces: 2,
|
|
185
180
|
});
|
|
186
181
|
}
|
package/lib/scripts/datautil.js
CHANGED
|
@@ -1,21 +1,17 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
const
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
exports.AKN_IDENTIFICATION_STRUCTURE_REGEXP = /^\/akn\/fr\/(?<type>[a-z]+)\/(?<session>\d{4}-\d{4})\/?(?<numTexte>\d+)\/fr@(?<version>\b(?:RECT|RECT_BIS|RECT_TER|RECT_QUATER|RECT_QUINQUIES)\b)?/;
|
|
9
|
-
exports.AKN_WORKFLOW_IDENTIFICATION_STRUCTURE_REGEXP = /^\/akn\/fr\/(?<type>[a-z]+)\/(?<session>\d{2,4})\/?(?<numTexte>[a-zA-Z0-9]+)\/fr@(?<version>\b(?:RECT|RECT_BIS|RECT_TER|RECT_QUATER|RECT_QUINQUIES)\b)?/;
|
|
10
|
-
exports.UNDEFINED_SESSION = "0";
|
|
11
|
-
function formatToFourDigitSession(session) {
|
|
1
|
+
import { DateTime, Settings } from "luxon";
|
|
2
|
+
Settings.twoDigitCutoffYear = 50;
|
|
3
|
+
export const SIGNET_STRUCTURE_REGEXP = /^(?<type>[a-z]+)(?<session>\d{2,4})-?(?<numTexte>\d+)?/;
|
|
4
|
+
export const AKN_IDENTIFICATION_STRUCTURE_REGEXP = /^\/akn\/fr\/(?<type>[a-z]+)\/(?<session>\d{4}-\d{4})\/?(?<numTexte>\d+)\/fr@(?<version>\b(?:RECT|RECT_BIS|RECT_TER|RECT_QUATER|RECT_QUINQUIES)\b)?/;
|
|
5
|
+
export const AKN_WORKFLOW_IDENTIFICATION_STRUCTURE_REGEXP = /^\/akn\/fr\/(?<type>[a-z]+)\/(?<session>\d{2,4})\/?(?<numTexte>[a-zA-Z0-9]+)\/fr@(?<version>\b(?:RECT|RECT_BIS|RECT_TER|RECT_QUATER|RECT_QUINQUIES)\b)?/;
|
|
6
|
+
export const UNDEFINED_SESSION = "0";
|
|
7
|
+
/**
 * Converts a session identifier made of digits into a four-digit year string.
 *
 * The identifier is read as either a single two-digit year ("08"), a pair of
 * consecutive two-digit years ("0809"), or a longer form whose last two digits
 * carry the year ("2008"). For a consecutive pair the first year is kept;
 * otherwise the last two digits are kept.
 *
 * @param {string} session - Session digits, e.g. the `session` group captured
 *   by SIGNET_STRUCTURE_REGEXP.
 * @returns {string} The four-digit year, or UNDEFINED_SESSION when `session`
 *   has fewer than two characters.
 */
export function formatToFourDigitSession(session) {
    if (session.length < 2) {
        return UNDEFINED_SESSION;
    }
    const leadingDigits = session.substring(0, 2);
    const trailingDigits = session.substring(session.length - 2);
    // Compare modulo 100 so that the century rollover ("9900", i.e. 99 then 00)
    // is still recognised as two consecutive years; a plain `first === last - 1`
    // check misses it and would pick the second year instead of the first.
    const spansConsecutiveYears = (Number.parseInt(leadingDigits, 10) + 1) % 100 ===
        Number.parseInt(trailingDigits, 10);
    const twoDigitSession = spansConsecutiveYears ? leadingDigits : trailingDigits;
    // Century expansion is delegated to luxon's "yy" token, which follows the
    // module-level Settings.twoDigitCutoffYear value.
    return DateTime.fromFormat(twoDigitSession, "yy").toFormat("yyyy");
}
|
|
@@ -1,44 +1,39 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
const
|
|
10
|
-
const
|
|
11
|
-
const texte_1 = require("../model/texte");
|
|
12
|
-
const cli_helpers_1 = require("./shared/cli_helpers");
|
|
13
|
-
const util_1 = require("./shared/util");
|
|
14
|
-
const optionsDefinitions = [...cli_helpers_1.commonOptions];
|
|
15
|
-
const options = (0, command_line_args_1.default)(optionsDefinitions);
|
|
1
|
+
import assert from "assert";
|
|
2
|
+
import commandLineArgs from "command-line-args";
|
|
3
|
+
import fs from "fs-extra";
|
|
4
|
+
import path from "path";
|
|
5
|
+
import { iterFilePaths, TEXTE_FOLDER, TEXTE_ORIGINAL_FOLDER, TEXTE_TRANSFORMED_FOLDER, } from "../loaders";
|
|
6
|
+
import { parseExposeDesMotifsFromFile, parseTexteFromFile, } from "../model/texte";
|
|
7
|
+
import { commonOptions } from "./shared/cli_helpers";
|
|
8
|
+
import { ensureAndClearDir } from "./shared/util";
|
|
9
|
+
const optionsDefinitions = [...commonOptions];
|
|
10
|
+
const options = commandLineArgs(optionsDefinitions);
|
|
16
11
|
/**
 * Parses every original texte XML file found under the data directory and
 * writes the parsed result as JSON into the transformed-textes tree, keeping
 * the same sub-directory layout as the original tree.
 *
 * When an "<name>-expose.html" file sits next to the XML file and the texte
 * was parsed successfully, the parsed exposé des motifs is attached to the
 * texte before it is written.
 *
 * Reads the "dataDir" and "silent" command-line options; fails an assertion
 * when "dataDir" is missing.
 */
async function main() {
    const dataDir = options["dataDir"];
    assert(dataDir, "Missing argument: data directory");
    const isSilent = options["silent"];
    const textesRootDir = path.join(dataDir, TEXTE_FOLDER);
    const transformedTextesDir = path.join(textesRootDir, TEXTE_TRANSFORMED_FOLDER);
    // The output tree is rebuilt from scratch on every run.
    ensureAndClearDir(transformedTextesDir);
    const originalTextesDir = path.join(textesRootDir, TEXTE_ORIGINAL_FOLDER);
    for (const filePath of iterFilePaths(originalTextesDir)) {
        const { dir: sourceDir, name: baseName, ext: extension } = path.parse(filePath);
        // Only XML files are textes; everything else in the tree is ignored here.
        if (extension !== ".xml") {
            continue;
        }
        // Keep whatever follows the original-folder segment so the output mirrors the input layout.
        const offsetAfterOriginalFolder = filePath.indexOf(TEXTE_ORIGINAL_FOLDER) + TEXTE_ORIGINAL_FOLDER.length;
        const relativeTexteDir = sourceDir.substring(offsetAfterOriginalFolder);
        const outputDir = path.join(transformedTextesDir, relativeTexteDir);
        fs.ensureDirSync(outputDir);
        if (!isSilent) {
            console.log(`Parsing texte ${baseName}.xml…`);
        }
        const texte = await parseTexteFromFile(filePath);
        const exposeBaseName = `${baseName}-expose`;
        const exposePath = path.join(sourceDir, `${exposeBaseName}.html`);
        const hasExpose = Boolean(texte) && fs.existsSync(exposePath);
        if (hasExpose) {
            if (!isSilent) {
                console.log(`Parsing exposé des motifs ${exposeBaseName}.html…`);
            }
            texte.exposeDesMotifs = await parseExposeDesMotifsFromFile(exposePath);
        }
        const outputPath = path.join(outputDir, `${baseName}.json`);
        fs.writeJSONSync(outputPath, texte, { spaces: 2 });
    }
}
|
|
44
39
|
main()
|
|
@@ -1,19 +1,14 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
const loaders_1 = require("../loaders");
|
|
11
|
-
const texte_1 = require("../model/texte");
|
|
12
|
-
const datautil_1 = require("./datautil");
|
|
13
|
-
const cli_helpers_1 = require("./shared/cli_helpers");
|
|
14
|
-
const util_1 = require("./shared/util");
|
|
1
|
+
import assert from "assert";
|
|
2
|
+
import commandLineArgs from "command-line-args";
|
|
3
|
+
import fs from "fs-extra";
|
|
4
|
+
import path from "path";
|
|
5
|
+
import { iterLoadSenatDossiersLegislatifsRapportUrls, iterLoadSenatDossiersLegislatifsTexteUrls, RAPPORT_FOLDER, TEXTE_FOLDER, TEXTE_ORIGINAL_FOLDER, TEXTE_TRANSFORMED_FOLDER, } from "../loaders";
|
|
6
|
+
import { parseExposeDesMotifs, parseTexte, parseTexteFromFile, } from "../model/texte";
|
|
7
|
+
import { UNDEFINED_SESSION } from "./datautil";
|
|
8
|
+
import { commonOptions } from "./shared/cli_helpers";
|
|
9
|
+
import { ensureAndClearDir, fetchWithRetry, isOptionEmptyOrHasValue, } from "./shared/util";
|
|
15
10
|
const optionsDefinitions = [
|
|
16
|
-
...
|
|
11
|
+
...commonOptions,
|
|
17
12
|
{
|
|
18
13
|
help: "sessions of textes to retrieve; leave empty for all",
|
|
19
14
|
multiple: true,
|
|
@@ -44,14 +39,14 @@ const optionsDefinitions = [
|
|
|
44
39
|
type: Boolean,
|
|
45
40
|
},
|
|
46
41
|
];
|
|
47
|
-
const options = (
|
|
42
|
+
const options = commandLineArgs(optionsDefinitions);
|
|
48
43
|
const textDecoder = new TextDecoder("utf8");
|
|
49
44
|
async function retrieveDocument(documentUrl) {
|
|
50
45
|
if (!options["silent"]) {
|
|
51
46
|
console.log(`Retrieving document ${documentUrl}…`);
|
|
52
47
|
}
|
|
53
48
|
try {
|
|
54
|
-
const response = await
|
|
49
|
+
const response = await fetchWithRetry(documentUrl);
|
|
55
50
|
if (!response.ok) {
|
|
56
51
|
if (response.status === 404) {
|
|
57
52
|
console.warn(`Texte ${documentUrl} not found`);
|
|
@@ -69,36 +64,36 @@ async function retrieveDocument(documentUrl) {
|
|
|
69
64
|
}
|
|
70
65
|
}
|
|
71
66
|
async function retrieveTextes(dataDir) {
|
|
72
|
-
const textesDir =
|
|
73
|
-
|
|
74
|
-
const originalTextesDir =
|
|
75
|
-
const transformedTextesDir =
|
|
67
|
+
const textesDir = path.join(dataDir, TEXTE_FOLDER);
|
|
68
|
+
fs.ensureDirSync(textesDir);
|
|
69
|
+
const originalTextesDir = path.join(textesDir, TEXTE_ORIGINAL_FOLDER);
|
|
70
|
+
const transformedTextesDir = path.join(textesDir, TEXTE_TRANSFORMED_FOLDER);
|
|
76
71
|
if (options["parseDocuments"]) {
|
|
77
|
-
|
|
72
|
+
ensureAndClearDir(transformedTextesDir);
|
|
78
73
|
}
|
|
79
74
|
let retrievedTextesCount = 0;
|
|
80
75
|
const texteUrlsNotFoundOrError = [];
|
|
81
76
|
const texteUrlsParseError = [];
|
|
82
77
|
for (const session of options["sessions"]) {
|
|
83
|
-
for (const { item: texteMetadata, } of
|
|
84
|
-
const texteDir =
|
|
85
|
-
|
|
78
|
+
for (const { item: texteMetadata, } of iterLoadSenatDossiersLegislatifsTexteUrls(dataDir, session)) {
|
|
79
|
+
const texteDir = path.join(originalTextesDir, `${texteMetadata.session ?? UNDEFINED_SESSION}`, texteMetadata.name);
|
|
80
|
+
fs.ensureDirSync(texteDir);
|
|
86
81
|
let exposeDesMotifsContent = null;
|
|
87
82
|
if (texteMetadata.url_expose_des_motifs) {
|
|
88
83
|
if (!options["silent"]) {
|
|
89
84
|
console.log("Retrieving exposé des motifs…");
|
|
90
85
|
}
|
|
91
|
-
const exposeDesMotifsPath =
|
|
86
|
+
const exposeDesMotifsPath = path.join(texteDir, `${texteMetadata.name}-expose.html`);
|
|
92
87
|
exposeDesMotifsContent = await retrieveDocument(texteMetadata.url_expose_des_motifs.toString());
|
|
93
88
|
if (!exposeDesMotifsContent) {
|
|
94
89
|
continue;
|
|
95
90
|
}
|
|
96
|
-
|
|
91
|
+
fs.writeFileSync(exposeDesMotifsPath, Buffer.from(exposeDesMotifsContent));
|
|
97
92
|
}
|
|
98
|
-
if (
|
|
99
|
-
const textePath =
|
|
93
|
+
if (isOptionEmptyOrHasValue(options["formats"], "xml")) {
|
|
94
|
+
const textePath = path.join(texteDir, `${texteMetadata.name}.xml`);
|
|
100
95
|
let texteBuffer = null;
|
|
101
|
-
if (!options["force"] &&
|
|
96
|
+
if (!options["force"] && fs.existsSync(textePath)) {
|
|
102
97
|
if (!options["silent"]) {
|
|
103
98
|
console.info(`Already retrieved texte ${textePath}…`);
|
|
104
99
|
}
|
|
@@ -109,7 +104,7 @@ async function retrieveTextes(dataDir) {
|
|
|
109
104
|
texteUrlsNotFoundOrError.push(texteMetadata.url_xml);
|
|
110
105
|
continue;
|
|
111
106
|
}
|
|
112
|
-
|
|
107
|
+
fs.writeFileSync(textePath, Buffer.from(texteBuffer));
|
|
113
108
|
retrievedTextesCount++;
|
|
114
109
|
}
|
|
115
110
|
if (options["parseDocuments"]) {
|
|
@@ -119,10 +114,10 @@ async function retrieveTextes(dataDir) {
|
|
|
119
114
|
let parsedTexte = null;
|
|
120
115
|
if (texteBuffer) {
|
|
121
116
|
const texteXml = textDecoder.decode(texteBuffer);
|
|
122
|
-
parsedTexte =
|
|
117
|
+
parsedTexte = parseTexte(texteXml);
|
|
123
118
|
}
|
|
124
119
|
else {
|
|
125
|
-
parsedTexte = await
|
|
120
|
+
parsedTexte = await parseTexteFromFile(textePath);
|
|
126
121
|
}
|
|
127
122
|
if (!parsedTexte) {
|
|
128
123
|
texteUrlsParseError.push(texteMetadata.url_xml);
|
|
@@ -134,16 +129,16 @@ async function retrieveTextes(dataDir) {
|
|
|
134
129
|
}
|
|
135
130
|
const exposeDesMotifsHtml = textDecoder.decode(exposeDesMotifsContent);
|
|
136
131
|
parsedTexte.exposeDesMotifs =
|
|
137
|
-
|
|
132
|
+
parseExposeDesMotifs(exposeDesMotifsHtml);
|
|
138
133
|
}
|
|
139
|
-
const transformedTexteDir =
|
|
140
|
-
|
|
141
|
-
|
|
134
|
+
const transformedTexteDir = path.join(transformedTextesDir, `${texteMetadata.session ?? UNDEFINED_SESSION}`, texteMetadata.name);
|
|
135
|
+
fs.ensureDirSync(transformedTexteDir);
|
|
136
|
+
fs.writeJSONSync(path.join(transformedTexteDir, `${texteMetadata.name}.json`), parsedTexte, { spaces: 2 });
|
|
142
137
|
}
|
|
143
138
|
}
|
|
144
|
-
if (
|
|
145
|
-
const textePath =
|
|
146
|
-
if (!options["force"] &&
|
|
139
|
+
if (isOptionEmptyOrHasValue(options["formats"], "html")) {
|
|
140
|
+
const textePath = path.join(texteDir, `${texteMetadata.name}.html`);
|
|
141
|
+
if (!options["force"] && fs.existsSync(textePath)) {
|
|
147
142
|
if (!options["silent"]) {
|
|
148
143
|
console.info(`Already retrieved texte ${textePath}…`);
|
|
149
144
|
}
|
|
@@ -154,13 +149,13 @@ async function retrieveTextes(dataDir) {
|
|
|
154
149
|
texteUrlsNotFoundOrError.push(texteMetadata.url_html);
|
|
155
150
|
continue;
|
|
156
151
|
}
|
|
157
|
-
|
|
152
|
+
fs.writeFileSync(textePath, Buffer.from(texteBuffer));
|
|
158
153
|
retrievedTextesCount++;
|
|
159
154
|
}
|
|
160
155
|
}
|
|
161
|
-
if (
|
|
162
|
-
const textePath =
|
|
163
|
-
if (!options["force"] &&
|
|
156
|
+
if (isOptionEmptyOrHasValue(options["formats"], "pdf")) {
|
|
157
|
+
const textePath = path.join(texteDir, `${texteMetadata.name}.pdf`);
|
|
158
|
+
if (!options["force"] && fs.existsSync(textePath)) {
|
|
164
159
|
if (!options["silent"]) {
|
|
165
160
|
console.info(`Already retrieved texte ${textePath}…`);
|
|
166
161
|
}
|
|
@@ -171,7 +166,7 @@ async function retrieveTextes(dataDir) {
|
|
|
171
166
|
texteUrlsNotFoundOrError.push(texteMetadata.url_pdf);
|
|
172
167
|
continue;
|
|
173
168
|
}
|
|
174
|
-
|
|
169
|
+
fs.writeFileSync(textePath, Buffer.from(texteBuffer));
|
|
175
170
|
retrievedTextesCount++;
|
|
176
171
|
}
|
|
177
172
|
}
|
|
@@ -186,17 +181,17 @@ async function retrieveTextes(dataDir) {
|
|
|
186
181
|
}
|
|
187
182
|
}
|
|
188
183
|
async function retrieveRapports(dataDir) {
|
|
189
|
-
const rapportsDir =
|
|
190
|
-
|
|
184
|
+
const rapportsDir = path.join(dataDir, RAPPORT_FOLDER);
|
|
185
|
+
fs.ensureDirSync(rapportsDir);
|
|
191
186
|
let retrievedRapportsCount = 0;
|
|
192
187
|
const rapportUrlsNotFoundOrError = [];
|
|
193
188
|
for (const session of options["sessions"]) {
|
|
194
|
-
for (const { item: rapportMetadata, } of
|
|
195
|
-
const rapportDir =
|
|
196
|
-
|
|
197
|
-
if (
|
|
198
|
-
const rapportPath =
|
|
199
|
-
if (!options["force"] &&
|
|
189
|
+
for (const { item: rapportMetadata, } of iterLoadSenatDossiersLegislatifsRapportUrls(dataDir, session)) {
|
|
190
|
+
const rapportDir = path.join(rapportsDir, `${rapportMetadata.session ?? UNDEFINED_SESSION}`, rapportMetadata.name);
|
|
191
|
+
fs.ensureDirSync(rapportDir);
|
|
192
|
+
if (isOptionEmptyOrHasValue(options["formats"], "html")) {
|
|
193
|
+
const rapportPath = path.join(rapportDir, `${rapportMetadata.name}.html`);
|
|
194
|
+
if (!options["force"] && fs.existsSync(rapportPath)) {
|
|
200
195
|
if (!options["silent"]) {
|
|
201
196
|
console.info(`Already retrieved rapport ${rapportPath}…`);
|
|
202
197
|
}
|
|
@@ -207,12 +202,12 @@ async function retrieveRapports(dataDir) {
|
|
|
207
202
|
rapportUrlsNotFoundOrError.push(rapportMetadata.url_html);
|
|
208
203
|
continue;
|
|
209
204
|
}
|
|
210
|
-
|
|
205
|
+
fs.writeFileSync(rapportPath, Buffer.from(rapportBuffer));
|
|
211
206
|
retrievedRapportsCount++;
|
|
212
207
|
}
|
|
213
|
-
if (
|
|
214
|
-
const rapportPath =
|
|
215
|
-
if (!options["force"] &&
|
|
208
|
+
if (isOptionEmptyOrHasValue(options["formats"], "pdf")) {
|
|
209
|
+
const rapportPath = path.join(rapportDir, `${rapportMetadata.name}.pdf`);
|
|
210
|
+
if (!options["force"] && fs.existsSync(rapportPath)) {
|
|
216
211
|
if (!options["silent"]) {
|
|
217
212
|
console.info(`Already retrieved rapport ${rapportPath}…`);
|
|
218
213
|
}
|
|
@@ -223,7 +218,7 @@ async function retrieveRapports(dataDir) {
|
|
|
223
218
|
rapportUrlsNotFoundOrError.push(rapportMetadata.url_pdf);
|
|
224
219
|
continue;
|
|
225
220
|
}
|
|
226
|
-
|
|
221
|
+
fs.writeFileSync(rapportPath, Buffer.from(rapportBuffer));
|
|
227
222
|
retrievedRapportsCount++;
|
|
228
223
|
}
|
|
229
224
|
}
|
|
@@ -235,12 +230,12 @@ async function retrieveRapports(dataDir) {
|
|
|
235
230
|
}
|
|
236
231
|
async function main() {
|
|
237
232
|
const dataDir = options["dataDir"];
|
|
238
|
-
(
|
|
233
|
+
assert(dataDir, "Missing argument: data directory");
|
|
239
234
|
console.time("documents processing time");
|
|
240
|
-
if (
|
|
235
|
+
if (isOptionEmptyOrHasValue(options["types"], "textes")) {
|
|
241
236
|
await retrieveTextes(dataDir);
|
|
242
237
|
}
|
|
243
|
-
if (
|
|
238
|
+
if (isOptionEmptyOrHasValue(options["types"], "rapports")) {
|
|
244
239
|
await retrieveRapports(dataDir);
|
|
245
240
|
}
|
|
246
241
|
if (!options["silent"]) {
|