@tricoteuses/senat 1.3.1 → 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/config.d.ts +1 -0
- package/lib/config.js +14 -45
- package/lib/databases.js +86 -143
- package/lib/datasets.js +78 -83
- package/lib/index.d.ts +7 -4
- package/lib/index.js +42 -419
- package/lib/loaders.js +149 -654
- package/lib/model/ameli.js +83 -21
- package/lib/model/debats.js +0 -1
- package/lib/model/dosleg.d.ts +1 -1
- package/lib/model/dosleg.js +179 -73
- package/lib/model/index.d.ts +3 -3
- package/lib/model/index.js +12 -46
- package/lib/model/questions.js +68 -39
- package/lib/model/sens.js +383 -113
- package/lib/model/texte.js +220 -290
- package/lib/model/util.js +9 -26
- package/lib/raw_types/ameli.js +5 -6
- package/lib/raw_types/debats.js +5 -6
- package/lib/raw_types/dosleg.js +5 -6
- package/lib/raw_types/questions.js +5 -6
- package/lib/raw_types/sens.js +5 -6
- package/lib/raw_types_schemats/ameli.js +1 -43
- package/lib/raw_types_schemats/debats.js +1 -22
- package/lib/raw_types_schemats/dosleg.js +1 -96
- package/lib/raw_types_schemats/questions.js +1 -22
- package/lib/raw_types_schemats/sens.js +1 -112
- package/lib/scripts/convert_data.js +181 -631
- package/lib/scripts/datautil.js +17 -60
- package/lib/scripts/parse_textes.js +46 -129
- package/lib/scripts/retrieve_documents.js +247 -513
- package/lib/scripts/retrieve_open_data.js +211 -368
- package/lib/scripts/retrieve_senateurs_photos.js +144 -239
- package/lib/scripts/shared/cli_helpers.js +30 -30
- package/lib/scripts/shared/util.js +28 -94
- package/lib/strings.js +20 -45
- package/lib/types/ameli.d.ts +1 -1
- package/lib/types/ameli.js +14 -25
- package/lib/types/debats.d.ts +1 -1
- package/lib/types/debats.js +3 -21
- package/lib/types/dosleg.d.ts +1 -1
- package/lib/types/dosleg.js +152 -119
- package/lib/types/questions.d.ts +1 -1
- package/lib/types/questions.js +1 -13
- package/lib/types/sens.d.ts +1 -1
- package/lib/types/sens.js +1 -13
- package/lib/types/sessions.js +44 -49
- package/lib/types/texte.js +17 -22
- package/lib/validators/config.js +47 -111
- package/lib/validators/senat.js +1 -5
- package/package.json +16 -38
- package/lib/aggregates.d.ts +0 -52
- package/lib/aggregates.mjs +0 -930
- package/lib/aggregates.ts +0 -833
- package/lib/config.mjs +0 -16
- package/lib/config.ts +0 -26
- package/lib/data/legislatures.json +0 -38
- package/lib/databases.mjs +0 -57
- package/lib/databases.ts +0 -71
- package/lib/datasets.mjs +0 -78
- package/lib/datasets.ts +0 -118
- package/lib/fields.d.ts +0 -10
- package/lib/fields.mjs +0 -68
- package/lib/fields.ts +0 -29
- package/lib/index.mjs +0 -4
- package/lib/index.ts +0 -42
- package/lib/inserters.d.ts +0 -98
- package/lib/inserters.mjs +0 -500
- package/lib/inserters.ts +0 -521
- package/lib/loaders.mjs +0 -158
- package/lib/loaders.ts +0 -271
- package/lib/model/ameli.mjs +0 -84
- package/lib/model/ameli.ts +0 -100
- package/lib/model/debats.mjs +0 -1
- package/lib/model/debats.ts +0 -0
- package/lib/model/dosleg.mjs +0 -196
- package/lib/model/dosleg.ts +0 -240
- package/lib/model/index.mjs +0 -4
- package/lib/model/index.ts +0 -14
- package/lib/model/questions.mjs +0 -71
- package/lib/model/questions.ts +0 -93
- package/lib/model/sens.mjs +0 -415
- package/lib/model/sens.ts +0 -516
- package/lib/model/texte.mjs +0 -208
- package/lib/model/texte.ts +0 -229
- package/lib/model/util.mjs +0 -19
- package/lib/model/util.ts +0 -32
- package/lib/raw_types/ameli.mjs +0 -5
- package/lib/raw_types/ameli.ts +0 -951
- package/lib/raw_types/debats.mjs +0 -5
- package/lib/raw_types/debats.ts +0 -222
- package/lib/raw_types/dosleg.mjs +0 -5
- package/lib/raw_types/dosleg.ts +0 -3625
- package/lib/raw_types/questions.mjs +0 -5
- package/lib/raw_types/questions.ts +0 -427
- package/lib/raw_types/sens.mjs +0 -5
- package/lib/raw_types/sens.ts +0 -4499
- package/lib/raw_types_kysely/ameli.d.ts +0 -6
- package/lib/raw_types_kysely/ameli.mjs +0 -7
- package/lib/raw_types_kysely/ameli.ts +0 -6
- package/lib/raw_types_kysely/debats.d.ts +0 -6
- package/lib/raw_types_kysely/debats.mjs +0 -7
- package/lib/raw_types_kysely/debats.ts +0 -6
- package/lib/raw_types_kysely/dosleg.d.ts +0 -6
- package/lib/raw_types_kysely/dosleg.mjs +0 -7
- package/lib/raw_types_kysely/dosleg.ts +0 -6
- package/lib/raw_types_kysely/questions.d.ts +0 -6
- package/lib/raw_types_kysely/questions.mjs +0 -7
- package/lib/raw_types_kysely/questions.ts +0 -6
- package/lib/raw_types_kysely/sens.d.ts +0 -6
- package/lib/raw_types_kysely/sens.mjs +0 -7
- package/lib/raw_types_kysely/sens.ts +0 -6
- package/lib/raw_types_kysely/texte.d.ts +0 -45
- package/lib/raw_types_kysely/texte.mjs +0 -7
- package/lib/raw_types_kysely/texte.ts +0 -53
- package/lib/raw_types_schemats/ameli.mjs +0 -2
- package/lib/raw_types_schemats/ameli.ts +0 -601
- package/lib/raw_types_schemats/debats.mjs +0 -2
- package/lib/raw_types_schemats/debats.ts +0 -145
- package/lib/raw_types_schemats/dosleg.mjs +0 -2
- package/lib/raw_types_schemats/dosleg.ts +0 -2195
- package/lib/raw_types_schemats/questions.mjs +0 -2
- package/lib/raw_types_schemats/questions.ts +0 -251
- package/lib/raw_types_schemats/sens.mjs +0 -2
- package/lib/raw_types_schemats/sens.ts +0 -2907
- package/lib/scripts/convert_data.mjs +0 -181
- package/lib/scripts/convert_data.ts +0 -243
- package/lib/scripts/datautil.mjs +0 -16
- package/lib/scripts/datautil.ts +0 -19
- package/lib/scripts/images/transparent_150x192.jpg +0 -0
- package/lib/scripts/images/transparent_155x225.jpg +0 -0
- package/lib/scripts/parse_textes.mjs +0 -46
- package/lib/scripts/parse_textes.ts +0 -65
- package/lib/scripts/retrieve_documents.mjs +0 -249
- package/lib/scripts/retrieve_documents.ts +0 -298
- package/lib/scripts/retrieve_open_data.mjs +0 -217
- package/lib/scripts/retrieve_open_data.ts +0 -274
- package/lib/scripts/retrieve_senateurs_photos.mjs +0 -147
- package/lib/scripts/retrieve_senateurs_photos.ts +0 -177
- package/lib/scripts/retrieve_textes.d.ts +0 -1
- package/lib/scripts/retrieve_textes.mjs +0 -328
- package/lib/scripts/retrieve_textes.ts +0 -143
- package/lib/scripts/shared/cli_helpers.ts +0 -36
- package/lib/scripts/shared/util.ts +0 -33
- package/lib/src/aggregates.d.ts +0 -52
- package/lib/src/aggregates.mjs +0 -726
- package/lib/src/config.d.ts +0 -2
- package/lib/src/config.mjs +0 -16
- package/lib/src/databases.d.ts +0 -18
- package/lib/src/databases.mjs +0 -55
- package/lib/src/datasets.d.ts +0 -28
- package/lib/src/datasets.mjs +0 -78
- package/lib/src/fields.d.ts +0 -10
- package/lib/src/fields.mjs +0 -22
- package/lib/src/index.d.ts +0 -8
- package/lib/src/index.mjs +0 -7
- package/lib/src/inserters.d.ts +0 -98
- package/lib/src/inserters.mjs +0 -360
- package/lib/src/loaders.d.ts +0 -36
- package/lib/src/loaders.mjs +0 -107
- package/lib/src/model/ameli.d.ts +0 -4
- package/lib/src/model/ameli.js +0 -57
- package/lib/src/model/debats.d.ts +0 -4
- package/lib/src/model/debats.js +0 -43
- package/lib/src/model/dosleg.d.ts +0 -197
- package/lib/src/model/dosleg.js +0 -169
- package/lib/src/model/index.d.ts +0 -4
- package/lib/src/model/index.js +0 -4
- package/lib/src/model/questions.d.ts +0 -89
- package/lib/src/model/questions.js +0 -76
- package/lib/src/model/sens.d.ts +0 -390
- package/lib/src/model/sens.js +0 -339
- package/lib/src/model/texte.d.ts +0 -7
- package/lib/src/model/texte.js +0 -183
- package/lib/src/raw_types_kysely/ameli.d.ts +0 -915
- package/lib/src/raw_types_kysely/ameli.js +0 -5
- package/lib/src/raw_types_kysely/debats.d.ts +0 -207
- package/lib/src/raw_types_kysely/debats.js +0 -5
- package/lib/src/raw_types_kysely/dosleg.d.ts +0 -3532
- package/lib/src/raw_types_kysely/dosleg.js +0 -5
- package/lib/src/raw_types_kysely/questions.d.ts +0 -414
- package/lib/src/raw_types_kysely/questions.js +0 -5
- package/lib/src/raw_types_kysely/sens.d.ts +0 -4394
- package/lib/src/raw_types_kysely/sens.js +0 -5
- package/lib/src/raw_types_schemats/ameli.d.ts +0 -541
- package/lib/src/raw_types_schemats/ameli.js +0 -2
- package/lib/src/raw_types_schemats/debats.d.ts +0 -127
- package/lib/src/raw_types_schemats/debats.js +0 -2
- package/lib/src/raw_types_schemats/dosleg.d.ts +0 -2027
- package/lib/src/raw_types_schemats/dosleg.js +0 -2
- package/lib/src/raw_types_schemats/questions.d.ts +0 -231
- package/lib/src/raw_types_schemats/questions.js +0 -2
- package/lib/src/raw_types_schemats/sens.d.ts +0 -2709
- package/lib/src/raw_types_schemats/sens.js +0 -2
- package/lib/src/scripts/convert_data.d.ts +0 -1
- package/lib/src/scripts/convert_data.js +0 -95
- package/lib/src/scripts/datautil.d.ts +0 -5
- package/lib/src/scripts/datautil.js +0 -16
- package/lib/src/scripts/parse_textes.d.ts +0 -1
- package/lib/src/scripts/parse_textes.js +0 -47
- package/lib/src/scripts/retrieve_documents.d.ts +0 -1
- package/lib/src/scripts/retrieve_documents.js +0 -258
- package/lib/src/scripts/retrieve_open_data.d.ts +0 -1
- package/lib/src/scripts/retrieve_open_data.js +0 -214
- package/lib/src/scripts/retrieve_senateurs_photos.d.ts +0 -1
- package/lib/src/scripts/retrieve_senateurs_photos.js +0 -147
- package/lib/src/scripts/shared/cli_helpers.d.ts +0 -44
- package/lib/src/scripts/shared/cli_helpers.js +0 -32
- package/lib/src/scripts/shared/util.d.ts +0 -3
- package/lib/src/scripts/shared/util.js +0 -28
- package/lib/src/strings.d.ts +0 -1
- package/lib/src/strings.mjs +0 -18
- package/lib/src/types/ameli.d.ts +0 -10
- package/lib/src/types/ameli.js +0 -13
- package/lib/src/types/debats.d.ts +0 -4
- package/lib/src/types/debats.js +0 -2
- package/lib/src/types/dosleg.d.ts +0 -98
- package/lib/src/types/dosleg.js +0 -151
- package/lib/src/types/questions.d.ts +0 -2
- package/lib/src/types/questions.js +0 -1
- package/lib/src/types/sens.d.ts +0 -10
- package/lib/src/types/sens.js +0 -1
- package/lib/src/types/sessions.d.ts +0 -42
- package/lib/src/types/sessions.js +0 -43
- package/lib/src/types/texte.d.ts +0 -61
- package/lib/src/types/texte.js +0 -16
- package/lib/src/validators/config.d.ts +0 -1
- package/lib/src/validators/config.js +0 -54
- package/lib/src/validators/senat.d.ts +0 -0
- package/lib/src/validators/senat.js +0 -24
- package/lib/strings.mjs +0 -18
- package/lib/strings.ts +0 -26
- package/lib/types/ameli.mjs +0 -13
- package/lib/types/ameli.ts +0 -21
- package/lib/types/debats.mjs +0 -2
- package/lib/types/debats.ts +0 -6
- package/lib/types/dosleg.mjs +0 -151
- package/lib/types/dosleg.ts +0 -284
- package/lib/types/questions.mjs +0 -1
- package/lib/types/questions.ts +0 -3
- package/lib/types/sens.mjs +0 -1
- package/lib/types/sens.ts +0 -12
- package/lib/types/sessions.mjs +0 -43
- package/lib/types/sessions.ts +0 -42
- package/lib/types/texte.mjs +0 -16
- package/lib/types/texte.ts +0 -76
- package/lib/typings/windows-1252.d.js +0 -2
- package/lib/typings/windows-1252.d.mjs +0 -2
- package/lib/typings/windows-1252.d.ts +0 -11
- package/lib/validators/config.mjs +0 -54
- package/lib/validators/config.ts +0 -79
- package/lib/validators/senat.mjs +0 -24
- package/lib/validators/senat.ts +0 -26
|
@@ -1,249 +0,0 @@
|
|
|
1
|
-
import assert from "assert";
|
|
2
|
-
import commandLineArgs from "command-line-args";
|
|
3
|
-
import fs from "fs-extra";
|
|
4
|
-
import path from "path";
|
|
5
|
-
import { iterLoadSenatDossiersLegislatifsRapportUrls, iterLoadSenatDossiersLegislatifsTexteUrls, RAPPORT_FOLDER, TEXTE_FOLDER, TEXTE_ORIGINAL_FOLDER, TEXTE_TRANSFORMED_FOLDER, } from "../loaders";
|
|
6
|
-
import { parseExposeDesMotifs, parseTexte, parseTexteFromFile } from "../model/texte";
|
|
7
|
-
import { UNDEFINED_SESSION } from "./datautil";
|
|
8
|
-
import { commonOptions } from "./shared/cli_helpers";
|
|
9
|
-
import { ensureAndClearDir, fetchWithRetry, isOptionEmptyOrHasValue } from "./shared/util";
|
|
10
|
-
// Command-line options of this script: the shared options followed by the
// ones that select which documents are retrieved and how.
const optionsDefinitions = [
  ...commonOptions,
  {
    name: "sessions",
    type: String,
    multiple: true,
    help: "sessions of textes to retrieve; leave empty for all",
  },
  {
    name: "parseDocuments",
    type: Boolean,
    help: "parse and convert documents into JSON (textes only for now, requires format xml)",
  },
  {
    name: "formats",
    alias: "F",
    type: String,
    multiple: true,
    help: "formats of documents to retrieve (xml/html/pdf for textes, html/pdf for rapports); leave empty for all",
  },
  {
    name: "types",
    type: String,
    multiple: true,
    help: "types of documents to retrieve (textes/rapports); leave empty for all",
  },
  {
    name: "force",
    type: Boolean,
    help: "force retrieve all documents, even already retrieved ones",
  },
];

// Parsed command-line arguments, read by the functions of this script.
const options = commandLineArgs(optionsDefinitions);

// Decodes downloaded bytes into text before they are handed to the parsers.
const textDecoder = new TextDecoder("utf8");
|
|
44
|
-
/**
 * Downloads a document and returns its raw bytes.
 *
 * Failures never reject: a non-OK HTTP status, an error thrown by the request
 * or an error while reading the body is logged and reported as `null`.
 *
 * @param {string} documentUrl - Address of the document to download.
 * @returns {Promise<ArrayBuffer | null>} The response body, or `null` when the
 *   document could not be retrieved.
 */
async function retrieveDocument(documentUrl) {
  if (!options.silent) {
    console.log(`Retrieving document ${documentUrl}…`);
  }
  try {
    const response = await fetchWithRetry(documentUrl);
    if (!response.ok) {
      // This function serves textes, rapports and exposés alike, so the
      // messages name a generic "document".
      if (response.status === 404) {
        console.warn(`Document ${documentUrl} not found`);
      } else {
        console.error(`An error occurred while retrieving document ${documentUrl}: ${response.status}`);
      }
      return null;
    }
    // Awaited inside the try block: without `await`, a failure while reading
    // the body would bypass the catch below and reject the returned promise.
    return await response.arrayBuffer();
  } catch (error) {
    // Anything can be thrown, not only Error instances.
    console.error(error instanceof Error ? error.message : String(error));
    return null;
  }
}
|
|
66
|
-
/**
 * Downloads the textes of the requested sessions and, when the
 * `parseDocuments` option is set, converts their XML version to JSON.
 *
 * Original files go to
 * `<dataDir>/TEXTE_FOLDER/TEXTE_ORIGINAL_FOLDER/<session>/<name>/`; parsed
 * JSON goes to the matching tree under TEXTE_TRANSFORMED_FOLDER.
 *
 * Every document (exposé des motifs, xml, html, pdf) is handled on its own:
 * a failed download or a parse error is recorded for the final summary and
 * does not prevent the other documents of the same texte from being
 * retrieved.
 *
 * @param {string} dataDir - Root directory of the data set.
 * @returns {Promise<void>}
 */
async function retrieveTextes(dataDir) {
  const textesDir = path.join(dataDir, TEXTE_FOLDER);
  fs.ensureDirSync(textesDir);
  const originalTextesDir = path.join(textesDir, TEXTE_ORIGINAL_FOLDER);
  const transformedTextesDir = path.join(textesDir, TEXTE_TRANSFORMED_FOLDER);
  if (options.parseDocuments) {
    ensureAndClearDir(transformedTextesDir);
  }

  let retrievedTextesCount = 0;
  const texteUrlsNotFoundOrError = [];
  const texteUrlsParseError = [];

  // `options.sessions` is undefined when the flag is omitted; iterating it
  // directly throws a TypeError although the help says "leave empty for all".
  // NOTE(review): assumes the loader treats an undefined session as "all
  // sessions" — confirm against iterLoadSenatDossiersLegislatifsTexteUrls.
  const sessions = options.sessions?.length ? options.sessions : [undefined];

  for (const session of sessions) {
    for (const { item: texteMetadata } of iterLoadSenatDossiersLegislatifsTexteUrls(dataDir, session)) {
      const sessionFolder = `${texteMetadata.session ?? UNDEFINED_SESSION}`;
      const texteDir = path.join(originalTextesDir, sessionFolder, texteMetadata.name);
      fs.ensureDirSync(texteDir);

      // The exposé des motifs is a complement of the texte: when it cannot be
      // downloaded the failure is reported and the texte itself is still
      // retrieved.
      let exposeDesMotifsContent = null;
      if (texteMetadata.url_expose_des_motifs) {
        if (!options.silent) {
          console.log("Retrieving exposé des motifs…");
        }
        exposeDesMotifsContent = await retrieveDocument(texteMetadata.url_expose_des_motifs.toString());
        if (exposeDesMotifsContent) {
          const exposeDesMotifsPath = path.join(texteDir, `${texteMetadata.name}-expose.html`);
          fs.writeFileSync(exposeDesMotifsPath, Buffer.from(exposeDesMotifsContent));
        } else {
          texteUrlsNotFoundOrError.push(texteMetadata.url_expose_des_motifs);
        }
      }

      if (isOptionEmptyOrHasValue(options.formats, "xml")) {
        const textePath = path.join(texteDir, `${texteMetadata.name}.xml`);
        // Stays null when the file on disk is reused instead of downloaded.
        let texteBuffer = null;
        let isXmlAvailable = true;
        if (!options.force && fs.existsSync(textePath)) {
          if (!options.silent) {
            console.info(`Already retrieved texte ${textePath}…`);
          }
        } else {
          texteBuffer = await retrieveDocument(texteMetadata.url_xml.toString());
          if (texteBuffer) {
            fs.writeFileSync(textePath, Buffer.from(texteBuffer));
            retrievedTextesCount++;
          } else {
            texteUrlsNotFoundOrError.push(texteMetadata.url_xml);
            isXmlAvailable = false;
          }
        }

        if (isXmlAvailable && options.parseDocuments) {
          if (!options.silent) {
            console.log(`Parsing texte ${texteMetadata.name}.xml…`);
          }
          // Parse the fresh download when there is one, the stored file otherwise.
          const parsedTexte = texteBuffer
            ? parseTexte(textDecoder.decode(texteBuffer))
            : await parseTexteFromFile(textePath);
          if (!parsedTexte) {
            texteUrlsParseError.push(texteMetadata.url_xml);
          } else {
            if (exposeDesMotifsContent) {
              if (!options.silent) {
                console.log("Parsing exposé des motifs…");
              }
              const exposeDesMotifsHtml = textDecoder.decode(exposeDesMotifsContent);
              parsedTexte.exposeDesMotifs = parseExposeDesMotifs(exposeDesMotifsHtml);
            }
            const transformedTexteDir = path.join(transformedTextesDir, sessionFolder, texteMetadata.name);
            fs.ensureDirSync(transformedTexteDir);
            fs.writeJSONSync(path.join(transformedTexteDir, `${texteMetadata.name}.json`), parsedTexte, { spaces: 2 });
          }
        }
      }

      // html and pdf are plain downloads that follow the same steps.
      const plainDownloads = [
        { extension: "html", url: texteMetadata.url_html },
        { extension: "pdf", url: texteMetadata.url_pdf },
      ];
      for (const { extension, url } of plainDownloads) {
        if (!isOptionEmptyOrHasValue(options.formats, extension)) {
          continue;
        }
        const textePath = path.join(texteDir, `${texteMetadata.name}.${extension}`);
        if (!options.force && fs.existsSync(textePath)) {
          if (!options.silent) {
            console.info(`Already retrieved texte ${textePath}…`);
          }
          continue;
        }
        const texteBuffer = await retrieveDocument(url.toString());
        if (!texteBuffer) {
          texteUrlsNotFoundOrError.push(url);
          continue;
        }
        fs.writeFileSync(textePath, Buffer.from(texteBuffer));
        retrievedTextesCount++;
      }
    }
  }

  if (options.verbose) {
    console.log(`${retrievedTextesCount} textes retrieved`);
    console.log(`${texteUrlsNotFoundOrError.length} textes failed to be retrieved with URLs ${texteUrlsNotFoundOrError.join(", ")}`);
    if (options.parseDocuments) {
      console.log(`${texteUrlsParseError.length} textes failed to be parsed with URLs ${texteUrlsParseError.join(", ")}`);
    }
  }
}
|
|
182
|
-
/**
 * Downloads the rapports of the requested sessions in the requested formats.
 *
 * Files are written to `<dataDir>/RAPPORT_FOLDER/<session>/<name>/`. The html
 * and pdf versions are handled independently: an html file that is already
 * present, or that fails to download, does not prevent the pdf of the same
 * rapport from being retrieved.
 *
 * @param {string} dataDir - Root directory of the data set.
 * @returns {Promise<void>}
 */
async function retrieveRapports(dataDir) {
  const rapportsDir = path.join(dataDir, RAPPORT_FOLDER);
  fs.ensureDirSync(rapportsDir);

  let retrievedRapportsCount = 0;
  const rapportUrlsNotFoundOrError = [];

  // `options.sessions` is undefined when the flag is omitted; iterating it
  // directly throws a TypeError although the help says "leave empty for all".
  // NOTE(review): assumes the loader treats an undefined session as "all
  // sessions" — confirm against iterLoadSenatDossiersLegislatifsRapportUrls.
  const sessions = options.sessions?.length ? options.sessions : [undefined];

  for (const session of sessions) {
    for (const { item: rapportMetadata } of iterLoadSenatDossiersLegislatifsRapportUrls(dataDir, session)) {
      const rapportDir = path.join(rapportsDir, `${rapportMetadata.session ?? UNDEFINED_SESSION}`, rapportMetadata.name);
      fs.ensureDirSync(rapportDir);

      const downloads = [
        { extension: "html", url: rapportMetadata.url_html },
        { extension: "pdf", url: rapportMetadata.url_pdf },
      ];
      // The `continue` statements below only skip the current format, never
      // the remaining formats of the rapport.
      for (const { extension, url } of downloads) {
        if (!isOptionEmptyOrHasValue(options.formats, extension)) {
          continue;
        }
        const rapportPath = path.join(rapportDir, `${rapportMetadata.name}.${extension}`);
        if (!options.force && fs.existsSync(rapportPath)) {
          if (!options.silent) {
            console.info(`Already retrieved rapport ${rapportPath}…`);
          }
          continue;
        }
        const rapportBuffer = await retrieveDocument(url.toString());
        if (!rapportBuffer) {
          rapportUrlsNotFoundOrError.push(url);
          continue;
        }
        fs.writeFileSync(rapportPath, Buffer.from(rapportBuffer));
        retrievedRapportsCount++;
      }
    }
  }

  if (options.verbose) {
    console.log(`${retrievedRapportsCount} rapports retrieved`);
    console.log(`${rapportUrlsNotFoundOrError.length} rapports failed with URLs ${rapportUrlsNotFoundOrError.join(", ")}`);
  }
}
|
|
230
|
-
/**
 * Entry point of the script: checks that a data directory was given, then
 * runs the retrieval of each requested document type and reports the elapsed
 * time unless the `silent` option is set.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const { dataDir } = options;
  assert(dataDir, "Missing argument: data directory");

  const timerLabel = "documents processing time";
  console.time(timerLabel);

  // Document types in processing order, each with its retrieval routine.
  const steps = [
    ["textes", retrieveTextes],
    ["rapports", retrieveRapports],
  ];
  for (const [type, retrieve] of steps) {
    if (isOptionEmptyOrHasValue(options.types, type)) {
      await retrieve(dataDir);
    }
  }

  if (options.silent) {
    return;
  }
  console.timeEnd(timerLabel);
}
|
|
244
|
-
// Run the script and turn its outcome into the process exit code.
main()
  .then(() => process.exit(0))
  .catch((error) => {
    // Failures go to stderr so they are not mixed with the regular output.
    console.error(error);
    process.exit(1);
  });
|
|
@@ -1,298 +0,0 @@
|
|
|
1
|
-
import assert from "assert"
|
|
2
|
-
import commandLineArgs from "command-line-args"
|
|
3
|
-
import fs from "fs-extra"
|
|
4
|
-
import path from "path"
|
|
5
|
-
|
|
6
|
-
import {
|
|
7
|
-
iterLoadSenatDossiersLegislatifsRapportUrls,
|
|
8
|
-
iterLoadSenatDossiersLegislatifsTexteUrls,
|
|
9
|
-
RAPPORT_FOLDER,
|
|
10
|
-
TEXTE_FOLDER,
|
|
11
|
-
TEXTE_ORIGINAL_FOLDER,
|
|
12
|
-
TEXTE_TRANSFORMED_FOLDER,
|
|
13
|
-
} from "../loaders"
|
|
14
|
-
import { parseExposeDesMotifs, parseTexte, parseTexteFromFile } from "../model/texte"
|
|
15
|
-
import { UNDEFINED_SESSION } from "./datautil"
|
|
16
|
-
import { commonOptions } from "./shared/cli_helpers"
|
|
17
|
-
import { ensureAndClearDir, fetchWithRetry, isOptionEmptyOrHasValue } from "./shared/util"
|
|
18
|
-
|
|
19
|
-
// Command-line options of this script: the shared options followed by the
// ones that select which documents are retrieved and how.
const optionsDefinitions = [
  ...commonOptions,
  {
    name: "sessions",
    type: String,
    multiple: true,
    help: "sessions of textes to retrieve; leave empty for all",
  },
  {
    name: "parseDocuments",
    type: Boolean,
    help: "parse and convert documents into JSON (textes only for now, requires format xml)",
  },
  {
    name: "formats",
    alias: "F",
    type: String,
    multiple: true,
    help: "formats of documents to retrieve (xml/html/pdf for textes, html/pdf for rapports); leave empty for all",
  },
  {
    name: "types",
    type: String,
    multiple: true,
    help: "types of documents to retrieve (textes/rapports); leave empty for all",
  },
  {
    name: "force",
    type: Boolean,
    help: "force retrieve all documents, even already retrieved ones",
  },
]

// Parsed command-line arguments, read by the functions of this script.
const options = commandLineArgs(optionsDefinitions)

// Decodes downloaded bytes into text before they are handed to the parsers.
const textDecoder = new TextDecoder("utf8")
|
|
54
|
-
|
|
55
|
-
/**
 * Downloads a document and returns its raw bytes.
 *
 * Failures never reject: a non-OK HTTP status, an error thrown by the request
 * or an error while reading the body is logged and reported as `null`.
 *
 * @param documentUrl - Address of the document to download.
 * @returns The response body, or `null` when the document could not be
 *   retrieved.
 */
async function retrieveDocument (documentUrl: string): Promise<ArrayBuffer | null> {
  if (!options.silent) {
    console.log(`Retrieving document ${documentUrl}…`)
  }

  try {
    const response = await fetchWithRetry(documentUrl)
    if (!response.ok) {
      // This function serves textes, rapports and exposés alike, so the
      // messages name a generic "document".
      if (response.status === 404) {
        console.warn(`Document ${documentUrl} not found`)
      } else {
        console.error(`An error occurred while retrieving document ${documentUrl}: ${response.status}`)
      }
      return null
    }
    // Awaited inside the try block: without `await`, a failure while reading
    // the body would bypass the catch below and reject the returned promise.
    return await response.arrayBuffer()
  } catch (error: unknown) {
    // Anything can be thrown, not only Error instances.
    console.error(error instanceof Error ? error.message : String(error))
    return null
  }
}
|
|
76
|
-
|
|
77
|
-
/**
 * Downloads the textes of the requested sessions and, when the
 * `parseDocuments` option is set, converts their XML version to JSON.
 *
 * Original files go to
 * `<dataDir>/TEXTE_FOLDER/TEXTE_ORIGINAL_FOLDER/<session>/<name>/`; parsed
 * JSON goes to the matching tree under TEXTE_TRANSFORMED_FOLDER.
 *
 * Every document (exposé des motifs, xml, html, pdf) is handled on its own:
 * a failed download or a parse error is recorded for the final summary and
 * does not prevent the other documents of the same texte from being
 * retrieved.
 *
 * @param dataDir - Root directory of the data set.
 */
async function retrieveTextes (dataDir: string) {
  const textesDir = path.join(dataDir, TEXTE_FOLDER)
  fs.ensureDirSync(textesDir)

  const originalTextesDir = path.join(textesDir, TEXTE_ORIGINAL_FOLDER)
  const transformedTextesDir = path.join(textesDir, TEXTE_TRANSFORMED_FOLDER)
  if (options.parseDocuments) {
    ensureAndClearDir(transformedTextesDir)
  }

  let retrievedTextesCount = 0
  const texteUrlsNotFoundOrError: unknown[] = []
  const texteUrlsParseError: unknown[] = []

  // `options.sessions` is undefined when the flag is omitted; iterating it
  // directly throws a TypeError although the help says "leave empty for all".
  // NOTE(review): assumes the loader treats an undefined session as "all
  // sessions" — confirm against iterLoadSenatDossiersLegislatifsTexteUrls.
  const sessions = options.sessions?.length ? options.sessions : [undefined]

  for (const session of sessions) {
    for (const { item: texteMetadata } of iterLoadSenatDossiersLegislatifsTexteUrls(dataDir, session)) {
      const sessionFolder = `${texteMetadata.session ?? UNDEFINED_SESSION}`
      const texteDir = path.join(originalTextesDir, sessionFolder, texteMetadata.name)
      fs.ensureDirSync(texteDir)

      // The exposé des motifs is a complement of the texte: when it cannot be
      // downloaded the failure is reported and the texte itself is still
      // retrieved.
      let exposeDesMotifsContent = null
      if (texteMetadata.url_expose_des_motifs) {
        if (!options.silent) {
          console.log("Retrieving exposé des motifs…")
        }

        exposeDesMotifsContent = await retrieveDocument(texteMetadata.url_expose_des_motifs.toString())
        if (exposeDesMotifsContent) {
          const exposeDesMotifsPath = path.join(texteDir, `${texteMetadata.name}-expose.html`)
          fs.writeFileSync(exposeDesMotifsPath, Buffer.from(exposeDesMotifsContent))
        } else {
          texteUrlsNotFoundOrError.push(texteMetadata.url_expose_des_motifs)
        }
      }

      if (isOptionEmptyOrHasValue(options.formats, "xml")) {
        const textePath = path.join(texteDir, `${texteMetadata.name}.xml`)
        // Stays null when the file on disk is reused instead of downloaded.
        let texteBuffer = null
        let isXmlAvailable = true

        if (!options.force && fs.existsSync(textePath)) {
          if (!options.silent) {
            console.info(`Already retrieved texte ${textePath}…`)
          }
        } else {
          texteBuffer = await retrieveDocument(texteMetadata.url_xml.toString())
          if (texteBuffer) {
            fs.writeFileSync(textePath, Buffer.from(texteBuffer))
            retrievedTextesCount++
          } else {
            texteUrlsNotFoundOrError.push(texteMetadata.url_xml)
            isXmlAvailable = false
          }
        }

        if (isXmlAvailable && options.parseDocuments) {
          if (!options.silent) {
            console.log(`Parsing texte ${texteMetadata.name}.xml…`)
          }

          // Parse the fresh download when there is one, the stored file otherwise.
          const parsedTexte = texteBuffer
            ? parseTexte(textDecoder.decode(texteBuffer))
            : await parseTexteFromFile(textePath)

          if (!parsedTexte) {
            texteUrlsParseError.push(texteMetadata.url_xml)
          } else {
            if (exposeDesMotifsContent) {
              if (!options.silent) {
                console.log("Parsing exposé des motifs…")
              }

              const exposeDesMotifsHtml = textDecoder.decode(exposeDesMotifsContent)
              parsedTexte.exposeDesMotifs = parseExposeDesMotifs(exposeDesMotifsHtml)
            }

            const transformedTexteDir = path.join(transformedTextesDir, sessionFolder, texteMetadata.name)
            fs.ensureDirSync(transformedTexteDir)

            fs.writeJSONSync(path.join(transformedTexteDir, `${texteMetadata.name}.json`), parsedTexte, { spaces: 2 })
          }
        }
      }

      // html and pdf are plain downloads that follow the same steps.
      const plainDownloads = [
        { extension: "html", url: texteMetadata.url_html },
        { extension: "pdf", url: texteMetadata.url_pdf },
      ]
      for (const { extension, url } of plainDownloads) {
        if (!isOptionEmptyOrHasValue(options.formats, extension)) {
          continue
        }
        const textePath = path.join(texteDir, `${texteMetadata.name}.${extension}`)

        if (!options.force && fs.existsSync(textePath)) {
          if (!options.silent) {
            console.info(`Already retrieved texte ${textePath}…`)
          }
          continue
        }

        const texteBuffer = await retrieveDocument(url.toString())
        if (!texteBuffer) {
          texteUrlsNotFoundOrError.push(url)
          continue
        }
        fs.writeFileSync(textePath, Buffer.from(texteBuffer))
        retrievedTextesCount++
      }
    }
  }

  if (options.verbose) {
    console.log(`${retrievedTextesCount} textes retrieved`)
    console.log(
      `${texteUrlsNotFoundOrError.length} textes failed to be retrieved with URLs ${texteUrlsNotFoundOrError.join(", ")}`
    )
    if (options.parseDocuments) {
      console.log(`${texteUrlsParseError.length} textes failed to be parsed with URLs ${texteUrlsParseError.join(", ")}`)
    }
  }
}
|
|
214
|
-
|
|
215
|
-
/**
 * Downloads the rapports of the requested sessions in the requested formats.
 *
 * Files are written to `<dataDir>/RAPPORT_FOLDER/<session>/<name>/`. The html
 * and pdf versions are handled independently: an html file that is already
 * present, or that fails to download, does not prevent the pdf of the same
 * rapport from being retrieved.
 *
 * @param dataDir - Root directory of the data set.
 */
async function retrieveRapports (dataDir: string) {
  const rapportsDir = path.join(dataDir, RAPPORT_FOLDER)
  fs.ensureDirSync(rapportsDir)

  let retrievedRapportsCount = 0
  const rapportUrlsNotFoundOrError: unknown[] = []

  // `options.sessions` is undefined when the flag is omitted; iterating it
  // directly throws a TypeError although the help says "leave empty for all".
  // NOTE(review): assumes the loader treats an undefined session as "all
  // sessions" — confirm against iterLoadSenatDossiersLegislatifsRapportUrls.
  const sessions = options.sessions?.length ? options.sessions : [undefined]

  for (const session of sessions) {
    for (const { item: rapportMetadata } of iterLoadSenatDossiersLegislatifsRapportUrls(dataDir, session)) {
      const rapportDir = path.join(rapportsDir, `${rapportMetadata.session ?? UNDEFINED_SESSION}`, rapportMetadata.name)
      fs.ensureDirSync(rapportDir)

      const downloads = [
        { extension: "html", url: rapportMetadata.url_html },
        { extension: "pdf", url: rapportMetadata.url_pdf },
      ]
      // The `continue` statements below only skip the current format, never
      // the remaining formats of the rapport.
      for (const { extension, url } of downloads) {
        if (!isOptionEmptyOrHasValue(options.formats, extension)) {
          continue
        }
        const rapportPath = path.join(rapportDir, `${rapportMetadata.name}.${extension}`)

        if (!options.force && fs.existsSync(rapportPath)) {
          if (!options.silent) {
            console.info(`Already retrieved rapport ${rapportPath}…`)
          }
          continue
        }

        const rapportBuffer = await retrieveDocument(url.toString())
        if (!rapportBuffer) {
          rapportUrlsNotFoundOrError.push(url)
          continue
        }
        fs.writeFileSync(rapportPath, Buffer.from(rapportBuffer))
        retrievedRapportsCount++
      }
    }
  }

  if (options.verbose) {
    console.log(`${retrievedRapportsCount} rapports retrieved`)
    console.log(
      `${rapportUrlsNotFoundOrError.length} rapports failed with URLs ${rapportUrlsNotFoundOrError.join(", ")}`
    )
  }
}
|
|
274
|
-
|
|
275
|
-
/**
 * Entry point of the script: checks that a data directory was given, then
 * runs the retrieval of each requested document type and reports the elapsed
 * time unless the `silent` option is set.
 */
async function main() {
  const { dataDir } = options
  assert(dataDir, "Missing argument: data directory")

  const timerLabel = "documents processing time"
  console.time(timerLabel)

  // Document types in processing order, each with its retrieval routine.
  const steps: Array<[string, (dataDir: string) => Promise<void>]> = [
    ["textes", retrieveTextes],
    ["rapports", retrieveRapports],
  ]
  for (const [type, retrieve] of steps) {
    if (isOptionEmptyOrHasValue(options.types, type)) {
      await retrieve(dataDir)
    }
  }

  if (options.silent) {
    return
  }
  console.timeEnd(timerLabel)
}
|
|
292
|
-
|
|
293
|
-
// Run the script and turn its outcome into the process exit code.
main()
  .then(() => process.exit(0))
  .catch((error) => {
    // Failures go to stderr so they are not mixed with the regular output.
    console.error(error)
    process.exit(1)
  })
|