@tricoteuses/senat 2.8.1 → 2.8.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/aggregates.d.ts +52 -0
- package/lib/aggregates.js +949 -0
- package/lib/aggregates.mjs +726 -0
- package/lib/aggregates.ts +852 -0
- package/lib/config.mjs +16 -0
- package/lib/config.ts +26 -0
- package/lib/databases.mjs +55 -0
- package/lib/databases.ts +68 -0
- package/lib/datasets.mjs +78 -0
- package/lib/datasets.ts +118 -0
- package/lib/fields.d.ts +10 -0
- package/lib/fields.js +68 -0
- package/lib/fields.mjs +22 -0
- package/lib/fields.ts +29 -0
- package/lib/index.mjs +7 -0
- package/lib/index.ts +64 -0
- package/lib/inserters.d.ts +98 -0
- package/lib/inserters.js +500 -0
- package/lib/inserters.mjs +360 -0
- package/lib/inserters.ts +521 -0
- package/lib/loaders.mjs +97 -0
- package/lib/loaders.ts +173 -0
- package/lib/model/ameli.mjs +57 -0
- package/lib/model/ameli.ts +86 -0
- package/lib/model/debats.mjs +43 -0
- package/lib/model/debats.ts +68 -0
- package/lib/model/dosleg.mjs +163 -0
- package/lib/model/dosleg.ts +204 -0
- package/lib/model/index.mjs +4 -0
- package/lib/model/index.ts +13 -0
- package/lib/model/questions.d.ts +0 -20
- package/lib/model/questions.js +1 -32
- package/lib/model/questions.mjs +76 -0
- package/lib/model/questions.ts +102 -0
- package/lib/model/sens.mjs +339 -0
- package/lib/model/sens.ts +432 -0
- package/lib/model/texte.mjs +156 -0
- package/lib/model/texte.ts +174 -0
- package/lib/raw_types/ameli.d.ts +20 -0
- package/lib/raw_types/questions.d.ts +4 -70
- package/lib/raw_types_kysely/ameli.d.ts +915 -0
- package/lib/raw_types_kysely/ameli.js +7 -0
- package/lib/raw_types_kysely/ameli.mjs +5 -0
- package/lib/raw_types_kysely/ameli.ts +951 -0
- package/lib/raw_types_kysely/debats.d.ts +207 -0
- package/lib/raw_types_kysely/debats.js +7 -0
- package/lib/raw_types_kysely/debats.mjs +5 -0
- package/lib/raw_types_kysely/debats.ts +222 -0
- package/lib/raw_types_kysely/dosleg.d.ts +3532 -0
- package/lib/raw_types_kysely/dosleg.js +7 -0
- package/lib/raw_types_kysely/dosleg.mjs +5 -0
- package/lib/raw_types_kysely/dosleg.ts +3621 -0
- package/lib/raw_types_kysely/questions.d.ts +414 -0
- package/lib/raw_types_kysely/questions.js +7 -0
- package/lib/raw_types_kysely/questions.mjs +5 -0
- package/lib/raw_types_kysely/questions.ts +426 -0
- package/lib/raw_types_kysely/sens.d.ts +4394 -0
- package/lib/raw_types_kysely/sens.js +7 -0
- package/lib/raw_types_kysely/sens.mjs +5 -0
- package/lib/raw_types_kysely/sens.ts +4499 -0
- package/lib/raw_types_schemats/ameli.mjs +2 -0
- package/lib/raw_types_schemats/ameli.ts +601 -0
- package/lib/raw_types_schemats/debats.mjs +2 -0
- package/lib/raw_types_schemats/debats.ts +145 -0
- package/lib/raw_types_schemats/dosleg.mjs +2 -0
- package/lib/raw_types_schemats/dosleg.ts +2193 -0
- package/lib/raw_types_schemats/questions.mjs +2 -0
- package/lib/raw_types_schemats/questions.ts +249 -0
- package/lib/raw_types_schemats/sens.mjs +2 -0
- package/lib/raw_types_schemats/sens.ts +2907 -0
- package/lib/scripts/convert_data.mjs +95 -0
- package/lib/scripts/convert_data.ts +119 -0
- package/lib/scripts/data-download.d.ts +1 -0
- package/lib/scripts/data-download.js +9 -0
- package/lib/scripts/datautil.mjs +16 -0
- package/lib/scripts/datautil.ts +19 -0
- package/lib/scripts/images/transparent_150x192.jpg +0 -0
- package/lib/scripts/images/transparent_155x225.jpg +0 -0
- package/lib/scripts/parse_textes.mjs +38 -0
- package/lib/scripts/parse_textes.ts +52 -0
- package/lib/scripts/retrieve_documents.mjs +243 -0
- package/lib/scripts/retrieve_documents.ts +279 -0
- package/lib/scripts/retrieve_open_data.js +11 -9
- package/lib/scripts/retrieve_open_data.mjs +214 -0
- package/lib/scripts/retrieve_open_data.ts +261 -0
- package/lib/scripts/retrieve_senateurs_photos.mjs +147 -0
- package/lib/scripts/retrieve_senateurs_photos.ts +177 -0
- package/lib/scripts/retrieve_textes.d.ts +1 -0
- package/lib/scripts/retrieve_textes.mjs +165 -0
- package/lib/scripts/retrieve_textes.ts +79 -0
- package/lib/scripts/shared/cli_helpers.ts +36 -0
- package/lib/scripts/shared/util.ts +33 -0
- package/lib/strings.mjs +18 -0
- package/lib/strings.ts +26 -0
- package/lib/types/ameli.mjs +13 -0
- package/lib/types/ameli.ts +21 -0
- package/lib/types/debats.mjs +2 -0
- package/lib/types/debats.ts +6 -0
- package/lib/types/dosleg.mjs +151 -0
- package/lib/types/dosleg.ts +284 -0
- package/lib/types/questions.mjs +1 -0
- package/lib/types/questions.ts +3 -0
- package/lib/types/sens.mjs +1 -0
- package/lib/types/sens.ts +12 -0
- package/lib/types/sessions.mjs +43 -0
- package/lib/types/sessions.ts +42 -0
- package/lib/types/texte.mjs +16 -0
- package/lib/types/texte.ts +66 -0
- package/lib/typings/windows-1252.d.js +2 -0
- package/lib/typings/windows-1252.d.mjs +2 -0
- package/lib/typings/windows-1252.d.ts +11 -0
- package/lib/validators/config.mjs +54 -0
- package/lib/validators/config.ts +79 -0
- package/lib/validators/senat.mjs +24 -0
- package/lib/validators/senat.ts +26 -0
- package/package.json +6 -4
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
import assert from "assert"
|
|
2
|
+
import commandLineArgs from "command-line-args"
|
|
3
|
+
import fs from "fs-extra"
|
|
4
|
+
import path from "path"
|
|
5
|
+
|
|
6
|
+
import { findSenatRapportUrls, findSenatTexteUrls } from "../model/dosleg"
|
|
7
|
+
import { parseTexte, parseTexteFromFile } from "../model/texte"
|
|
8
|
+
import { UNDEFINED_SESSION } from "./datautil"
|
|
9
|
+
import { commonOptions } from "./shared/cli_helpers"
|
|
10
|
+
import { ensureAndClearDir, fetchWithRetry, isOptionEmptyOrHasValue } from "./shared/util"
|
|
11
|
+
|
|
12
|
+
// Command-line options understood by this script, on top of the shared ones.
const optionsDefinitions = [
  ...commonOptions,
  {
    name: "sessions",
    type: String,
    multiple: true,
    help: "sessions of textes to retrieve; leave empty for all",
  },
  {
    name: "parseDocuments",
    type: Boolean,
    help: "parse and convert documents into JSON (textes only for now, requires format xml)",
  },
  {
    name: "formats",
    alias: "F",
    type: String,
    multiple: true,
    help: "formats of documents to retrieve (xml/html/pdf for textes, html/pdf for rapports); leave empty for all",
  },
  {
    name: "types",
    type: String,
    multiple: true,
    help: "types of documents to retrieve (textes/rapports); leave empty for all",
  },
  {
    name: "force",
    type: Boolean,
    help: "force retrieve all documents, even already retrieved ones",
  },
]
const options = commandLineArgs(optionsDefinitions)

// Base URL against which the `.akn.xml` file names of textes are resolved.
const SENAT_TEXTE_XML_BASE_URL = "https://www.senat.fr/akomantoso/"
// Base URL against which the `.html` and `.pdf` file names of textes are resolved.
const SENAT_TEXTE_BASE_URL = "https://www.senat.fr/leg/"
// Base URL against which the relative paths of rapports are resolved.
const SENAT_RAPPORT_BASE_URL = "https://www.senat.fr/rap/"

// Shared decoder used to turn downloaded XML bytes into a string before parsing.
const textDecoder = new TextDecoder("utf8")
|
|
51
|
+
|
|
52
|
+
/**
 * Download a document (texte or rapport) and return its raw bytes.
 *
 * Failures are logged and never thrown: a non-OK HTTP status, an exception raised by the
 * fetch, or an error while reading the response body all result in `null`.
 *
 * @param documentUrl Absolute URL of the document to download.
 * @returns The response body, or `null` when the document could not be retrieved.
 */
async function retrieveDocument (documentUrl: string): Promise<ArrayBuffer | null> {
  if (!options.silent) {
    console.log(`Retrieving document ${documentUrl}…`)
  }

  try {
    const response = await fetchWithRetry(documentUrl)
    if (!response.ok) {
      if (response.status === 404) {
        console.warn(`Document ${documentUrl} not found`)
      } else {
        console.error(`An error occurred while retrieving document ${documentUrl}: ${response.status}`)
      }
      return null
    }
    // `await` is required here: returning the bare promise would let a body-read failure
    // bypass the `catch` below and reject instead of yielding `null`.
    return await response.arrayBuffer()
  } catch (error: unknown) {
    // Anything can be thrown, so do not assume an Error instance.
    console.error(error instanceof Error ? error.message : String(error))
    return null
  }
}
|
|
73
|
+
|
|
74
|
+
/**
 * Download the textes listed by `findSenatTexteUrls` for the requested sessions.
 *
 * Files are written to `<dataDir>/leg/original/<session>/<texteName>/`. When the
 * `parseDocuments` option is set, each XML texte is also parsed and saved as JSON under
 * `<dataDir>/leg/transformed/<session>/<texteName>/`.
 *
 * Every requested format (xml, html, pdf) is handled independently: a failure or an
 * already-present file for one format does not prevent the other formats of the same texte
 * from being retrieved.
 *
 * @param dataDir Root directory where retrieved data is stored.
 */
async function retrieveTextes (dataDir: string) {
  const textesDir = path.join(dataDir, "leg")
  fs.ensureDirSync(textesDir)
  const originalTextesDir = path.join(textesDir, "original")
  const transformedTextesDir = path.join(textesDir, "transformed")
  // NOTE(review): presumably empties the directory on every run, even when nothing is
  // parsed afterwards — confirm against ensureAndClearDir.
  ensureAndClearDir(transformedTextesDir)

  let retrievedTextesCount = 0
  const texteUrlsNotFoundOrError: string[] = []
  const texteUrlsParseError: string[] = []

  for await (const texte of findSenatTexteUrls(options.sessions)) {
    const texteName = path.parse(texte.url).name
    const sessionDirName = `${texte.session ?? UNDEFINED_SESSION}`
    const texteDir = path.join(originalTextesDir, sessionDirName, texteName)
    fs.ensureDirSync(texteDir)

    if (isOptionEmptyOrHasValue(options.formats, "xml")) {
      const texteXmlUrl = `${texteName}.akn.xml`
      const texteXmlAbsoluteUrl = new URL(texteXmlUrl, SENAT_TEXTE_XML_BASE_URL).toString()
      const textePath = path.join(texteDir, texteXmlUrl)
      // Stays null when the file on disk is reused instead of being downloaded.
      let texteBuffer: ArrayBuffer | null = null
      let isXmlAvailable = true

      if (!options.force && fs.existsSync(textePath)) {
        if (!options.silent) {
          console.info(`Already retrieved texte ${textePath}…`)
        }
      } else {
        texteBuffer = await retrieveDocument(texteXmlAbsoluteUrl)
        if (texteBuffer) {
          fs.writeFileSync(textePath, Buffer.from(texteBuffer))
          retrievedTextesCount++
        } else {
          // Only the XML is unavailable: fall through so html/pdf are still attempted.
          texteUrlsNotFoundOrError.push(texteXmlAbsoluteUrl)
          isXmlAvailable = false
        }
      }

      if (isXmlAvailable && options.parseDocuments) {
        if (!options.silent) {
          console.log(`Parsing texte ${texteXmlUrl}…`)
        }

        // Parse the freshly downloaded bytes when available, otherwise the file on disk.
        const parsedTexte = texteBuffer
          ? parseTexte(textDecoder.decode(texteBuffer))
          : await parseTexteFromFile(textePath)

        if (parsedTexte) {
          const transformedTexteDir = path.join(transformedTextesDir, sessionDirName, texteName)
          fs.ensureDirSync(transformedTexteDir)
          fs.writeJSONSync(path.join(transformedTexteDir, `${texteName}.akn.json`), parsedTexte, { spaces: 2 })
        } else {
          texteUrlsParseError.push(texteXmlAbsoluteUrl)
        }
      }
    }

    // html and pdf share the same base URL and naming scheme. `continue` below only moves
    // on to the next format, never to the next texte.
    for (const format of ["html", "pdf"]) {
      if (!isOptionEmptyOrHasValue(options.formats, format)) {
        continue
      }

      const texteFileName = `${texteName}.${format}`
      const texteAbsoluteUrl = new URL(texteFileName, SENAT_TEXTE_BASE_URL).toString()
      const textePath = path.join(texteDir, texteFileName)

      if (!options.force && fs.existsSync(textePath)) {
        if (!options.silent) {
          console.info(`Already retrieved texte ${textePath}…`)
        }
        continue
      }

      const texteBuffer = await retrieveDocument(texteAbsoluteUrl)
      if (!texteBuffer) {
        texteUrlsNotFoundOrError.push(texteAbsoluteUrl)
        continue
      }
      fs.writeFileSync(textePath, Buffer.from(texteBuffer))
      retrievedTextesCount++
    }
  }

  if (options.verbose) {
    console.log(`${retrievedTextesCount} textes retrieved`)
    console.log(
      `${texteUrlsNotFoundOrError.length} textes failed to be retrieved with URLs ${texteUrlsNotFoundOrError.join(", ")}`
    )
    if (options.parseDocuments) {
      console.log(`${texteUrlsParseError.length} textes failed to be parsed with URLs ${texteUrlsParseError.join(", ")}`)
    }
  }
  // TODO retrieve exposé des motifs (/leg/exposes-des-motifs)
}
|
|
189
|
+
|
|
190
|
+
/**
 * Download the rapports listed by `findSenatRapportUrls` for the requested sessions.
 *
 * Files are written to `<dataDir>/rap/<session>/<rapportName>/`. The HTML version is
 * fetched as `<rapportName>_mono.html` and the PDF version as `<rapportName>1.pdf`, both
 * from the directory of the rapport URL.
 *
 * Each requested format is handled independently: a failure or an already-present file for
 * one format does not prevent the other format of the same rapport from being retrieved.
 *
 * @param dataDir Root directory where retrieved data is stored.
 */
async function retrieveRapports (dataDir: string) {
  const rapportsDir = path.join(dataDir, "rap")
  fs.ensureDirSync(rapportsDir)

  let retrievedRapportsCount = 0
  const rapportUrlsNotFoundOrError: string[] = []

  for await (const rapport of findSenatRapportUrls(options.sessions)) {
    const parsedRapportUrl = path.parse(rapport.url)
    const rapportName = parsedRapportUrl.name
    const rapportDir = path.join(rapportsDir, `${rapport.session ?? UNDEFINED_SESSION}`, rapportName)
    fs.ensureDirSync(rapportDir)

    const rapportFiles = [
      { format: "html", base: `${rapportName}_mono.html` },
      { format: "pdf", base: `${rapportName}1.pdf` },
    ]

    // `continue` below only moves on to the next format, never to the next rapport.
    for (const { format, base } of rapportFiles) {
      if (!isOptionEmptyOrHasValue(options.formats, format)) {
        continue
      }

      const rapportUrl = path.format({
        dir: parsedRapportUrl.dir,
        base,
      })
      const rapportAbsoluteUrl = new URL(rapportUrl, SENAT_RAPPORT_BASE_URL).toString()
      const rapportPath = path.join(rapportDir, base)

      if (!options.force && fs.existsSync(rapportPath)) {
        if (!options.silent) {
          console.info(`Already retrieved rapport ${rapportPath}…`)
        }
        continue
      }

      const rapportBuffer = await retrieveDocument(rapportAbsoluteUrl)
      if (!rapportBuffer) {
        rapportUrlsNotFoundOrError.push(rapportAbsoluteUrl)
        continue
      }
      fs.writeFileSync(rapportPath, Buffer.from(rapportBuffer))
      retrievedRapportsCount++
    }
  }

  if (options.verbose) {
    console.log(`${retrievedRapportsCount} rapports retrieved`)
    console.log(
      `${rapportUrlsNotFoundOrError.length} rapports failed with URLs ${rapportUrlsNotFoundOrError.join(", ")}`
    )
  }
}
|
|
261
|
+
|
|
262
|
+
/**
 * Script entry point: check that a data directory was given, then run the retrievers for
 * the document types selected with the `types` option (all of them when it is empty),
 * textes first and rapports second.
 */
async function main() {
  const dataDir = options.dataDir
  assert(dataDir, "Missing argument: data directory")

  const retrieversByType = [
    ["textes", retrieveTextes],
    ["rapports", retrieveRapports],
  ] as const

  // Run sequentially, in declaration order.
  for (const [documentType, retrieve] of retrieversByType) {
    if (isOptionEmptyOrHasValue(options.types, documentType)) {
      await retrieve(dataDir)
    }
  }
}
|
|
273
|
+
|
|
274
|
+
// Run the script: exit with status 0 on success, or report the failure on stderr and exit
// with status 1.
main()
  .then(() => process.exit(0))
  .catch((error) => {
    // Errors go to stderr so they are not mixed with the regular progress output.
    console.error(error)
    process.exit(1)
  })
|
|
@@ -4,11 +4,11 @@ import commandLineArgs from "command-line-args";
|
|
|
4
4
|
import fs from "fs-extra";
|
|
5
5
|
// import fetch from "node-fetch"
|
|
6
6
|
import path from "path";
|
|
7
|
-
// import stream from "stream"
|
|
8
7
|
import StreamZip from "node-stream-zip";
|
|
9
8
|
import readline from "readline";
|
|
10
|
-
// import util from "util"
|
|
11
9
|
import windows1252 from "windows-1252";
|
|
10
|
+
import { pipeline } from "stream";
|
|
11
|
+
import { promisify } from "util";
|
|
12
12
|
import config from "../config";
|
|
13
13
|
import { datasets, getChosenDatasets, getEnabledDatasets } from "../datasets";
|
|
14
14
|
import { commonOptions } from "./shared/cli_helpers";
|
|
@@ -59,7 +59,14 @@ const optionsDefinitions = [
|
|
|
59
59
|
},
|
|
60
60
|
];
|
|
61
61
|
const options = commandLineArgs(optionsDefinitions);
|
|
62
|
-
|
|
62
|
+
const streamPipeline = promisify(pipeline);
/**
 * Download `url` and stream the response body into the file `dest`.
 *
 * @param {string} url - URL to download.
 * @param {string} dest - Path of the file to write.
 * @returns {Promise<void>} Resolves once the body has been fully written.
 * @throws {Error} When the response status is not OK, the response has no body, or the
 *   transfer fails part-way (in which case the partial file is removed).
 */
async function downloadFile(url, dest) {
    const response = await fetch(url);
    if (!response.ok) {
        throw new Error(`Download failed ${response.status} ${response.statusText} for ${url}`);
    }
    if (response.body == null) {
        throw new Error(`Download failed: empty response body for ${url}`);
    }
    try {
        await streamPipeline(response.body, fs.createWriteStream(dest));
    }
    catch (error) {
        // Do not leave a truncated file behind that could be mistaken for a complete download.
        fs.removeSync(dest);
        throw error;
    }
}
|
|
63
70
|
async function retrieveDataset(dataDir, dataset) {
|
|
64
71
|
const zipFilename = dataset.url.substring(dataset.url.lastIndexOf("/") + 1);
|
|
65
72
|
const zipFilePath = path.join(dataDir, zipFilename);
|
|
@@ -78,12 +85,7 @@ async function retrieveDataset(dataDir, dataset) {
|
|
|
78
85
|
// }
|
|
79
86
|
// await pipeline(response.body!, fs.createWriteStream(zipFilePath))
|
|
80
87
|
fs.removeSync(zipFilePath);
|
|
81
|
-
|
|
82
|
-
cwd: dataDir,
|
|
83
|
-
env: process.env,
|
|
84
|
-
encoding: "utf-8",
|
|
85
|
-
// stdio: ["ignore", "ignore", "pipe"],
|
|
86
|
-
});
|
|
88
|
+
await downloadFile(dataset.url, zipFilePath);
|
|
87
89
|
}
|
|
88
90
|
const sqlFilename = `${dataset.database}.sql`;
|
|
89
91
|
const sqlFilePath = path.join(dataDir, sqlFilename);
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
import assert from "assert";
|
|
2
|
+
import { execSync } from "child_process";
|
|
3
|
+
import commandLineArgs from "command-line-args";
|
|
4
|
+
import fs from "fs-extra";
|
|
5
|
+
// import fetch from "node-fetch"
|
|
6
|
+
import path from "path";
|
|
7
|
+
// import stream from "stream"
|
|
8
|
+
import StreamZip from "node-stream-zip";
|
|
9
|
+
import readline from "readline";
|
|
10
|
+
// import util from "util"
|
|
11
|
+
import windows1252 from "windows-1252";
|
|
12
|
+
import config from "../config";
|
|
13
|
+
import { getChosenFromEnabledDatasets, } from "../datasets";
|
|
14
|
+
import { commonOptions } from "./shared/cli_helpers";
|
|
15
|
+
// Matches the characters U+0080 to U+009F, which are re-decoded as Windows-1252 when
// repairing SQL dumps.
const badWindows1252CharacterRegex = /[\u0080-\u009f]/g;
// Command-line options understood by this script, on top of the shared ones.
const optionsDefinitions = [
    ...commonOptions,
    {
        name: "all",
        alias: "a",
        type: Boolean,
        help: "all options: fetch, unzip, repair-encoding, import",
    },
    {
        name: "fetch",
        alias: "f",
        type: Boolean,
        help: "fetch datasets instead of retrieving them from files",
    },
    {
        name: "unzip",
        alias: "z",
        type: Boolean,
        help: "unzip SQL files",
    },
    {
        name: "repairEncoding",
        alias: "e",
        type: Boolean,
        help: "repair Windows CP 1252 encoding of SQL dumps",
    },
    {
        name: "import",
        alias: "i",
        type: Boolean,
        help: "import SQL dumps into a freshly (re-)created database",
    },
    {
        name: "repairDatabase",
        alias: "d",
        type: Boolean,
        help: "repair database (update schema and types)",
    },
    {
        name: "schema",
        alias: "c",
        type: Boolean,
        help: "create TypeScript interfaces from databases schemas into src/raw_types_* directories",
    },
];
const options = commandLineArgs(optionsDefinitions);
|
|
62
|
+
// const pipeline = util.promisify(stream.pipeline)
|
|
63
|
+
/**
 * Run, for one dataset, the steps selected on the command line:
 * fetch the ZIP archive, unzip it, repair the encoding of the SQL dump, import the dump
 * into a freshly re-created database, and generate TypeScript definitions from its schema.
 *
 * @param {string} dataDir - Directory where the archive and SQL dump are stored.
 * @param {object} dataset - Dataset description; this function reads its `url`, `title`,
 *   `database`, `schema`, `repairEncoding` and optional `repairZip` members.
 * @returns {Promise<void>}
 */
async function retrieveDataset(dataDir, dataset) {
    const zipFilename = dataset.url.substring(dataset.url.lastIndexOf("/") + 1);
    const zipFilePath = path.join(dataDir, zipFilename);
    if (options.all || options.fetch) {
        // Fetch & save ZIP file.
        if (!options.silent) {
            console.log(`Loading ${dataset.title}: ${zipFilename}…`);
        }
        // Fetch fails with OpenSSL error: dh key too small (so does "curl"),
        // hence the download through wget.
        fs.removeSync(zipFilePath);
        execSync(`wget --quiet ${dataset.url}`, {
            cwd: dataDir,
            env: process.env,
            encoding: "utf-8",
        });
    }
    const sqlFilename = `${dataset.database}.sql`;
    const sqlFilePath = path.join(dataDir, sqlFilename);
    if (options.all || options.unzip) {
        if (!options.silent) {
            console.log(`Unzipping ${dataset.title}: ${zipFilename}…`);
        }
        fs.removeSync(sqlFilePath);
        const zip = new StreamZip({
            file: zipFilePath,
            storeEntries: true,
        });
        await new Promise((resolve, reject) => {
            // Surface archive errors (missing or corrupt file) as a rejection instead of an
            // unhandled "error" event.
            zip.on("error", reject);
            zip.on("ready", () => {
                zip.extract(null, dataDir, (err, _count) => {
                    zip.close();
                    if (err) {
                        reject(err);
                    }
                    else {
                        resolve(null);
                    }
                });
            });
        });
        if (dataset.repairZip !== undefined) {
            if (!options.silent) {
                console.log(`Repairing Zip path ${dataset.title}: ${sqlFilename}…`);
            }
            dataset.repairZip(dataset, dataDir);
        }
    }
    if ((options.all || options.repairEncoding) && dataset.repairEncoding) {
        if (!options.silent) {
            console.log(`Repairing Windows CP1252 encoding of ${dataset.title}: ${sqlFilename}…`);
        }
        const repairedSqlFilePath = sqlFilePath + ".repaired";
        const repairedSqlWriter = fs.createWriteStream(repairedSqlFilePath, {
            encoding: "utf8",
        });
        const lineReader = readline.createInterface({
            input: fs.createReadStream(sqlFilePath, { encoding: "utf8" }),
            crlfDelay: Infinity,
        });
        for await (const line of lineReader) {
            const repairedLine = line.replace(badWindows1252CharacterRegex, (match) => windows1252.decode(match, { mode: "fatal" }));
            // Respect backpressure so a large dump is not buffered entirely in memory.
            if (!repairedSqlWriter.write(repairedLine + "\n")) {
                await new Promise((resolve) => repairedSqlWriter.once("drain", resolve));
            }
        }
        // Wait until everything is flushed: the file is moved and then read synchronously
        // by psql right after, which would otherwise see an incomplete dump.
        await new Promise((resolve, reject) => {
            repairedSqlWriter.once("finish", resolve);
            repairedSqlWriter.once("error", reject);
            repairedSqlWriter.end();
        });
        await fs.move(repairedSqlFilePath, sqlFilePath, { overwrite: true });
    }
    if (options.all || options.import) {
        if (!options.silent) {
            console.log(`Importing ${dataset.title}: ${sqlFilename}…`);
        }
        const psqlOptions = {
            cwd: dataDir,
            env: process.env,
            encoding: "utf-8",
        };
        execSync(`psql -c "DROP DATABASE IF EXISTS ${dataset.database}"`, psqlOptions);
        execSync(`psql -c "CREATE DATABASE ${dataset.database} WITH OWNER opendata"`, psqlOptions);
        execSync(`psql -f ${sqlFilename} ${dataset.database}`, psqlOptions);
    }
    if (options.schema) {
        const schematsDir = path.resolve("src", "raw_types_schemats");
        assert(fs.statSync(schematsDir).isDirectory());
        if (!options.silent) {
            console.log(`Creating TypeScript definitions from schema of database ${dataset.database}…`);
        }
        // NOTE(review): the credentials are interpolated unquoted into shell command lines
        // below; special characters in the password would break the commands.
        const dbConnectionString = `postgres://${process.env.PGUSER}:${process.env.PGPASSWORD}@${process.env.PGHOST}:${process.env.PGPORT}/${dataset.database}`;
        const codegenOptions = {
            env: process.env,
            encoding: "utf-8",
        };
        const schematsFilePath = path.join(schematsDir, `${dataset.database}.ts`);
        execSync(`npx schemats generate -c ${dbConnectionString} -s ${dataset.schema} -o ${schematsFilePath}`, codegenOptions);
        // Normalize line endings and drop the generation timestamp from the generated file.
        const definition = fs.readFileSync(schematsFilePath, { encoding: "utf8" });
        const definitionRepaired = definition
            .replace(/\r\n/g, "\n")
            .replace(/AUTO-GENERATED FILE @ \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}/, "AUTO-GENERATED FILE");
        fs.writeFileSync(schematsFilePath, definitionRepaired);
        const kyselyDir = path.resolve("src", "raw_types_kysely");
        const kyselyFilePath = path.join(kyselyDir, `${dataset.database}.ts`);
        execSync(`kysely-codegen --url ${dbConnectionString} --schema=${dataset.schema} --out-file=${kyselyFilePath}`, codegenOptions);
    }
}
|
|
189
|
+
/**
 * Script entry point: complete the PostgreSQL environment variables from the configuration,
 * check that the data directory and database settings are present, then process every
 * selected dataset one after the other.
 *
 * @returns {Promise<void>}
 */
async function retrieveOpenData() {
    const dataDir = options.dataDir;
    assert(dataDir, "Missing argument: data directory");
    // Variables already set in the environment win over the configuration values.
    const inheritedEnv = process.env;
    process.env = {
        ...inheritedEnv,
        PGHOST: inheritedEnv.PGHOST || config.db.host,
        PGPORT: inheritedEnv.PGPORT || config.db.port,
        PGUSER: inheritedEnv.PGUSER || config.db.user,
        PGPASSWORD: inheritedEnv.PGPASSWORD || config.db.password,
    };
    const { PGHOST, PGPORT, PGUSER, PGPASSWORD } = process.env;
    assert(PGHOST && PGPORT && PGUSER && PGPASSWORD, "Missing database configuration: environment variables PGHOST, PGPORT, PGUSER and PGPASSWORD or TRICOTEUSES_SENAT_DB_* in .env file");
    const selectedDatasets = getChosenFromEnabledDatasets(options.categories);
    // Datasets are handled sequentially rather than concurrently.
    for (const selectedDataset of selectedDatasets) {
        await retrieveDataset(dataDir, selectedDataset);
    }
}
|
|
209
|
+
// Run the script: exit with status 0 on success, or report the failure on stderr and exit
// with status 1.
retrieveOpenData()
    .then(() => process.exit(0))
    .catch((error) => {
        // Errors go to stderr so they are not mixed with the regular progress output.
        console.error(error);
        process.exit(1);
    });
|