@tricoteuses/senat 1.3.1 → 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (254) hide show
  1. package/lib/config.d.ts +1 -0
  2. package/lib/config.js +14 -45
  3. package/lib/databases.js +86 -143
  4. package/lib/datasets.js +78 -83
  5. package/lib/index.d.ts +12 -4
  6. package/lib/index.js +42 -419
  7. package/lib/loaders.js +149 -654
  8. package/lib/model/ameli.js +83 -21
  9. package/lib/model/debats.js +0 -1
  10. package/lib/model/dosleg.d.ts +1 -1
  11. package/lib/model/dosleg.js +179 -73
  12. package/lib/model/index.d.ts +3 -3
  13. package/lib/model/index.js +12 -46
  14. package/lib/model/questions.js +68 -39
  15. package/lib/model/sens.d.ts +1 -1
  16. package/lib/model/sens.js +383 -113
  17. package/lib/model/texte.js +220 -290
  18. package/lib/model/util.js +9 -26
  19. package/lib/raw_types/ameli.js +5 -6
  20. package/lib/raw_types/debats.js +5 -6
  21. package/lib/raw_types/dosleg.js +5 -6
  22. package/lib/raw_types/questions.js +5 -6
  23. package/lib/raw_types/sens.js +5 -6
  24. package/lib/raw_types_schemats/ameli.js +1 -43
  25. package/lib/raw_types_schemats/debats.js +1 -22
  26. package/lib/raw_types_schemats/dosleg.js +1 -96
  27. package/lib/raw_types_schemats/questions.js +1 -22
  28. package/lib/raw_types_schemats/sens.js +1 -112
  29. package/lib/scripts/convert_data.js +181 -631
  30. package/lib/scripts/datautil.js +17 -60
  31. package/lib/scripts/parse_textes.js +46 -129
  32. package/lib/scripts/retrieve_documents.js +247 -513
  33. package/lib/scripts/retrieve_open_data.js +211 -368
  34. package/lib/scripts/retrieve_senateurs_photos.js +144 -239
  35. package/lib/scripts/shared/cli_helpers.js +30 -30
  36. package/lib/scripts/shared/util.js +28 -94
  37. package/lib/strings.js +20 -45
  38. package/lib/types/ameli.d.ts +1 -1
  39. package/lib/types/ameli.js +14 -25
  40. package/lib/types/debats.d.ts +1 -1
  41. package/lib/types/debats.js +3 -21
  42. package/lib/types/dosleg.d.ts +1 -1
  43. package/lib/types/dosleg.js +152 -119
  44. package/lib/types/questions.d.ts +1 -1
  45. package/lib/types/questions.js +1 -13
  46. package/lib/types/sens.d.ts +1 -1
  47. package/lib/types/sens.js +1 -13
  48. package/lib/types/sessions.js +44 -49
  49. package/lib/types/texte.js +17 -22
  50. package/lib/validators/config.js +47 -111
  51. package/lib/validators/senat.js +1 -5
  52. package/package.json +18 -40
  53. package/lib/aggregates.d.ts +0 -52
  54. package/lib/aggregates.mjs +0 -930
  55. package/lib/aggregates.ts +0 -833
  56. package/lib/config.mjs +0 -16
  57. package/lib/config.ts +0 -26
  58. package/lib/data/legislatures.json +0 -38
  59. package/lib/databases.mjs +0 -57
  60. package/lib/databases.ts +0 -71
  61. package/lib/datasets.mjs +0 -78
  62. package/lib/datasets.ts +0 -118
  63. package/lib/fields.d.ts +0 -10
  64. package/lib/fields.mjs +0 -68
  65. package/lib/fields.ts +0 -29
  66. package/lib/index.mjs +0 -4
  67. package/lib/index.ts +0 -42
  68. package/lib/inserters.d.ts +0 -98
  69. package/lib/inserters.mjs +0 -500
  70. package/lib/inserters.ts +0 -521
  71. package/lib/loaders.mjs +0 -158
  72. package/lib/loaders.ts +0 -271
  73. package/lib/model/ameli.mjs +0 -84
  74. package/lib/model/ameli.ts +0 -100
  75. package/lib/model/debats.mjs +0 -1
  76. package/lib/model/debats.ts +0 -0
  77. package/lib/model/dosleg.mjs +0 -196
  78. package/lib/model/dosleg.ts +0 -240
  79. package/lib/model/index.mjs +0 -4
  80. package/lib/model/index.ts +0 -14
  81. package/lib/model/questions.mjs +0 -71
  82. package/lib/model/questions.ts +0 -93
  83. package/lib/model/sens.mjs +0 -415
  84. package/lib/model/sens.ts +0 -516
  85. package/lib/model/texte.mjs +0 -208
  86. package/lib/model/texte.ts +0 -229
  87. package/lib/model/util.mjs +0 -19
  88. package/lib/model/util.ts +0 -32
  89. package/lib/raw_types/ameli.mjs +0 -5
  90. package/lib/raw_types/ameli.ts +0 -951
  91. package/lib/raw_types/debats.mjs +0 -5
  92. package/lib/raw_types/debats.ts +0 -222
  93. package/lib/raw_types/dosleg.mjs +0 -5
  94. package/lib/raw_types/dosleg.ts +0 -3625
  95. package/lib/raw_types/questions.mjs +0 -5
  96. package/lib/raw_types/questions.ts +0 -427
  97. package/lib/raw_types/sens.mjs +0 -5
  98. package/lib/raw_types/sens.ts +0 -4499
  99. package/lib/raw_types_kysely/ameli.d.ts +0 -6
  100. package/lib/raw_types_kysely/ameli.mjs +0 -7
  101. package/lib/raw_types_kysely/ameli.ts +0 -6
  102. package/lib/raw_types_kysely/debats.d.ts +0 -6
  103. package/lib/raw_types_kysely/debats.mjs +0 -7
  104. package/lib/raw_types_kysely/debats.ts +0 -6
  105. package/lib/raw_types_kysely/dosleg.d.ts +0 -6
  106. package/lib/raw_types_kysely/dosleg.mjs +0 -7
  107. package/lib/raw_types_kysely/dosleg.ts +0 -6
  108. package/lib/raw_types_kysely/questions.d.ts +0 -6
  109. package/lib/raw_types_kysely/questions.mjs +0 -7
  110. package/lib/raw_types_kysely/questions.ts +0 -6
  111. package/lib/raw_types_kysely/sens.d.ts +0 -6
  112. package/lib/raw_types_kysely/sens.mjs +0 -7
  113. package/lib/raw_types_kysely/sens.ts +0 -6
  114. package/lib/raw_types_kysely/texte.d.ts +0 -45
  115. package/lib/raw_types_kysely/texte.mjs +0 -7
  116. package/lib/raw_types_kysely/texte.ts +0 -53
  117. package/lib/raw_types_schemats/ameli.mjs +0 -2
  118. package/lib/raw_types_schemats/ameli.ts +0 -601
  119. package/lib/raw_types_schemats/debats.mjs +0 -2
  120. package/lib/raw_types_schemats/debats.ts +0 -145
  121. package/lib/raw_types_schemats/dosleg.mjs +0 -2
  122. package/lib/raw_types_schemats/dosleg.ts +0 -2195
  123. package/lib/raw_types_schemats/questions.mjs +0 -2
  124. package/lib/raw_types_schemats/questions.ts +0 -251
  125. package/lib/raw_types_schemats/sens.mjs +0 -2
  126. package/lib/raw_types_schemats/sens.ts +0 -2907
  127. package/lib/scripts/convert_data.mjs +0 -181
  128. package/lib/scripts/convert_data.ts +0 -243
  129. package/lib/scripts/datautil.mjs +0 -16
  130. package/lib/scripts/datautil.ts +0 -19
  131. package/lib/scripts/images/transparent_150x192.jpg +0 -0
  132. package/lib/scripts/images/transparent_155x225.jpg +0 -0
  133. package/lib/scripts/parse_textes.mjs +0 -46
  134. package/lib/scripts/parse_textes.ts +0 -65
  135. package/lib/scripts/retrieve_documents.mjs +0 -249
  136. package/lib/scripts/retrieve_documents.ts +0 -298
  137. package/lib/scripts/retrieve_open_data.mjs +0 -217
  138. package/lib/scripts/retrieve_open_data.ts +0 -274
  139. package/lib/scripts/retrieve_senateurs_photos.mjs +0 -147
  140. package/lib/scripts/retrieve_senateurs_photos.ts +0 -177
  141. package/lib/scripts/retrieve_textes.d.ts +0 -1
  142. package/lib/scripts/retrieve_textes.mjs +0 -328
  143. package/lib/scripts/retrieve_textes.ts +0 -143
  144. package/lib/scripts/shared/cli_helpers.ts +0 -36
  145. package/lib/scripts/shared/util.ts +0 -33
  146. package/lib/src/aggregates.d.ts +0 -52
  147. package/lib/src/aggregates.mjs +0 -726
  148. package/lib/src/config.d.ts +0 -2
  149. package/lib/src/config.mjs +0 -16
  150. package/lib/src/databases.d.ts +0 -18
  151. package/lib/src/databases.mjs +0 -55
  152. package/lib/src/datasets.d.ts +0 -28
  153. package/lib/src/datasets.mjs +0 -78
  154. package/lib/src/fields.d.ts +0 -10
  155. package/lib/src/fields.mjs +0 -22
  156. package/lib/src/index.d.ts +0 -8
  157. package/lib/src/index.mjs +0 -7
  158. package/lib/src/inserters.d.ts +0 -98
  159. package/lib/src/inserters.mjs +0 -360
  160. package/lib/src/loaders.d.ts +0 -36
  161. package/lib/src/loaders.mjs +0 -107
  162. package/lib/src/model/ameli.d.ts +0 -4
  163. package/lib/src/model/ameli.js +0 -57
  164. package/lib/src/model/debats.d.ts +0 -4
  165. package/lib/src/model/debats.js +0 -43
  166. package/lib/src/model/dosleg.d.ts +0 -197
  167. package/lib/src/model/dosleg.js +0 -169
  168. package/lib/src/model/index.d.ts +0 -4
  169. package/lib/src/model/index.js +0 -4
  170. package/lib/src/model/questions.d.ts +0 -89
  171. package/lib/src/model/questions.js +0 -76
  172. package/lib/src/model/sens.d.ts +0 -390
  173. package/lib/src/model/sens.js +0 -339
  174. package/lib/src/model/texte.d.ts +0 -7
  175. package/lib/src/model/texte.js +0 -183
  176. package/lib/src/raw_types_kysely/ameli.d.ts +0 -915
  177. package/lib/src/raw_types_kysely/ameli.js +0 -5
  178. package/lib/src/raw_types_kysely/debats.d.ts +0 -207
  179. package/lib/src/raw_types_kysely/debats.js +0 -5
  180. package/lib/src/raw_types_kysely/dosleg.d.ts +0 -3532
  181. package/lib/src/raw_types_kysely/dosleg.js +0 -5
  182. package/lib/src/raw_types_kysely/questions.d.ts +0 -414
  183. package/lib/src/raw_types_kysely/questions.js +0 -5
  184. package/lib/src/raw_types_kysely/sens.d.ts +0 -4394
  185. package/lib/src/raw_types_kysely/sens.js +0 -5
  186. package/lib/src/raw_types_schemats/ameli.d.ts +0 -541
  187. package/lib/src/raw_types_schemats/ameli.js +0 -2
  188. package/lib/src/raw_types_schemats/debats.d.ts +0 -127
  189. package/lib/src/raw_types_schemats/debats.js +0 -2
  190. package/lib/src/raw_types_schemats/dosleg.d.ts +0 -2027
  191. package/lib/src/raw_types_schemats/dosleg.js +0 -2
  192. package/lib/src/raw_types_schemats/questions.d.ts +0 -231
  193. package/lib/src/raw_types_schemats/questions.js +0 -2
  194. package/lib/src/raw_types_schemats/sens.d.ts +0 -2709
  195. package/lib/src/raw_types_schemats/sens.js +0 -2
  196. package/lib/src/scripts/convert_data.d.ts +0 -1
  197. package/lib/src/scripts/convert_data.js +0 -95
  198. package/lib/src/scripts/datautil.d.ts +0 -5
  199. package/lib/src/scripts/datautil.js +0 -16
  200. package/lib/src/scripts/parse_textes.d.ts +0 -1
  201. package/lib/src/scripts/parse_textes.js +0 -47
  202. package/lib/src/scripts/retrieve_documents.d.ts +0 -1
  203. package/lib/src/scripts/retrieve_documents.js +0 -258
  204. package/lib/src/scripts/retrieve_open_data.d.ts +0 -1
  205. package/lib/src/scripts/retrieve_open_data.js +0 -214
  206. package/lib/src/scripts/retrieve_senateurs_photos.d.ts +0 -1
  207. package/lib/src/scripts/retrieve_senateurs_photos.js +0 -147
  208. package/lib/src/scripts/shared/cli_helpers.d.ts +0 -44
  209. package/lib/src/scripts/shared/cli_helpers.js +0 -32
  210. package/lib/src/scripts/shared/util.d.ts +0 -3
  211. package/lib/src/scripts/shared/util.js +0 -28
  212. package/lib/src/strings.d.ts +0 -1
  213. package/lib/src/strings.mjs +0 -18
  214. package/lib/src/types/ameli.d.ts +0 -10
  215. package/lib/src/types/ameli.js +0 -13
  216. package/lib/src/types/debats.d.ts +0 -4
  217. package/lib/src/types/debats.js +0 -2
  218. package/lib/src/types/dosleg.d.ts +0 -98
  219. package/lib/src/types/dosleg.js +0 -151
  220. package/lib/src/types/questions.d.ts +0 -2
  221. package/lib/src/types/questions.js +0 -1
  222. package/lib/src/types/sens.d.ts +0 -10
  223. package/lib/src/types/sens.js +0 -1
  224. package/lib/src/types/sessions.d.ts +0 -42
  225. package/lib/src/types/sessions.js +0 -43
  226. package/lib/src/types/texte.d.ts +0 -61
  227. package/lib/src/types/texte.js +0 -16
  228. package/lib/src/validators/config.d.ts +0 -1
  229. package/lib/src/validators/config.js +0 -54
  230. package/lib/src/validators/senat.d.ts +0 -0
  231. package/lib/src/validators/senat.js +0 -24
  232. package/lib/strings.mjs +0 -18
  233. package/lib/strings.ts +0 -26
  234. package/lib/types/ameli.mjs +0 -13
  235. package/lib/types/ameli.ts +0 -21
  236. package/lib/types/debats.mjs +0 -2
  237. package/lib/types/debats.ts +0 -6
  238. package/lib/types/dosleg.mjs +0 -151
  239. package/lib/types/dosleg.ts +0 -284
  240. package/lib/types/questions.mjs +0 -1
  241. package/lib/types/questions.ts +0 -3
  242. package/lib/types/sens.mjs +0 -1
  243. package/lib/types/sens.ts +0 -12
  244. package/lib/types/sessions.mjs +0 -43
  245. package/lib/types/sessions.ts +0 -42
  246. package/lib/types/texte.mjs +0 -16
  247. package/lib/types/texte.ts +0 -76
  248. package/lib/typings/windows-1252.d.js +0 -2
  249. package/lib/typings/windows-1252.d.mjs +0 -2
  250. package/lib/typings/windows-1252.d.ts +0 -11
  251. package/lib/validators/config.mjs +0 -54
  252. package/lib/validators/config.ts +0 -79
  253. package/lib/validators/senat.mjs +0 -24
  254. package/lib/validators/senat.ts +0 -26
@@ -1,249 +0,0 @@
1
- import assert from "assert";
2
- import commandLineArgs from "command-line-args";
3
- import fs from "fs-extra";
4
- import path from "path";
5
- import { iterLoadSenatDossiersLegislatifsRapportUrls, iterLoadSenatDossiersLegislatifsTexteUrls, RAPPORT_FOLDER, TEXTE_FOLDER, TEXTE_ORIGINAL_FOLDER, TEXTE_TRANSFORMED_FOLDER, } from "../loaders";
6
- import { parseExposeDesMotifs, parseTexte, parseTexteFromFile } from "../model/texte";
7
- import { UNDEFINED_SESSION } from "./datautil";
8
- import { commonOptions } from "./shared/cli_helpers";
9
- import { ensureAndClearDir, fetchWithRetry, isOptionEmptyOrHasValue } from "./shared/util";
10
- const optionsDefinitions = [
11
- ...commonOptions,
12
- {
13
- help: "sessions of textes to retrieve; leave empty for all",
14
- multiple: true,
15
- name: "sessions",
16
- type: String,
17
- },
18
- {
19
- help: "parse and convert documents into JSON (textes only for now, requires format xml)",
20
- name: "parseDocuments",
21
- type: Boolean,
22
- },
23
- {
24
- alias: "F",
25
- help: "formats of documents to retrieve (xml/html/pdf for textes, html/pdf for rapports); leave empty for all",
26
- multiple: true,
27
- name: "formats",
28
- type: String,
29
- },
30
- {
31
- help: "types of documents to retrieve (textes/rapports); leave empty for all",
32
- multiple: true,
33
- name: "types",
34
- type: String,
35
- },
36
- {
37
- help: "force retrieve all documents, even already retrieved ones",
38
- name: "force",
39
- type: Boolean,
40
- },
41
- ];
42
- const options = commandLineArgs(optionsDefinitions);
43
- const textDecoder = new TextDecoder("utf8");
44
- async function retrieveDocument(documentUrl) {
45
- if (!options.silent) {
46
- console.log(`Retrieving document ${documentUrl}…`);
47
- }
48
- try {
49
- const response = await fetchWithRetry(documentUrl);
50
- if (!response.ok) {
51
- if (response.status === 404) {
52
- console.warn(`Texte ${documentUrl} not found`);
53
- }
54
- else {
55
- console.error(`An error occurred while retrieving texte ${documentUrl}: ${response.status}`);
56
- }
57
- return null;
58
- }
59
- return response.arrayBuffer();
60
- }
61
- catch (error) {
62
- console.error(error.message);
63
- return null;
64
- }
65
- }
66
- async function retrieveTextes(dataDir) {
67
- const textesDir = path.join(dataDir, TEXTE_FOLDER);
68
- fs.ensureDirSync(textesDir);
69
- const originalTextesDir = path.join(textesDir, TEXTE_ORIGINAL_FOLDER);
70
- const transformedTextesDir = path.join(textesDir, TEXTE_TRANSFORMED_FOLDER);
71
- if (options.parseDocuments) {
72
- ensureAndClearDir(transformedTextesDir);
73
- }
74
- let retrievedTextesCount = 0;
75
- const texteUrlsNotFoundOrError = [];
76
- const texteUrlsParseError = [];
77
- for (const session of options.sessions) {
78
- for (const { item: texteMetadata } of iterLoadSenatDossiersLegislatifsTexteUrls(dataDir, session)) {
79
- const texteDir = path.join(originalTextesDir, `${texteMetadata.session ?? UNDEFINED_SESSION}`, texteMetadata.name);
80
- fs.ensureDirSync(texteDir);
81
- let exposeDesMotifsContent = null;
82
- if (texteMetadata.url_expose_des_motifs) {
83
- if (!options.silent) {
84
- console.log("Retrieving exposé des motifs…");
85
- }
86
- const exposeDesMotifsPath = path.join(texteDir, `${texteMetadata.name}-expose.html`);
87
- exposeDesMotifsContent = await retrieveDocument(texteMetadata.url_expose_des_motifs.toString());
88
- if (!exposeDesMotifsContent) {
89
- continue;
90
- }
91
- fs.writeFileSync(exposeDesMotifsPath, Buffer.from(exposeDesMotifsContent));
92
- }
93
- if (isOptionEmptyOrHasValue(options.formats, "xml")) {
94
- const textePath = path.join(texteDir, `${texteMetadata.name}.xml`);
95
- let texteBuffer = null;
96
- if (!options.force && fs.existsSync(textePath)) {
97
- if (!options.silent) {
98
- console.info(`Already retrieved texte ${textePath}…`);
99
- }
100
- }
101
- else {
102
- texteBuffer = await retrieveDocument(texteMetadata.url_xml.toString());
103
- if (!texteBuffer) {
104
- texteUrlsNotFoundOrError.push(texteMetadata.url_xml);
105
- continue;
106
- }
107
- fs.writeFileSync(textePath, Buffer.from(texteBuffer));
108
- retrievedTextesCount++;
109
- }
110
- if (options.parseDocuments) {
111
- if (!options.silent) {
112
- console.log(`Parsing texte ${texteMetadata.name}.xml…`);
113
- }
114
- let parsedTexte = null;
115
- if (texteBuffer) {
116
- const texteXml = textDecoder.decode(texteBuffer);
117
- parsedTexte = parseTexte(texteXml);
118
- }
119
- else {
120
- parsedTexte = await parseTexteFromFile(textePath);
121
- }
122
- if (!parsedTexte) {
123
- texteUrlsParseError.push(texteMetadata.url_xml);
124
- continue;
125
- }
126
- if (exposeDesMotifsContent) {
127
- if (!options.silent) {
128
- console.log("Parsing exposé des motifs…");
129
- }
130
- const exposeDesMotifsHtml = textDecoder.decode(exposeDesMotifsContent);
131
- parsedTexte.exposeDesMotifs = parseExposeDesMotifs(exposeDesMotifsHtml);
132
- }
133
- const transformedTexteDir = path.join(transformedTextesDir, `${texteMetadata.session ?? UNDEFINED_SESSION}`, texteMetadata.name);
134
- fs.ensureDirSync(transformedTexteDir);
135
- fs.writeJSONSync(path.join(transformedTexteDir, `${texteMetadata.name}.json`), parsedTexte, { spaces: 2 });
136
- }
137
- }
138
- if (isOptionEmptyOrHasValue(options.formats, "html")) {
139
- const textePath = path.join(texteDir, `${texteMetadata.name}.html`);
140
- if (!options.force && fs.existsSync(textePath)) {
141
- if (!options.silent) {
142
- console.info(`Already retrieved texte ${textePath}…`);
143
- }
144
- }
145
- else {
146
- const texteBuffer = await retrieveDocument(texteMetadata.url_html.toString());
147
- if (!texteBuffer) {
148
- texteUrlsNotFoundOrError.push(texteMetadata.url_html);
149
- continue;
150
- }
151
- fs.writeFileSync(textePath, Buffer.from(texteBuffer));
152
- retrievedTextesCount++;
153
- }
154
- }
155
- if (isOptionEmptyOrHasValue(options.formats, "pdf")) {
156
- const textePath = path.join(texteDir, `${texteMetadata.name}.pdf`);
157
- if (!options.force && fs.existsSync(textePath)) {
158
- if (!options.silent) {
159
- console.info(`Already retrieved texte ${textePath}…`);
160
- }
161
- }
162
- else {
163
- const texteBuffer = await retrieveDocument(texteMetadata.url_pdf.toString());
164
- if (!texteBuffer) {
165
- texteUrlsNotFoundOrError.push(texteMetadata.url_pdf);
166
- continue;
167
- }
168
- fs.writeFileSync(textePath, Buffer.from(texteBuffer));
169
- retrievedTextesCount++;
170
- }
171
- }
172
- }
173
- }
174
- if (options.verbose) {
175
- console.log(`${retrievedTextesCount} textes retrieved`);
176
- console.log(`${texteUrlsNotFoundOrError.length} textes failed to be retrieved with URLs ${texteUrlsNotFoundOrError.join(", ")}`);
177
- if (options.parseDocuments) {
178
- console.log(`${texteUrlsParseError.length} textes failed to be parsed with URLs ${texteUrlsParseError.join(", ")}`);
179
- }
180
- }
181
- }
182
- async function retrieveRapports(dataDir) {
183
- const rapportsDir = path.join(dataDir, RAPPORT_FOLDER);
184
- fs.ensureDirSync(rapportsDir);
185
- let retrievedRapportsCount = 0;
186
- const rapportUrlsNotFoundOrError = [];
187
- for (const session of options.sessions) {
188
- for (const { item: rapportMetadata } of iterLoadSenatDossiersLegislatifsRapportUrls(dataDir, session)) {
189
- const rapportDir = path.join(rapportsDir, `${rapportMetadata.session ?? UNDEFINED_SESSION}`, rapportMetadata.name);
190
- fs.ensureDirSync(rapportDir);
191
- if (isOptionEmptyOrHasValue(options.formats, "html")) {
192
- const rapportPath = path.join(rapportDir, `${rapportMetadata.name}.html`);
193
- if (!options.force && fs.existsSync(rapportPath)) {
194
- if (!options.silent) {
195
- console.info(`Already retrieved rapport ${rapportPath}…`);
196
- }
197
- continue;
198
- }
199
- const rapportBuffer = await retrieveDocument(rapportMetadata.url_html.toString());
200
- if (!rapportBuffer) {
201
- rapportUrlsNotFoundOrError.push(rapportMetadata.url_html);
202
- continue;
203
- }
204
- fs.writeFileSync(rapportPath, Buffer.from(rapportBuffer));
205
- retrievedRapportsCount++;
206
- }
207
- if (isOptionEmptyOrHasValue(options.formats, "pdf")) {
208
- const rapportPath = path.join(rapportDir, `${rapportMetadata.name}.pdf`);
209
- if (!options.force && fs.existsSync(rapportPath)) {
210
- if (!options.silent) {
211
- console.info(`Already retrieved rapport ${rapportPath}…`);
212
- }
213
- continue;
214
- }
215
- const rapportBuffer = await retrieveDocument(rapportMetadata.url_pdf.toString());
216
- if (!rapportBuffer) {
217
- rapportUrlsNotFoundOrError.push(rapportMetadata.url_pdf);
218
- continue;
219
- }
220
- fs.writeFileSync(rapportPath, Buffer.from(rapportBuffer));
221
- retrievedRapportsCount++;
222
- }
223
- }
224
- }
225
- if (options.verbose) {
226
- console.log(`${retrievedRapportsCount} rapports retrieved`);
227
- console.log(`${rapportUrlsNotFoundOrError.length} rapports failed with URLs ${rapportUrlsNotFoundOrError.join(", ")}`);
228
- }
229
- }
230
- async function main() {
231
- const dataDir = options.dataDir;
232
- assert(dataDir, "Missing argument: data directory");
233
- console.time("documents processing time");
234
- if (isOptionEmptyOrHasValue(options.types, "textes")) {
235
- await retrieveTextes(dataDir);
236
- }
237
- if (isOptionEmptyOrHasValue(options.types, "rapports")) {
238
- await retrieveRapports(dataDir);
239
- }
240
- if (!options.silent) {
241
- console.timeEnd("documents processing time");
242
- }
243
- }
244
- main()
245
- .then(() => process.exit(0))
246
- .catch((error) => {
247
- console.log(error);
248
- process.exit(1);
249
- });
@@ -1,298 +0,0 @@
1
- import assert from "assert"
2
- import commandLineArgs from "command-line-args"
3
- import fs from "fs-extra"
4
- import path from "path"
5
-
6
- import {
7
- iterLoadSenatDossiersLegislatifsRapportUrls,
8
- iterLoadSenatDossiersLegislatifsTexteUrls,
9
- RAPPORT_FOLDER,
10
- TEXTE_FOLDER,
11
- TEXTE_ORIGINAL_FOLDER,
12
- TEXTE_TRANSFORMED_FOLDER,
13
- } from "../loaders"
14
- import { parseExposeDesMotifs, parseTexte, parseTexteFromFile } from "../model/texte"
15
- import { UNDEFINED_SESSION } from "./datautil"
16
- import { commonOptions } from "./shared/cli_helpers"
17
- import { ensureAndClearDir, fetchWithRetry, isOptionEmptyOrHasValue } from "./shared/util"
18
-
19
- const optionsDefinitions = [
20
- ...commonOptions,
21
- {
22
- help: "sessions of textes to retrieve; leave empty for all",
23
- multiple: true,
24
- name: "sessions",
25
- type: String,
26
- },
27
- {
28
- help: "parse and convert documents into JSON (textes only for now, requires format xml)",
29
- name: "parseDocuments",
30
- type: Boolean,
31
- },
32
- {
33
- alias: "F",
34
- help: "formats of documents to retrieve (xml/html/pdf for textes, html/pdf for rapports); leave empty for all",
35
- multiple: true,
36
- name: "formats",
37
- type: String,
38
- },
39
- {
40
- help: "types of documents to retrieve (textes/rapports); leave empty for all",
41
- multiple: true,
42
- name: "types",
43
- type: String,
44
- },
45
- {
46
- help: "force retrieve all documents, even already retrieved ones",
47
- name: "force",
48
- type: Boolean,
49
- },
50
- ]
51
- const options = commandLineArgs(optionsDefinitions)
52
-
53
- const textDecoder = new TextDecoder("utf8")
54
-
55
- async function retrieveDocument (documentUrl: string): Promise<ArrayBuffer | null> {
56
- if (!options.silent) {
57
- console.log(`Retrieving document ${documentUrl}…`)
58
- }
59
-
60
- try {
61
- const response = await fetchWithRetry(documentUrl)
62
- if (!response.ok) {
63
- if (response.status === 404) {
64
- console.warn(`Texte ${documentUrl} not found`)
65
- } else {
66
- console.error(`An error occurred while retrieving texte ${documentUrl}: ${response.status}`)
67
- }
68
- return null
69
- }
70
- return response.arrayBuffer()
71
- } catch (error: any) {
72
- console.error(error.message)
73
- return null
74
- }
75
- }
76
-
77
- async function retrieveTextes (dataDir: string) {
78
- const textesDir = path.join(dataDir, TEXTE_FOLDER)
79
- fs.ensureDirSync(textesDir)
80
-
81
- const originalTextesDir = path.join(textesDir, TEXTE_ORIGINAL_FOLDER)
82
- const transformedTextesDir = path.join(textesDir, TEXTE_TRANSFORMED_FOLDER)
83
- if (options.parseDocuments) {
84
- ensureAndClearDir(transformedTextesDir)
85
- }
86
-
87
- let retrievedTextesCount = 0
88
- const texteUrlsNotFoundOrError = []
89
- const texteUrlsParseError = []
90
-
91
- for (const session of options.sessions) {
92
- for (const { item: texteMetadata } of iterLoadSenatDossiersLegislatifsTexteUrls(dataDir, session)) {
93
- const texteDir = path.join(originalTextesDir, `${texteMetadata.session ?? UNDEFINED_SESSION}`, texteMetadata.name)
94
- fs.ensureDirSync(texteDir)
95
-
96
- let exposeDesMotifsContent = null
97
- if (texteMetadata.url_expose_des_motifs) {
98
- if (!options.silent) {
99
- console.log("Retrieving exposé des motifs…")
100
- }
101
-
102
- const exposeDesMotifsPath = path.join(texteDir, `${texteMetadata.name}-expose.html`)
103
-
104
- exposeDesMotifsContent = await retrieveDocument(texteMetadata.url_expose_des_motifs.toString())
105
- if (!exposeDesMotifsContent) {
106
- continue
107
- }
108
- fs.writeFileSync(exposeDesMotifsPath, Buffer.from(exposeDesMotifsContent))
109
- }
110
-
111
- if (isOptionEmptyOrHasValue(options.formats, "xml")) {
112
- const textePath = path.join(texteDir, `${texteMetadata.name}.xml`)
113
- let texteBuffer = null
114
-
115
- if (!options.force && fs.existsSync(textePath)) {
116
- if (!options.silent) {
117
- console.info(`Already retrieved texte ${textePath}…`)
118
- }
119
- } else {
120
- texteBuffer = await retrieveDocument(texteMetadata.url_xml.toString())
121
- if (!texteBuffer) {
122
- texteUrlsNotFoundOrError.push(texteMetadata.url_xml)
123
- continue
124
- }
125
- fs.writeFileSync(textePath, Buffer.from(texteBuffer))
126
- retrievedTextesCount++
127
- }
128
-
129
- if (options.parseDocuments) {
130
- if (!options.silent) {
131
- console.log(`Parsing texte ${texteMetadata.name}.xml…`)
132
- }
133
-
134
- let parsedTexte = null
135
- if (texteBuffer) {
136
- const texteXml = textDecoder.decode(texteBuffer)
137
- parsedTexte = parseTexte(texteXml)
138
- } else {
139
- parsedTexte = await parseTexteFromFile(textePath)
140
- }
141
-
142
- if (!parsedTexte) {
143
- texteUrlsParseError.push(texteMetadata.url_xml)
144
- continue
145
- }
146
-
147
- if (exposeDesMotifsContent) {
148
- if (!options.silent) {
149
- console.log("Parsing exposé des motifs…")
150
- }
151
-
152
- const exposeDesMotifsHtml = textDecoder.decode(exposeDesMotifsContent)
153
- parsedTexte.exposeDesMotifs = parseExposeDesMotifs(exposeDesMotifsHtml)
154
- }
155
-
156
- const transformedTexteDir
157
- = path.join(transformedTextesDir, `${texteMetadata.session ?? UNDEFINED_SESSION}`, texteMetadata.name)
158
- fs.ensureDirSync(transformedTexteDir)
159
-
160
- fs.writeJSONSync(path.join(transformedTexteDir, `${texteMetadata.name}.json`), parsedTexte, { spaces: 2 })
161
- }
162
- }
163
-
164
- if (isOptionEmptyOrHasValue(options.formats, "html")) {
165
- const textePath = path.join(texteDir, `${texteMetadata.name}.html`)
166
-
167
- if (!options.force && fs.existsSync(textePath)) {
168
- if (!options.silent) {
169
- console.info(`Already retrieved texte ${textePath}…`)
170
- }
171
- } else {
172
-
173
- const texteBuffer = await retrieveDocument(texteMetadata.url_html.toString())
174
- if (!texteBuffer) {
175
- texteUrlsNotFoundOrError.push(texteMetadata.url_html)
176
- continue
177
- }
178
- fs.writeFileSync(textePath, Buffer.from(texteBuffer))
179
- retrievedTextesCount++
180
- }
181
- }
182
-
183
- if (isOptionEmptyOrHasValue(options.formats, "pdf")) {
184
- const textePath = path.join(texteDir, `${texteMetadata.name}.pdf`)
185
-
186
- if (!options.force && fs.existsSync(textePath)) {
187
- if (!options.silent) {
188
- console.info(`Already retrieved texte ${textePath}…`)
189
- }
190
- } else {
191
-
192
- const texteBuffer = await retrieveDocument(texteMetadata.url_pdf.toString())
193
- if (!texteBuffer) {
194
- texteUrlsNotFoundOrError.push(texteMetadata.url_pdf)
195
- continue
196
- }
197
- fs.writeFileSync(textePath, Buffer.from(texteBuffer))
198
- retrievedTextesCount++
199
- }
200
- }
201
- }
202
- }
203
-
204
- if (options.verbose) {
205
- console.log(`${retrievedTextesCount} textes retrieved`)
206
- console.log(
207
- `${texteUrlsNotFoundOrError.length} textes failed to be retrieved with URLs ${texteUrlsNotFoundOrError.join(", ")}`
208
- )
209
- if (options.parseDocuments) {
210
- console.log(`${texteUrlsParseError.length} textes failed to be parsed with URLs ${texteUrlsParseError.join(", ")}`)
211
- }
212
- }
213
- }
214
-
215
- async function retrieveRapports (dataDir: string) {
216
- const rapportsDir = path.join(dataDir, RAPPORT_FOLDER)
217
- fs.ensureDirSync(rapportsDir)
218
-
219
- let retrievedRapportsCount = 0
220
- const rapportUrlsNotFoundOrError = []
221
-
222
- for (const session of options.sessions) {
223
- for (const { item: rapportMetadata } of iterLoadSenatDossiersLegislatifsRapportUrls(dataDir, session)) {
224
- const rapportDir = path.join(rapportsDir, `${rapportMetadata.session ?? UNDEFINED_SESSION}`, rapportMetadata.name)
225
- fs.ensureDirSync(rapportDir)
226
-
227
- if (isOptionEmptyOrHasValue(options.formats, "html")) {
228
- const rapportPath = path.join(rapportDir, `${rapportMetadata.name}.html`)
229
-
230
- if (!options.force && fs.existsSync(rapportPath)) {
231
- if (!options.silent) {
232
- console.info(`Already retrieved rapport ${rapportPath}…`)
233
- }
234
- continue
235
- }
236
-
237
- const rapportBuffer = await retrieveDocument(rapportMetadata.url_html.toString())
238
- if (!rapportBuffer) {
239
- rapportUrlsNotFoundOrError.push(rapportMetadata.url_html)
240
- continue
241
- }
242
- fs.writeFileSync(rapportPath, Buffer.from(rapportBuffer))
243
- retrievedRapportsCount++
244
- }
245
-
246
- if (isOptionEmptyOrHasValue(options.formats, "pdf")) {
247
- const rapportPath = path.join(rapportDir, `${rapportMetadata.name}.pdf`)
248
-
249
- if (!options.force && fs.existsSync(rapportPath)) {
250
- if (!options.silent) {
251
- console.info(`Already retrieved rapport ${rapportPath}…`)
252
- }
253
- continue
254
- }
255
-
256
- const rapportBuffer = await retrieveDocument(rapportMetadata.url_pdf.toString())
257
- if (!rapportBuffer) {
258
- rapportUrlsNotFoundOrError.push(rapportMetadata.url_pdf)
259
- continue
260
- }
261
- fs.writeFileSync(rapportPath, Buffer.from(rapportBuffer))
262
- retrievedRapportsCount++
263
- }
264
- }
265
- }
266
-
267
- if (options.verbose) {
268
- console.log(`${retrievedRapportsCount} rapports retrieved`)
269
- console.log(
270
- `${rapportUrlsNotFoundOrError.length} rapports failed with URLs ${rapportUrlsNotFoundOrError.join(", ")}`
271
- )
272
- }
273
- }
274
-
275
- async function main() {
276
- const dataDir = options.dataDir
277
- assert(dataDir, "Missing argument: data directory")
278
-
279
- console.time("documents processing time")
280
-
281
- if (isOptionEmptyOrHasValue(options.types, "textes")) {
282
- await retrieveTextes(dataDir)
283
- }
284
- if (isOptionEmptyOrHasValue(options.types, "rapports")) {
285
- await retrieveRapports(dataDir)
286
- }
287
-
288
- if (!options.silent) {
289
- console.timeEnd("documents processing time")
290
- }
291
- }
292
-
293
- main()
294
- .then(() => process.exit(0))
295
- .catch((error) => {
296
- console.log(error)
297
- process.exit(1)
298
- })