@tricoteuses/senat 1.3.1 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (253) hide show
  1. package/lib/config.d.ts +1 -0
  2. package/lib/config.js +14 -45
  3. package/lib/databases.js +86 -143
  4. package/lib/datasets.js +78 -83
  5. package/lib/index.d.ts +7 -4
  6. package/lib/index.js +42 -419
  7. package/lib/loaders.js +149 -654
  8. package/lib/model/ameli.js +83 -21
  9. package/lib/model/debats.js +0 -1
  10. package/lib/model/dosleg.d.ts +1 -1
  11. package/lib/model/dosleg.js +179 -73
  12. package/lib/model/index.d.ts +3 -3
  13. package/lib/model/index.js +12 -46
  14. package/lib/model/questions.js +68 -39
  15. package/lib/model/sens.js +383 -113
  16. package/lib/model/texte.js +220 -290
  17. package/lib/model/util.js +9 -26
  18. package/lib/raw_types/ameli.js +5 -6
  19. package/lib/raw_types/debats.js +5 -6
  20. package/lib/raw_types/dosleg.js +5 -6
  21. package/lib/raw_types/questions.js +5 -6
  22. package/lib/raw_types/sens.js +5 -6
  23. package/lib/raw_types_schemats/ameli.js +1 -43
  24. package/lib/raw_types_schemats/debats.js +1 -22
  25. package/lib/raw_types_schemats/dosleg.js +1 -96
  26. package/lib/raw_types_schemats/questions.js +1 -22
  27. package/lib/raw_types_schemats/sens.js +1 -112
  28. package/lib/scripts/convert_data.js +181 -631
  29. package/lib/scripts/datautil.js +17 -60
  30. package/lib/scripts/parse_textes.js +46 -129
  31. package/lib/scripts/retrieve_documents.js +247 -513
  32. package/lib/scripts/retrieve_open_data.js +211 -368
  33. package/lib/scripts/retrieve_senateurs_photos.js +144 -239
  34. package/lib/scripts/shared/cli_helpers.js +30 -30
  35. package/lib/scripts/shared/util.js +28 -94
  36. package/lib/strings.js +20 -45
  37. package/lib/types/ameli.d.ts +1 -1
  38. package/lib/types/ameli.js +14 -25
  39. package/lib/types/debats.d.ts +1 -1
  40. package/lib/types/debats.js +3 -21
  41. package/lib/types/dosleg.d.ts +1 -1
  42. package/lib/types/dosleg.js +152 -119
  43. package/lib/types/questions.d.ts +1 -1
  44. package/lib/types/questions.js +1 -13
  45. package/lib/types/sens.d.ts +1 -1
  46. package/lib/types/sens.js +1 -13
  47. package/lib/types/sessions.js +44 -49
  48. package/lib/types/texte.js +17 -22
  49. package/lib/validators/config.js +47 -111
  50. package/lib/validators/senat.js +1 -5
  51. package/package.json +16 -38
  52. package/lib/aggregates.d.ts +0 -52
  53. package/lib/aggregates.mjs +0 -930
  54. package/lib/aggregates.ts +0 -833
  55. package/lib/config.mjs +0 -16
  56. package/lib/config.ts +0 -26
  57. package/lib/data/legislatures.json +0 -38
  58. package/lib/databases.mjs +0 -57
  59. package/lib/databases.ts +0 -71
  60. package/lib/datasets.mjs +0 -78
  61. package/lib/datasets.ts +0 -118
  62. package/lib/fields.d.ts +0 -10
  63. package/lib/fields.mjs +0 -68
  64. package/lib/fields.ts +0 -29
  65. package/lib/index.mjs +0 -4
  66. package/lib/index.ts +0 -42
  67. package/lib/inserters.d.ts +0 -98
  68. package/lib/inserters.mjs +0 -500
  69. package/lib/inserters.ts +0 -521
  70. package/lib/loaders.mjs +0 -158
  71. package/lib/loaders.ts +0 -271
  72. package/lib/model/ameli.mjs +0 -84
  73. package/lib/model/ameli.ts +0 -100
  74. package/lib/model/debats.mjs +0 -1
  75. package/lib/model/debats.ts +0 -0
  76. package/lib/model/dosleg.mjs +0 -196
  77. package/lib/model/dosleg.ts +0 -240
  78. package/lib/model/index.mjs +0 -4
  79. package/lib/model/index.ts +0 -14
  80. package/lib/model/questions.mjs +0 -71
  81. package/lib/model/questions.ts +0 -93
  82. package/lib/model/sens.mjs +0 -415
  83. package/lib/model/sens.ts +0 -516
  84. package/lib/model/texte.mjs +0 -208
  85. package/lib/model/texte.ts +0 -229
  86. package/lib/model/util.mjs +0 -19
  87. package/lib/model/util.ts +0 -32
  88. package/lib/raw_types/ameli.mjs +0 -5
  89. package/lib/raw_types/ameli.ts +0 -951
  90. package/lib/raw_types/debats.mjs +0 -5
  91. package/lib/raw_types/debats.ts +0 -222
  92. package/lib/raw_types/dosleg.mjs +0 -5
  93. package/lib/raw_types/dosleg.ts +0 -3625
  94. package/lib/raw_types/questions.mjs +0 -5
  95. package/lib/raw_types/questions.ts +0 -427
  96. package/lib/raw_types/sens.mjs +0 -5
  97. package/lib/raw_types/sens.ts +0 -4499
  98. package/lib/raw_types_kysely/ameli.d.ts +0 -6
  99. package/lib/raw_types_kysely/ameli.mjs +0 -7
  100. package/lib/raw_types_kysely/ameli.ts +0 -6
  101. package/lib/raw_types_kysely/debats.d.ts +0 -6
  102. package/lib/raw_types_kysely/debats.mjs +0 -7
  103. package/lib/raw_types_kysely/debats.ts +0 -6
  104. package/lib/raw_types_kysely/dosleg.d.ts +0 -6
  105. package/lib/raw_types_kysely/dosleg.mjs +0 -7
  106. package/lib/raw_types_kysely/dosleg.ts +0 -6
  107. package/lib/raw_types_kysely/questions.d.ts +0 -6
  108. package/lib/raw_types_kysely/questions.mjs +0 -7
  109. package/lib/raw_types_kysely/questions.ts +0 -6
  110. package/lib/raw_types_kysely/sens.d.ts +0 -6
  111. package/lib/raw_types_kysely/sens.mjs +0 -7
  112. package/lib/raw_types_kysely/sens.ts +0 -6
  113. package/lib/raw_types_kysely/texte.d.ts +0 -45
  114. package/lib/raw_types_kysely/texte.mjs +0 -7
  115. package/lib/raw_types_kysely/texte.ts +0 -53
  116. package/lib/raw_types_schemats/ameli.mjs +0 -2
  117. package/lib/raw_types_schemats/ameli.ts +0 -601
  118. package/lib/raw_types_schemats/debats.mjs +0 -2
  119. package/lib/raw_types_schemats/debats.ts +0 -145
  120. package/lib/raw_types_schemats/dosleg.mjs +0 -2
  121. package/lib/raw_types_schemats/dosleg.ts +0 -2195
  122. package/lib/raw_types_schemats/questions.mjs +0 -2
  123. package/lib/raw_types_schemats/questions.ts +0 -251
  124. package/lib/raw_types_schemats/sens.mjs +0 -2
  125. package/lib/raw_types_schemats/sens.ts +0 -2907
  126. package/lib/scripts/convert_data.mjs +0 -181
  127. package/lib/scripts/convert_data.ts +0 -243
  128. package/lib/scripts/datautil.mjs +0 -16
  129. package/lib/scripts/datautil.ts +0 -19
  130. package/lib/scripts/images/transparent_150x192.jpg +0 -0
  131. package/lib/scripts/images/transparent_155x225.jpg +0 -0
  132. package/lib/scripts/parse_textes.mjs +0 -46
  133. package/lib/scripts/parse_textes.ts +0 -65
  134. package/lib/scripts/retrieve_documents.mjs +0 -249
  135. package/lib/scripts/retrieve_documents.ts +0 -298
  136. package/lib/scripts/retrieve_open_data.mjs +0 -217
  137. package/lib/scripts/retrieve_open_data.ts +0 -274
  138. package/lib/scripts/retrieve_senateurs_photos.mjs +0 -147
  139. package/lib/scripts/retrieve_senateurs_photos.ts +0 -177
  140. package/lib/scripts/retrieve_textes.d.ts +0 -1
  141. package/lib/scripts/retrieve_textes.mjs +0 -328
  142. package/lib/scripts/retrieve_textes.ts +0 -143
  143. package/lib/scripts/shared/cli_helpers.ts +0 -36
  144. package/lib/scripts/shared/util.ts +0 -33
  145. package/lib/src/aggregates.d.ts +0 -52
  146. package/lib/src/aggregates.mjs +0 -726
  147. package/lib/src/config.d.ts +0 -2
  148. package/lib/src/config.mjs +0 -16
  149. package/lib/src/databases.d.ts +0 -18
  150. package/lib/src/databases.mjs +0 -55
  151. package/lib/src/datasets.d.ts +0 -28
  152. package/lib/src/datasets.mjs +0 -78
  153. package/lib/src/fields.d.ts +0 -10
  154. package/lib/src/fields.mjs +0 -22
  155. package/lib/src/index.d.ts +0 -8
  156. package/lib/src/index.mjs +0 -7
  157. package/lib/src/inserters.d.ts +0 -98
  158. package/lib/src/inserters.mjs +0 -360
  159. package/lib/src/loaders.d.ts +0 -36
  160. package/lib/src/loaders.mjs +0 -107
  161. package/lib/src/model/ameli.d.ts +0 -4
  162. package/lib/src/model/ameli.js +0 -57
  163. package/lib/src/model/debats.d.ts +0 -4
  164. package/lib/src/model/debats.js +0 -43
  165. package/lib/src/model/dosleg.d.ts +0 -197
  166. package/lib/src/model/dosleg.js +0 -169
  167. package/lib/src/model/index.d.ts +0 -4
  168. package/lib/src/model/index.js +0 -4
  169. package/lib/src/model/questions.d.ts +0 -89
  170. package/lib/src/model/questions.js +0 -76
  171. package/lib/src/model/sens.d.ts +0 -390
  172. package/lib/src/model/sens.js +0 -339
  173. package/lib/src/model/texte.d.ts +0 -7
  174. package/lib/src/model/texte.js +0 -183
  175. package/lib/src/raw_types_kysely/ameli.d.ts +0 -915
  176. package/lib/src/raw_types_kysely/ameli.js +0 -5
  177. package/lib/src/raw_types_kysely/debats.d.ts +0 -207
  178. package/lib/src/raw_types_kysely/debats.js +0 -5
  179. package/lib/src/raw_types_kysely/dosleg.d.ts +0 -3532
  180. package/lib/src/raw_types_kysely/dosleg.js +0 -5
  181. package/lib/src/raw_types_kysely/questions.d.ts +0 -414
  182. package/lib/src/raw_types_kysely/questions.js +0 -5
  183. package/lib/src/raw_types_kysely/sens.d.ts +0 -4394
  184. package/lib/src/raw_types_kysely/sens.js +0 -5
  185. package/lib/src/raw_types_schemats/ameli.d.ts +0 -541
  186. package/lib/src/raw_types_schemats/ameli.js +0 -2
  187. package/lib/src/raw_types_schemats/debats.d.ts +0 -127
  188. package/lib/src/raw_types_schemats/debats.js +0 -2
  189. package/lib/src/raw_types_schemats/dosleg.d.ts +0 -2027
  190. package/lib/src/raw_types_schemats/dosleg.js +0 -2
  191. package/lib/src/raw_types_schemats/questions.d.ts +0 -231
  192. package/lib/src/raw_types_schemats/questions.js +0 -2
  193. package/lib/src/raw_types_schemats/sens.d.ts +0 -2709
  194. package/lib/src/raw_types_schemats/sens.js +0 -2
  195. package/lib/src/scripts/convert_data.d.ts +0 -1
  196. package/lib/src/scripts/convert_data.js +0 -95
  197. package/lib/src/scripts/datautil.d.ts +0 -5
  198. package/lib/src/scripts/datautil.js +0 -16
  199. package/lib/src/scripts/parse_textes.d.ts +0 -1
  200. package/lib/src/scripts/parse_textes.js +0 -47
  201. package/lib/src/scripts/retrieve_documents.d.ts +0 -1
  202. package/lib/src/scripts/retrieve_documents.js +0 -258
  203. package/lib/src/scripts/retrieve_open_data.d.ts +0 -1
  204. package/lib/src/scripts/retrieve_open_data.js +0 -214
  205. package/lib/src/scripts/retrieve_senateurs_photos.d.ts +0 -1
  206. package/lib/src/scripts/retrieve_senateurs_photos.js +0 -147
  207. package/lib/src/scripts/shared/cli_helpers.d.ts +0 -44
  208. package/lib/src/scripts/shared/cli_helpers.js +0 -32
  209. package/lib/src/scripts/shared/util.d.ts +0 -3
  210. package/lib/src/scripts/shared/util.js +0 -28
  211. package/lib/src/strings.d.ts +0 -1
  212. package/lib/src/strings.mjs +0 -18
  213. package/lib/src/types/ameli.d.ts +0 -10
  214. package/lib/src/types/ameli.js +0 -13
  215. package/lib/src/types/debats.d.ts +0 -4
  216. package/lib/src/types/debats.js +0 -2
  217. package/lib/src/types/dosleg.d.ts +0 -98
  218. package/lib/src/types/dosleg.js +0 -151
  219. package/lib/src/types/questions.d.ts +0 -2
  220. package/lib/src/types/questions.js +0 -1
  221. package/lib/src/types/sens.d.ts +0 -10
  222. package/lib/src/types/sens.js +0 -1
  223. package/lib/src/types/sessions.d.ts +0 -42
  224. package/lib/src/types/sessions.js +0 -43
  225. package/lib/src/types/texte.d.ts +0 -61
  226. package/lib/src/types/texte.js +0 -16
  227. package/lib/src/validators/config.d.ts +0 -1
  228. package/lib/src/validators/config.js +0 -54
  229. package/lib/src/validators/senat.d.ts +0 -0
  230. package/lib/src/validators/senat.js +0 -24
  231. package/lib/strings.mjs +0 -18
  232. package/lib/strings.ts +0 -26
  233. package/lib/types/ameli.mjs +0 -13
  234. package/lib/types/ameli.ts +0 -21
  235. package/lib/types/debats.mjs +0 -2
  236. package/lib/types/debats.ts +0 -6
  237. package/lib/types/dosleg.mjs +0 -151
  238. package/lib/types/dosleg.ts +0 -284
  239. package/lib/types/questions.mjs +0 -1
  240. package/lib/types/questions.ts +0 -3
  241. package/lib/types/sens.mjs +0 -1
  242. package/lib/types/sens.ts +0 -12
  243. package/lib/types/sessions.mjs +0 -43
  244. package/lib/types/sessions.ts +0 -42
  245. package/lib/types/texte.mjs +0 -16
  246. package/lib/types/texte.ts +0 -76
  247. package/lib/typings/windows-1252.d.js +0 -2
  248. package/lib/typings/windows-1252.d.mjs +0 -2
  249. package/lib/typings/windows-1252.d.ts +0 -11
  250. package/lib/validators/config.mjs +0 -54
  251. package/lib/validators/config.ts +0 -79
  252. package/lib/validators/senat.mjs +0 -24
  253. package/lib/validators/senat.ts +0 -26
@@ -1,249 +0,0 @@
1
- import assert from "assert";
2
- import commandLineArgs from "command-line-args";
3
- import fs from "fs-extra";
4
- import path from "path";
5
- import { iterLoadSenatDossiersLegislatifsRapportUrls, iterLoadSenatDossiersLegislatifsTexteUrls, RAPPORT_FOLDER, TEXTE_FOLDER, TEXTE_ORIGINAL_FOLDER, TEXTE_TRANSFORMED_FOLDER, } from "../loaders";
6
- import { parseExposeDesMotifs, parseTexte, parseTexteFromFile } from "../model/texte";
7
- import { UNDEFINED_SESSION } from "./datautil";
8
- import { commonOptions } from "./shared/cli_helpers";
9
- import { ensureAndClearDir, fetchWithRetry, isOptionEmptyOrHasValue } from "./shared/util";
10
// Small builders for the two shapes of option this script declares, so each
// definition below reads as "name + help text".
const listOption = (name, help, extra = {}) => ({
  ...extra,
  help,
  multiple: true,
  name,
  type: String,
});
const flagOption = (name, help) => ({ help, name, type: Boolean });

/**
 * Command-line option definitions for this script: the shared options from
 * `./shared/cli_helpers` followed by the document-retrieval specific ones.
 */
const optionsDefinitions = [
  ...commonOptions,
  listOption("sessions", "sessions of textes to retrieve; leave empty for all"),
  flagOption(
    "parseDocuments",
    "parse and convert documents into JSON (textes only for now, requires format xml)",
  ),
  listOption(
    "formats",
    "formats of documents to retrieve (xml/html/pdf for textes, html/pdf for rapports); leave empty for all",
    { alias: "F" },
  ),
  listOption("types", "types of documents to retrieve (textes/rapports); leave empty for all"),
  flagOption("force", "force retrieve all documents, even already retrieved ones"),
];

// Parsed once at module load; every function in this script reads `options`.
const options = commandLineArgs(optionsDefinitions);

// Shared decoder used to turn downloaded bytes into text before parsing.
const textDecoder = new TextDecoder("utf8");
44
/**
 * Downloads a document and returns its raw bytes.
 *
 * Failures are logged instead of thrown: a non-OK HTTP status, an error raised
 * by `fetchWithRetry`, or an error while reading the response body all make
 * the function resolve to `null`.
 *
 * @param {string} documentUrl - URL of the document to download.
 * @returns {Promise<ArrayBuffer | null>} The response body, or `null` on failure.
 */
async function retrieveDocument(documentUrl) {
  if (!options.silent) {
    console.log(`Retrieving document ${documentUrl}…`);
  }
  try {
    const response = await fetchWithRetry(documentUrl);
    if (!response.ok) {
      if (response.status === 404) {
        console.warn(`Texte ${documentUrl} not found`);
      } else {
        console.error(`An error occurred while retrieving texte ${documentUrl}: ${response.status}`);
      }
      return null;
    }
    // `await` is required here: returning the bare promise would let a failed
    // body read escape this try block and reject the caller instead of being
    // reported and mapped to `null` like every other failure.
    return await response.arrayBuffer();
  } catch (error) {
    console.error(error.message);
    return null;
  }
}
66
/**
 * Downloads the textes referenced by the dossiers législatifs (XML, HTML and
 * PDF, depending on `options.formats`) below `<dataDir>/TEXTE_FOLDER`, and,
 * when `options.parseDocuments` is set, parses each XML texte and writes it as
 * JSON in the transformed folder.
 *
 * Each format is handled independently: a failed download or parse of one
 * format is recorded and reported, but no longer prevents the other formats
 * of the same texte from being retrieved.
 *
 * @param {string} dataDir - Root data directory.
 * @returns {Promise<void>}
 */
async function retrieveTextes(dataDir) {
  const textesDir = path.join(dataDir, TEXTE_FOLDER);
  fs.ensureDirSync(textesDir);
  const originalTextesDir = path.join(textesDir, TEXTE_ORIGINAL_FOLDER);
  const transformedTextesDir = path.join(textesDir, TEXTE_TRANSFORMED_FOLDER);
  if (options.parseDocuments) {
    ensureAndClearDir(transformedTextesDir);
  }

  let retrievedTextesCount = 0;
  const texteUrlsNotFoundOrError = [];
  const texteUrlsParseError = [];

  // Downloads `url` to `filePath` unless the file already exists (and --force
  // is off). Updates the counters above and reports what happened.
  const downloadUnlessPresent = async (url, filePath) => {
    if (!options.force && fs.existsSync(filePath)) {
      if (!options.silent) {
        console.info(`Already retrieved texte ${filePath}…`);
      }
      return { alreadyPresent: true, buffer: null };
    }
    const buffer = await retrieveDocument(url.toString());
    if (!buffer) {
      texteUrlsNotFoundOrError.push(url);
      return { alreadyPresent: false, buffer: null };
    }
    fs.writeFileSync(filePath, Buffer.from(buffer));
    retrievedTextesCount++;
    return { alreadyPresent: false, buffer };
  };

  // `--sessions` is optional ("leave empty for all"); iterating over an
  // undefined option used to throw a TypeError.
  // NOTE(review): assumes the loader treats an undefined session as
  // "all sessions" — confirm against ../loaders.
  const sessions = options.sessions?.length ? options.sessions : [undefined];

  for (const session of sessions) {
    for (const { item: texteMetadata } of iterLoadSenatDossiersLegislatifsTexteUrls(dataDir, session)) {
      const sessionFolder = `${texteMetadata.session ?? UNDEFINED_SESSION}`;
      const texteDir = path.join(originalTextesDir, sessionFolder, texteMetadata.name);
      fs.ensureDirSync(texteDir);

      let exposeDesMotifsContent = null;
      if (texteMetadata.url_expose_des_motifs) {
        if (!options.silent) {
          console.log("Retrieving exposé des motifs…");
        }
        exposeDesMotifsContent = await retrieveDocument(texteMetadata.url_expose_des_motifs.toString());
        if (exposeDesMotifsContent) {
          const exposeDesMotifsPath = path.join(texteDir, `${texteMetadata.name}-expose.html`);
          fs.writeFileSync(exposeDesMotifsPath, Buffer.from(exposeDesMotifsContent));
        } else {
          // A missing exposé is reported but no longer silently skips the
          // texte itself.
          texteUrlsNotFoundOrError.push(texteMetadata.url_expose_des_motifs);
        }
      }

      if (isOptionEmptyOrHasValue(options.formats, "xml")) {
        const textePath = path.join(texteDir, `${texteMetadata.name}.xml`);
        const { alreadyPresent, buffer: texteBuffer } = await downloadUnlessPresent(
          texteMetadata.url_xml,
          textePath,
        );
        const isXmlAvailable = alreadyPresent || texteBuffer !== null;

        if (isXmlAvailable && options.parseDocuments) {
          if (!options.silent) {
            console.log(`Parsing texte ${texteMetadata.name}.xml…`);
          }
          // Parse the fresh download when there is one, else the file on disk.
          const parsedTexte = texteBuffer
            ? parseTexte(textDecoder.decode(texteBuffer))
            : await parseTexteFromFile(textePath);

          if (!parsedTexte) {
            texteUrlsParseError.push(texteMetadata.url_xml);
          } else {
            if (exposeDesMotifsContent) {
              if (!options.silent) {
                console.log("Parsing exposé des motifs…");
              }
              const exposeDesMotifsHtml = textDecoder.decode(exposeDesMotifsContent);
              parsedTexte.exposeDesMotifs = parseExposeDesMotifs(exposeDesMotifsHtml);
            }
            const transformedTexteDir = path.join(transformedTextesDir, sessionFolder, texteMetadata.name);
            fs.ensureDirSync(transformedTexteDir);
            fs.writeJSONSync(path.join(transformedTexteDir, `${texteMetadata.name}.json`), parsedTexte, {
              spaces: 2,
            });
          }
        }
      }

      const plainDocuments = [
        { format: "html", url: texteMetadata.url_html },
        { format: "pdf", url: texteMetadata.url_pdf },
      ];
      for (const { format, url } of plainDocuments) {
        if (isOptionEmptyOrHasValue(options.formats, format)) {
          await downloadUnlessPresent(url, path.join(texteDir, `${texteMetadata.name}.${format}`));
        }
      }
    }
  }

  if (options.verbose) {
    console.log(`${retrievedTextesCount} textes retrieved`);
    console.log(`${texteUrlsNotFoundOrError.length} textes failed to be retrieved with URLs ${texteUrlsNotFoundOrError.join(", ")}`);
    if (options.parseDocuments) {
      console.log(`${texteUrlsParseError.length} textes failed to be parsed with URLs ${texteUrlsParseError.join(", ")}`);
    }
  }
}
182
/**
 * Downloads the rapports referenced by the dossiers législatifs (HTML and
 * PDF, depending on `options.formats`) below `<dataDir>/RAPPORT_FOLDER`.
 *
 * Formats are handled independently: an HTML file that is already present, or
 * that fails to download, no longer prevents the PDF of the same rapport from
 * being retrieved.
 *
 * @param {string} dataDir - Root data directory.
 * @returns {Promise<void>}
 */
async function retrieveRapports(dataDir) {
  const rapportsDir = path.join(dataDir, RAPPORT_FOLDER);
  fs.ensureDirSync(rapportsDir);

  let retrievedRapportsCount = 0;
  const rapportUrlsNotFoundOrError = [];

  // `--sessions` is optional ("leave empty for all"); iterating over an
  // undefined option used to throw a TypeError.
  // NOTE(review): assumes the loader treats an undefined session as
  // "all sessions" — confirm against ../loaders.
  const sessions = options.sessions?.length ? options.sessions : [undefined];

  for (const session of sessions) {
    for (const { item: rapportMetadata } of iterLoadSenatDossiersLegislatifsRapportUrls(dataDir, session)) {
      const rapportDir = path.join(
        rapportsDir,
        `${rapportMetadata.session ?? UNDEFINED_SESSION}`,
        rapportMetadata.name,
      );
      fs.ensureDirSync(rapportDir);

      const documents = [
        { format: "html", url: rapportMetadata.url_html },
        { format: "pdf", url: rapportMetadata.url_pdf },
      ];
      // Every `continue` below only moves on to the next format of the same
      // rapport, never to the next rapport.
      for (const { format, url } of documents) {
        if (!isOptionEmptyOrHasValue(options.formats, format)) {
          continue;
        }
        const rapportPath = path.join(rapportDir, `${rapportMetadata.name}.${format}`);
        if (!options.force && fs.existsSync(rapportPath)) {
          if (!options.silent) {
            console.info(`Already retrieved rapport ${rapportPath}…`);
          }
          continue;
        }
        const rapportBuffer = await retrieveDocument(url.toString());
        if (!rapportBuffer) {
          rapportUrlsNotFoundOrError.push(url);
          continue;
        }
        fs.writeFileSync(rapportPath, Buffer.from(rapportBuffer));
        retrievedRapportsCount++;
      }
    }
  }

  if (options.verbose) {
    console.log(`${retrievedRapportsCount} rapports retrieved`);
    console.log(`${rapportUrlsNotFoundOrError.length} rapports failed with URLs ${rapportUrlsNotFoundOrError.join(", ")}`);
  }
}
230
/**
 * Script entry point: checks that a data directory was given, then runs the
 * retrieval step of every document type selected by `options.types`, textes
 * first and rapports second.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const { dataDir } = options;
  assert(dataDir, "Missing argument: data directory");

  const timerLabel = "documents processing time";
  console.time(timerLabel);

  const retrieversByType = [
    ["textes", retrieveTextes],
    ["rapports", retrieveRapports],
  ];
  for (const [type, retrieve] of retrieversByType) {
    if (isOptionEmptyOrHasValue(options.types, type)) {
      await retrieve(dataDir);
    }
  }

  // The elapsed time is only printed when output is not silenced.
  if (!options.silent) {
    console.timeEnd(timerLabel);
  }
}
244
// Run the script and turn its outcome into the process exit code:
// 0 when `main` resolves, 1 (after logging the error) when it rejects.
function exitAfterSuccess() {
  process.exit(0);
}

function exitAfterFailure(error) {
  console.log(error);
  process.exit(1);
}

main().then(exitAfterSuccess).catch(exitAfterFailure);
@@ -1,298 +0,0 @@
1
- import assert from "assert"
2
- import commandLineArgs from "command-line-args"
3
- import fs from "fs-extra"
4
- import path from "path"
5
-
6
- import {
7
- iterLoadSenatDossiersLegislatifsRapportUrls,
8
- iterLoadSenatDossiersLegislatifsTexteUrls,
9
- RAPPORT_FOLDER,
10
- TEXTE_FOLDER,
11
- TEXTE_ORIGINAL_FOLDER,
12
- TEXTE_TRANSFORMED_FOLDER,
13
- } from "../loaders"
14
- import { parseExposeDesMotifs, parseTexte, parseTexteFromFile } from "../model/texte"
15
- import { UNDEFINED_SESSION } from "./datautil"
16
- import { commonOptions } from "./shared/cli_helpers"
17
- import { ensureAndClearDir, fetchWithRetry, isOptionEmptyOrHasValue } from "./shared/util"
18
-
19
// Small builders for the two shapes of option this script declares, so each
// definition below reads as "name + help text".
const listOption = (name: string, help: string, extra: { alias?: string } = {}) => ({
  ...extra,
  help,
  multiple: true,
  name,
  type: String,
})
const flagOption = (name: string, help: string) => ({ help, name, type: Boolean })

/**
 * Command-line option definitions for this script: the shared options from
 * `./shared/cli_helpers` followed by the document-retrieval specific ones.
 */
const optionsDefinitions = [
  ...commonOptions,
  listOption("sessions", "sessions of textes to retrieve; leave empty for all"),
  flagOption(
    "parseDocuments",
    "parse and convert documents into JSON (textes only for now, requires format xml)",
  ),
  listOption(
    "formats",
    "formats of documents to retrieve (xml/html/pdf for textes, html/pdf for rapports); leave empty for all",
    { alias: "F" },
  ),
  listOption("types", "types of documents to retrieve (textes/rapports); leave empty for all"),
  flagOption("force", "force retrieve all documents, even already retrieved ones"),
]

// Parsed once at module load; every function in this script reads `options`.
const options = commandLineArgs(optionsDefinitions)

// Shared decoder used to turn downloaded bytes into text before parsing.
const textDecoder = new TextDecoder("utf8")
54
-
55
/**
 * Downloads a document and returns its raw bytes.
 *
 * Failures are logged instead of thrown: a non-OK HTTP status, an error raised
 * by `fetchWithRetry`, or an error while reading the response body all make
 * the function resolve to `null`.
 *
 * @param documentUrl - URL of the document to download.
 * @returns The response body, or `null` on failure.
 */
async function retrieveDocument (documentUrl: string): Promise<ArrayBuffer | null> {
  if (!options.silent) {
    console.log(`Retrieving document ${documentUrl}…`)
  }

  try {
    const response = await fetchWithRetry(documentUrl)
    if (!response.ok) {
      if (response.status === 404) {
        console.warn(`Texte ${documentUrl} not found`)
      } else {
        console.error(`An error occurred while retrieving texte ${documentUrl}: ${response.status}`)
      }
      return null
    }
    // `await` is required here: returning the bare promise would let a failed
    // body read escape this try block and reject the caller instead of being
    // reported and mapped to `null` like every other failure.
    return await response.arrayBuffer()
  } catch (error: any) {
    console.error(error.message)
    return null
  }
}
76
-
77
/**
 * Downloads the textes referenced by the dossiers législatifs (XML, HTML and
 * PDF, depending on `options.formats`) below `<dataDir>/TEXTE_FOLDER`, and,
 * when `options.parseDocuments` is set, parses each XML texte and writes it as
 * JSON in the transformed folder.
 *
 * Each format is handled independently: a failed download or parse of one
 * format is recorded and reported, but no longer prevents the other formats
 * of the same texte from being retrieved.
 *
 * @param dataDir - Root data directory.
 */
async function retrieveTextes (dataDir: string) {
  const textesDir = path.join(dataDir, TEXTE_FOLDER)
  fs.ensureDirSync(textesDir)

  const originalTextesDir = path.join(textesDir, TEXTE_ORIGINAL_FOLDER)
  const transformedTextesDir = path.join(textesDir, TEXTE_TRANSFORMED_FOLDER)
  if (options.parseDocuments) {
    ensureAndClearDir(transformedTextesDir)
  }

  let retrievedTextesCount = 0
  const texteUrlsNotFoundOrError: unknown[] = []
  const texteUrlsParseError: unknown[] = []

  // Downloads `url` to `filePath` unless the file already exists (and --force
  // is off). Updates the counters above and reports what happened.
  const downloadUnlessPresent = async (
    url: { toString(): string },
    filePath: string,
  ): Promise<{ alreadyPresent: boolean, buffer: ArrayBuffer | null }> => {
    if (!options.force && fs.existsSync(filePath)) {
      if (!options.silent) {
        console.info(`Already retrieved texte ${filePath}…`)
      }
      return { alreadyPresent: true, buffer: null }
    }
    const buffer = await retrieveDocument(url.toString())
    if (!buffer) {
      texteUrlsNotFoundOrError.push(url)
      return { alreadyPresent: false, buffer: null }
    }
    fs.writeFileSync(filePath, Buffer.from(buffer))
    retrievedTextesCount++
    return { alreadyPresent: false, buffer }
  }

  // `--sessions` is optional ("leave empty for all"); iterating over an
  // undefined option used to throw a TypeError.
  // NOTE(review): assumes the loader treats an undefined session as
  // "all sessions" — confirm against ../loaders.
  const sessions = options.sessions?.length ? options.sessions : [undefined]

  for (const session of sessions) {
    for (const { item: texteMetadata } of iterLoadSenatDossiersLegislatifsTexteUrls(dataDir, session)) {
      const sessionFolder = `${texteMetadata.session ?? UNDEFINED_SESSION}`
      const texteDir = path.join(originalTextesDir, sessionFolder, texteMetadata.name)
      fs.ensureDirSync(texteDir)

      let exposeDesMotifsContent = null
      if (texteMetadata.url_expose_des_motifs) {
        if (!options.silent) {
          console.log("Retrieving exposé des motifs…")
        }

        exposeDesMotifsContent = await retrieveDocument(texteMetadata.url_expose_des_motifs.toString())
        if (exposeDesMotifsContent) {
          const exposeDesMotifsPath = path.join(texteDir, `${texteMetadata.name}-expose.html`)
          fs.writeFileSync(exposeDesMotifsPath, Buffer.from(exposeDesMotifsContent))
        } else {
          // A missing exposé is reported but no longer silently skips the
          // texte itself.
          texteUrlsNotFoundOrError.push(texteMetadata.url_expose_des_motifs)
        }
      }

      if (isOptionEmptyOrHasValue(options.formats, "xml")) {
        const textePath = path.join(texteDir, `${texteMetadata.name}.xml`)
        const { alreadyPresent, buffer: texteBuffer } = await downloadUnlessPresent(
          texteMetadata.url_xml,
          textePath,
        )
        const isXmlAvailable = alreadyPresent || texteBuffer !== null

        if (isXmlAvailable && options.parseDocuments) {
          if (!options.silent) {
            console.log(`Parsing texte ${texteMetadata.name}.xml…`)
          }

          // Parse the fresh download when there is one, else the file on disk.
          const parsedTexte = texteBuffer
            ? parseTexte(textDecoder.decode(texteBuffer))
            : await parseTexteFromFile(textePath)

          if (!parsedTexte) {
            texteUrlsParseError.push(texteMetadata.url_xml)
          } else {
            if (exposeDesMotifsContent) {
              if (!options.silent) {
                console.log("Parsing exposé des motifs…")
              }

              const exposeDesMotifsHtml = textDecoder.decode(exposeDesMotifsContent)
              parsedTexte.exposeDesMotifs = parseExposeDesMotifs(exposeDesMotifsHtml)
            }

            const transformedTexteDir = path.join(transformedTextesDir, sessionFolder, texteMetadata.name)
            fs.ensureDirSync(transformedTexteDir)

            fs.writeJSONSync(path.join(transformedTexteDir, `${texteMetadata.name}.json`), parsedTexte, { spaces: 2 })
          }
        }
      }

      const plainDocuments = [
        { format: "html", url: texteMetadata.url_html },
        { format: "pdf", url: texteMetadata.url_pdf },
      ]
      for (const { format, url } of plainDocuments) {
        if (isOptionEmptyOrHasValue(options.formats, format)) {
          await downloadUnlessPresent(url, path.join(texteDir, `${texteMetadata.name}.${format}`))
        }
      }
    }
  }

  if (options.verbose) {
    console.log(`${retrievedTextesCount} textes retrieved`)
    console.log(
      `${texteUrlsNotFoundOrError.length} textes failed to be retrieved with URLs ${texteUrlsNotFoundOrError.join(", ")}`
    )
    if (options.parseDocuments) {
      console.log(`${texteUrlsParseError.length} textes failed to be parsed with URLs ${texteUrlsParseError.join(", ")}`)
    }
  }
}
214
-
215
/**
 * Downloads the rapports referenced by the dossiers législatifs (HTML and
 * PDF, depending on `options.formats`) below `<dataDir>/RAPPORT_FOLDER`.
 *
 * Formats are handled independently: an HTML file that is already present, or
 * that fails to download, no longer prevents the PDF of the same rapport from
 * being retrieved.
 *
 * @param dataDir - Root data directory.
 */
async function retrieveRapports (dataDir: string) {
  const rapportsDir = path.join(dataDir, RAPPORT_FOLDER)
  fs.ensureDirSync(rapportsDir)

  let retrievedRapportsCount = 0
  const rapportUrlsNotFoundOrError = []

  // `--sessions` is optional ("leave empty for all"); iterating over an
  // undefined option used to throw a TypeError.
  // NOTE(review): assumes the loader treats an undefined session as
  // "all sessions" — confirm against ../loaders.
  const sessions = options.sessions?.length ? options.sessions : [undefined]

  for (const session of sessions) {
    for (const { item: rapportMetadata } of iterLoadSenatDossiersLegislatifsRapportUrls(dataDir, session)) {
      const rapportDir = path.join(rapportsDir, `${rapportMetadata.session ?? UNDEFINED_SESSION}`, rapportMetadata.name)
      fs.ensureDirSync(rapportDir)

      const documents = [
        { format: "html", url: rapportMetadata.url_html },
        { format: "pdf", url: rapportMetadata.url_pdf },
      ]
      // Every `continue` below only moves on to the next format of the same
      // rapport, never to the next rapport.
      for (const { format, url } of documents) {
        if (!isOptionEmptyOrHasValue(options.formats, format)) {
          continue
        }
        const rapportPath = path.join(rapportDir, `${rapportMetadata.name}.${format}`)

        if (!options.force && fs.existsSync(rapportPath)) {
          if (!options.silent) {
            console.info(`Already retrieved rapport ${rapportPath}…`)
          }
          continue
        }

        const rapportBuffer = await retrieveDocument(url.toString())
        if (!rapportBuffer) {
          rapportUrlsNotFoundOrError.push(url)
          continue
        }
        fs.writeFileSync(rapportPath, Buffer.from(rapportBuffer))
        retrievedRapportsCount++
      }
    }
  }

  if (options.verbose) {
    console.log(`${retrievedRapportsCount} rapports retrieved`)
    console.log(
      `${rapportUrlsNotFoundOrError.length} rapports failed with URLs ${rapportUrlsNotFoundOrError.join(", ")}`
    )
  }
}
274
-
275
/**
 * Script entry point: checks that a data directory was given, then runs the
 * retrieval step of every document type selected by `options.types`, textes
 * first and rapports second.
 */
async function main() {
  const { dataDir } = options
  assert(dataDir, "Missing argument: data directory")

  const timerLabel = "documents processing time"
  console.time(timerLabel)

  const retrievers = [
    { type: "textes", retrieve: retrieveTextes },
    { type: "rapports", retrieve: retrieveRapports },
  ]
  for (const { type, retrieve } of retrievers) {
    if (isOptionEmptyOrHasValue(options.types, type)) {
      await retrieve(dataDir)
    }
  }

  // The elapsed time is only printed when output is not silenced.
  if (!options.silent) {
    console.timeEnd(timerLabel)
  }
}
292
-
293
// Run the script and turn its outcome into the process exit code:
// 0 when `main` resolves, 1 (after logging the error) when it rejects.
function exitAfterSuccess () {
  process.exit(0)
}

function exitAfterFailure (error: unknown) {
  console.log(error)
  process.exit(1)
}

main().then(exitAfterSuccess).catch(exitAfterFailure)