@tricoteuses/senat 3.1.0 → 3.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (306)
  1. package/lib/src/loaders.d.ts +3 -3
  2. package/lib/src/loaders.js +1 -1
  3. package/lib/src/model/agenda.d.ts +1 -1
  4. package/lib/src/model/commission.d.ts +2 -2
  5. package/lib/src/model/seance.d.ts +1 -1
  6. package/lib/src/types/ameli.d.ts +4 -1761
  7. package/lib/src/types/ameli.js +1 -1074
  8. package/lib/src/types/debats.d.ts +2 -380
  9. package/lib/src/types/debats.js +1 -266
  10. package/lib/src/types/dosleg.d.ts +69 -2953
  11. package/lib/src/types/dosleg.js +1 -2005
  12. package/lib/src/types/questions.d.ts +2 -699
  13. package/lib/src/types/questions.js +1 -493
  14. package/lib/src/types/sens.d.ts +7 -7842
  15. package/lib/src/types/sens.js +1 -4691
  16. package/lib/src/utils/nvs-parsing.d.ts +1 -1
  17. package/lib/src/utils/nvs-parsing.js +9 -1
  18. package/lib/src/videos/pipeline.d.ts +3 -3
  19. package/lib/src/videos/pipeline.js +2 -2
  20. package/package.json +1 -1
  21. package/lib/add-js-extensions-v2.d.ts +0 -1
  22. package/lib/add-js-extensions-v2.js +0 -23
  23. package/lib/add-js-extensions.d.ts +0 -1
  24. package/lib/add-js-extensions.js +0 -17
  25. package/lib/aggregates.d.ts +0 -52
  26. package/lib/aggregates.js +0 -930
  27. package/lib/aggregates.mjs +0 -713
  28. package/lib/aggregates.ts +0 -833
  29. package/lib/config.d.ts +0 -10
  30. package/lib/config.js +0 -16
  31. package/lib/config.mjs +0 -16
  32. package/lib/config.ts +0 -26
  33. package/lib/databases.d.ts +0 -2
  34. package/lib/databases.js +0 -26
  35. package/lib/databases.mjs +0 -57
  36. package/lib/databases.ts +0 -71
  37. package/lib/datasets.d.ts +0 -34
  38. package/lib/datasets.js +0 -233
  39. package/lib/datasets.mjs +0 -78
  40. package/lib/datasets.ts +0 -118
  41. package/lib/fields.d.ts +0 -10
  42. package/lib/fields.js +0 -68
  43. package/lib/fields.mjs +0 -22
  44. package/lib/fields.ts +0 -29
  45. package/lib/git.d.ts +0 -26
  46. package/lib/git.js +0 -167
  47. package/lib/index.d.ts +0 -13
  48. package/lib/index.js +0 -1
  49. package/lib/index.mjs +0 -7
  50. package/lib/index.ts +0 -64
  51. package/lib/inserters.d.ts +0 -98
  52. package/lib/inserters.js +0 -500
  53. package/lib/inserters.mjs +0 -360
  54. package/lib/inserters.ts +0 -521
  55. package/lib/legislatures.json +0 -38
  56. package/lib/loaders.d.ts +0 -58
  57. package/lib/loaders.js +0 -286
  58. package/lib/loaders.mjs +0 -158
  59. package/lib/loaders.ts +0 -271
  60. package/lib/model/agenda.d.ts +0 -6
  61. package/lib/model/agenda.js +0 -148
  62. package/lib/model/ameli.d.ts +0 -51
  63. package/lib/model/ameli.js +0 -149
  64. package/lib/model/ameli.mjs +0 -84
  65. package/lib/model/ameli.ts +0 -100
  66. package/lib/model/commission.d.ts +0 -18
  67. package/lib/model/commission.js +0 -269
  68. package/lib/model/debats.d.ts +0 -67
  69. package/lib/model/debats.js +0 -95
  70. package/lib/model/debats.mjs +0 -43
  71. package/lib/model/debats.ts +0 -68
  72. package/lib/model/documents.d.ts +0 -12
  73. package/lib/model/documents.js +0 -151
  74. package/lib/model/dosleg.d.ts +0 -7
  75. package/lib/model/dosleg.js +0 -326
  76. package/lib/model/dosleg.mjs +0 -196
  77. package/lib/model/dosleg.ts +0 -240
  78. package/lib/model/index.d.ts +0 -7
  79. package/lib/model/index.js +0 -7
  80. package/lib/model/index.mjs +0 -5
  81. package/lib/model/index.ts +0 -15
  82. package/lib/model/questions.d.ts +0 -45
  83. package/lib/model/questions.js +0 -89
  84. package/lib/model/questions.mjs +0 -71
  85. package/lib/model/questions.ts +0 -93
  86. package/lib/model/scrutins.d.ts +0 -13
  87. package/lib/model/scrutins.js +0 -114
  88. package/lib/model/seance.d.ts +0 -3
  89. package/lib/model/seance.js +0 -267
  90. package/lib/model/sens.d.ts +0 -146
  91. package/lib/model/sens.js +0 -454
  92. package/lib/model/sens.mjs +0 -415
  93. package/lib/model/sens.ts +0 -516
  94. package/lib/model/texte.d.ts +0 -7
  95. package/lib/model/texte.js +0 -256
  96. package/lib/model/texte.mjs +0 -208
  97. package/lib/model/texte.ts +0 -229
  98. package/lib/model/util.d.ts +0 -9
  99. package/lib/model/util.js +0 -38
  100. package/lib/model/util.mjs +0 -19
  101. package/lib/model/util.ts +0 -32
  102. package/lib/parsers/texte.d.ts +0 -7
  103. package/lib/parsers/texte.js +0 -228
  104. package/lib/raw_types/ameli.d.ts +0 -914
  105. package/lib/raw_types/ameli.js +0 -5
  106. package/lib/raw_types/ameli.mjs +0 -163
  107. package/lib/raw_types/debats.d.ts +0 -207
  108. package/lib/raw_types/debats.js +0 -5
  109. package/lib/raw_types/debats.mjs +0 -58
  110. package/lib/raw_types/dosleg.d.ts +0 -1619
  111. package/lib/raw_types/dosleg.js +0 -5
  112. package/lib/raw_types/dosleg.mjs +0 -438
  113. package/lib/raw_types/questions.d.ts +0 -419
  114. package/lib/raw_types/questions.js +0 -5
  115. package/lib/raw_types/questions.mjs +0 -11
  116. package/lib/raw_types/senat.d.ts +0 -11368
  117. package/lib/raw_types/senat.js +0 -5
  118. package/lib/raw_types/sens.d.ts +0 -8248
  119. package/lib/raw_types/sens.js +0 -5
  120. package/lib/raw_types/sens.mjs +0 -508
  121. package/lib/raw_types_kysely/ameli.d.ts +0 -915
  122. package/lib/raw_types_kysely/ameli.js +0 -7
  123. package/lib/raw_types_kysely/ameli.mjs +0 -5
  124. package/lib/raw_types_kysely/ameli.ts +0 -951
  125. package/lib/raw_types_kysely/debats.d.ts +0 -207
  126. package/lib/raw_types_kysely/debats.js +0 -7
  127. package/lib/raw_types_kysely/debats.mjs +0 -5
  128. package/lib/raw_types_kysely/debats.ts +0 -222
  129. package/lib/raw_types_kysely/dosleg.d.ts +0 -3532
  130. package/lib/raw_types_kysely/dosleg.js +0 -7
  131. package/lib/raw_types_kysely/dosleg.mjs +0 -5
  132. package/lib/raw_types_kysely/dosleg.ts +0 -3621
  133. package/lib/raw_types_kysely/questions.d.ts +0 -414
  134. package/lib/raw_types_kysely/questions.js +0 -7
  135. package/lib/raw_types_kysely/questions.mjs +0 -5
  136. package/lib/raw_types_kysely/questions.ts +0 -426
  137. package/lib/raw_types_kysely/sens.d.ts +0 -4394
  138. package/lib/raw_types_kysely/sens.js +0 -7
  139. package/lib/raw_types_kysely/sens.mjs +0 -5
  140. package/lib/raw_types_kysely/sens.ts +0 -4499
  141. package/lib/raw_types_schemats/ameli.d.ts +0 -539
  142. package/lib/raw_types_schemats/ameli.js +0 -2
  143. package/lib/raw_types_schemats/ameli.mjs +0 -2
  144. package/lib/raw_types_schemats/ameli.ts +0 -601
  145. package/lib/raw_types_schemats/debats.d.ts +0 -127
  146. package/lib/raw_types_schemats/debats.js +0 -2
  147. package/lib/raw_types_schemats/debats.mjs +0 -2
  148. package/lib/raw_types_schemats/debats.ts +0 -145
  149. package/lib/raw_types_schemats/dosleg.d.ts +0 -977
  150. package/lib/raw_types_schemats/dosleg.js +0 -2
  151. package/lib/raw_types_schemats/dosleg.mjs +0 -2
  152. package/lib/raw_types_schemats/dosleg.ts +0 -2193
  153. package/lib/raw_types_schemats/questions.d.ts +0 -235
  154. package/lib/raw_types_schemats/questions.js +0 -2
  155. package/lib/raw_types_schemats/questions.mjs +0 -2
  156. package/lib/raw_types_schemats/questions.ts +0 -249
  157. package/lib/raw_types_schemats/sens.d.ts +0 -6915
  158. package/lib/raw_types_schemats/sens.js +0 -2
  159. package/lib/raw_types_schemats/sens.mjs +0 -2
  160. package/lib/raw_types_schemats/sens.ts +0 -2907
  161. package/lib/scripts/convert_data.d.ts +0 -1
  162. package/lib/scripts/convert_data.js +0 -354
  163. package/lib/scripts/convert_data.mjs +0 -181
  164. package/lib/scripts/convert_data.ts +0 -243
  165. package/lib/scripts/data-download.d.ts +0 -1
  166. package/lib/scripts/data-download.js +0 -12
  167. package/lib/scripts/datautil.d.ts +0 -8
  168. package/lib/scripts/datautil.js +0 -34
  169. package/lib/scripts/datautil.mjs +0 -16
  170. package/lib/scripts/datautil.ts +0 -19
  171. package/lib/scripts/images/transparent_150x192.jpg +0 -0
  172. package/lib/scripts/images/transparent_155x225.jpg +0 -0
  173. package/lib/scripts/parse_textes.d.ts +0 -1
  174. package/lib/scripts/parse_textes.js +0 -44
  175. package/lib/scripts/parse_textes.mjs +0 -46
  176. package/lib/scripts/parse_textes.ts +0 -65
  177. package/lib/scripts/retrieve_agenda.d.ts +0 -1
  178. package/lib/scripts/retrieve_agenda.js +0 -132
  179. package/lib/scripts/retrieve_cr_commission.d.ts +0 -1
  180. package/lib/scripts/retrieve_cr_commission.js +0 -364
  181. package/lib/scripts/retrieve_cr_seance.d.ts +0 -6
  182. package/lib/scripts/retrieve_cr_seance.js +0 -347
  183. package/lib/scripts/retrieve_documents.d.ts +0 -3
  184. package/lib/scripts/retrieve_documents.js +0 -219
  185. package/lib/scripts/retrieve_documents.mjs +0 -249
  186. package/lib/scripts/retrieve_documents.ts +0 -298
  187. package/lib/scripts/retrieve_open_data.d.ts +0 -1
  188. package/lib/scripts/retrieve_open_data.js +0 -315
  189. package/lib/scripts/retrieve_open_data.mjs +0 -217
  190. package/lib/scripts/retrieve_open_data.ts +0 -268
  191. package/lib/scripts/retrieve_senateurs_photos.d.ts +0 -1
  192. package/lib/scripts/retrieve_senateurs_photos.js +0 -147
  193. package/lib/scripts/retrieve_senateurs_photos.mjs +0 -147
  194. package/lib/scripts/retrieve_senateurs_photos.ts +0 -177
  195. package/lib/scripts/retrieve_videos.d.ts +0 -1
  196. package/lib/scripts/retrieve_videos.js +0 -461
  197. package/lib/scripts/shared/cli_helpers.d.ts +0 -95
  198. package/lib/scripts/shared/cli_helpers.js +0 -91
  199. package/lib/scripts/shared/cli_helpers.ts +0 -36
  200. package/lib/scripts/shared/util.d.ts +0 -4
  201. package/lib/scripts/shared/util.js +0 -35
  202. package/lib/scripts/shared/util.ts +0 -33
  203. package/lib/scripts/test_iter_load.d.ts +0 -1
  204. package/lib/scripts/test_iter_load.js +0 -12
  205. package/lib/src/ameli.d.ts +0 -66
  206. package/lib/src/ameli.js +0 -1
  207. package/lib/src/databases.d.ts +0 -3
  208. package/lib/src/databases.js +0 -26
  209. package/lib/src/db_types/ameli.d.ts +0 -1762
  210. package/lib/src/db_types/ameli.js +0 -1074
  211. package/lib/src/db_types/debats.d.ts +0 -380
  212. package/lib/src/db_types/debats.js +0 -266
  213. package/lib/src/db_types/dosleg.d.ts +0 -2954
  214. package/lib/src/db_types/dosleg.js +0 -2005
  215. package/lib/src/db_types/questions.d.ts +0 -699
  216. package/lib/src/db_types/questions.js +0 -493
  217. package/lib/src/db_types/sens.d.ts +0 -7843
  218. package/lib/src/db_types/sens.js +0 -4691
  219. package/lib/src/debats.d.ts +0 -38
  220. package/lib/src/debats.js +0 -1
  221. package/lib/src/dosleg.d.ts +0 -142
  222. package/lib/src/dosleg.js +0 -193
  223. package/lib/src/model/ameli_postgres.d.ts +0 -67
  224. package/lib/src/model/ameli_postgres.js +0 -150
  225. package/lib/src/other_types/questions.d.ts +0 -2
  226. package/lib/src/other_types/questions.js +0 -1
  227. package/lib/src/questions.d.ts +0 -53
  228. package/lib/src/questions.js +0 -1
  229. package/lib/src/raw_types/senat.d.ts +0 -11372
  230. package/lib/src/raw_types/senat.js +0 -5
  231. package/lib/src/rich_types/agenda.d.ts +0 -45
  232. package/lib/src/rich_types/agenda.js +0 -1
  233. package/lib/src/rich_types/compte_rendu.d.ts +0 -83
  234. package/lib/src/rich_types/compte_rendu.js +0 -1
  235. package/lib/src/rich_types/sessions.d.ts +0 -6
  236. package/lib/src/rich_types/sessions.js +0 -19
  237. package/lib/src/rich_types/texte.d.ts +0 -72
  238. package/lib/src/rich_types/texte.js +0 -15
  239. package/lib/src/scripts/test_iter_load.d.ts +0 -1
  240. package/lib/src/scripts/test_iter_load.js +0 -12
  241. package/lib/src/sens.d.ts +0 -104
  242. package/lib/src/sens.js +0 -1
  243. package/lib/strings.d.ts +0 -1
  244. package/lib/strings.js +0 -18
  245. package/lib/strings.mjs +0 -18
  246. package/lib/strings.ts +0 -26
  247. package/lib/tsconfig.tsbuildinfo +0 -1
  248. package/lib/types/agenda.d.ts +0 -44
  249. package/lib/types/agenda.js +0 -1
  250. package/lib/types/ameli.d.ts +0 -5
  251. package/lib/types/ameli.js +0 -1
  252. package/lib/types/ameli.mjs +0 -13
  253. package/lib/types/ameli.ts +0 -21
  254. package/lib/types/compte_rendu.d.ts +0 -83
  255. package/lib/types/compte_rendu.js +0 -1
  256. package/lib/types/debats.d.ts +0 -2
  257. package/lib/types/debats.js +0 -1
  258. package/lib/types/debats.mjs +0 -2
  259. package/lib/types/debats.ts +0 -6
  260. package/lib/types/dosleg.d.ts +0 -70
  261. package/lib/types/dosleg.js +0 -1
  262. package/lib/types/dosleg.mjs +0 -151
  263. package/lib/types/dosleg.ts +0 -284
  264. package/lib/types/questions.d.ts +0 -2
  265. package/lib/types/questions.js +0 -1
  266. package/lib/types/questions.mjs +0 -1
  267. package/lib/types/questions.ts +0 -3
  268. package/lib/types/sens.d.ts +0 -10
  269. package/lib/types/sens.js +0 -1
  270. package/lib/types/sens.mjs +0 -1
  271. package/lib/types/sens.ts +0 -12
  272. package/lib/types/sessions.d.ts +0 -5
  273. package/lib/types/sessions.js +0 -84
  274. package/lib/types/sessions.mjs +0 -43
  275. package/lib/types/sessions.ts +0 -42
  276. package/lib/types/texte.d.ts +0 -74
  277. package/lib/types/texte.js +0 -16
  278. package/lib/types/texte.mjs +0 -16
  279. package/lib/types/texte.ts +0 -76
  280. package/lib/typings/windows-1252.d.js +0 -2
  281. package/lib/typings/windows-1252.d.mjs +0 -2
  282. package/lib/typings/windows-1252.d.ts +0 -11
  283. package/lib/utils/cr_spliting.d.ts +0 -28
  284. package/lib/utils/cr_spliting.js +0 -265
  285. package/lib/utils/date.d.ts +0 -10
  286. package/lib/utils/date.js +0 -100
  287. package/lib/utils/nvs-timecode.d.ts +0 -7
  288. package/lib/utils/nvs-timecode.js +0 -79
  289. package/lib/utils/reunion_grouping.d.ts +0 -9
  290. package/lib/utils/reunion_grouping.js +0 -361
  291. package/lib/utils/reunion_odj_building.d.ts +0 -5
  292. package/lib/utils/reunion_odj_building.js +0 -154
  293. package/lib/utils/reunion_parsing.d.ts +0 -23
  294. package/lib/utils/reunion_parsing.js +0 -209
  295. package/lib/utils/scoring.d.ts +0 -14
  296. package/lib/utils/scoring.js +0 -147
  297. package/lib/utils/string_cleaning.d.ts +0 -7
  298. package/lib/utils/string_cleaning.js +0 -57
  299. package/lib/validators/config.d.ts +0 -9
  300. package/lib/validators/config.js +0 -10
  301. package/lib/validators/config.mjs +0 -54
  302. package/lib/validators/config.ts +0 -79
  303. package/lib/validators/senat.d.ts +0 -0
  304. package/lib/validators/senat.js +0 -28
  305. package/lib/validators/senat.mjs +0 -24
  306. package/lib/validators/senat.ts +0 -26
@@ -1,347 +0,0 @@
1
- /**
2
- * Must be run after the retrieve_agenda.ts script!
3
- * - downloads the ZIP of comptes-rendus des débats (CRI) from data.senat.fr
4
- * - extracts XML files, distributes them by session/year
5
- */
6
- import assert from "assert";
7
- import commandLineArgs from "command-line-args";
8
- import fs, { ensureDirSync } from "fs-extra";
9
- import path from "path";
10
- import StreamZip from "node-stream-zip";
11
- import * as cheerio from "cheerio";
12
- import { AGENDA_FOLDER, COMPTES_RENDUS_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER } from "../loaders";
13
- import { commonOptions } from "./shared/cli_helpers";
14
- import { parseCompteRenduIntervalFromFile, sessionStartYearFromDate } from "../model/seance";
15
- import { extractSommaireBlocks, makeReunionUid } from "../utils/reunion_parsing";
16
- import { getSessionsFromStart } from "../types/sessions";
17
- import { ensureAndClearDir, fetchWithRetry } from "./shared/util";
18
- import { isNoiseBlock, scoreSommaireBlockForEvent } from "../utils/scoring";
19
- import { parseYYYYMMDD } from "../utils/date";
20
// Location of the ZIP of comptes-rendus des débats (CRI) on data.senat.fr.
const CRI_ZIP_URL = "https://data.senat.fr/data/debats/cri.zip";

// Flag specific to this script; it is appended to the options shared by all scripts.
const parseDebatsOption = {
  name: "parseDebats",
  type: Boolean,
  help: "parse and convert comptes-rendus des débats into JSON",
};
const optionsDefinitions = [...commonOptions, parseDebatsOption];

// Command-line options, parsed once when the module is loaded.
const options = commandLineArgs(optionsDefinitions);
30
/**
 * Error raised when a compte-rendu resource cannot be retrieved.
 *
 * The message embeds both the URL and the failure detail; the URL is also
 * exposed as a property so callers can inspect it without parsing the message.
 */
class CompteRenduError extends Error {
  /**
   * @param {string} message - Failure detail (this file passes the HTTP status as a string).
   * @param {string} url - URL whose retrieval failed.
   */
  constructor(message, url) {
    super(`An error occurred while retrieving ${url}: ${message}`);
    // Without an explicit name, logs and stack traces show a generic "Error".
    this.name = "CompteRenduError";
    this.url = url;
  }
}
35
/**
 * Downloads the CRI archive from `CRI_ZIP_URL` and writes it to `zipPath`.
 *
 * A 404 answer is only reported with a warning: nothing is written and the
 * function returns normally. Any other non-OK status raises a CompteRenduError.
 * Progress messages are printed unless the `silent` command-line option is set.
 *
 * @param {string} zipPath - Destination file for the downloaded archive.
 * @returns {Promise<void>}
 */
async function downloadCriZip(zipPath) {
  const verbose = !options["silent"];
  if (verbose) {
    console.log(`Downloading CRI zip ${CRI_ZIP_URL}…`);
  }

  const response = await fetchWithRetry(CRI_ZIP_URL);
  if (!response.ok && response.status === 404) {
    console.warn(`CRI zip ${CRI_ZIP_URL} not found`);
    return;
  }
  if (!response.ok) {
    throw new CompteRenduError(String(response.status), CRI_ZIP_URL);
  }

  // The whole archive is buffered in memory before being written to disk.
  const payload = Buffer.from(await response.arrayBuffer());
  await fs.writeFile(zipPath, payload);

  if (verbose) {
    const sizeInMb = (payload.length / (1024 * 1024)).toFixed(1);
    console.log(`[CRI] Downloaded ${sizeInMb} MB → ${zipPath}`);
  }
}
53
/**
 * Extracts the daily CRI XML files of an archive into per-session folders.
 *
 * Only entries whose base name looks like `d<yyyymmdd>.xml` (case-insensitive)
 * are kept; the folder structure inside the archive is ignored. Each file goes
 * to `<originalRoot>/<session>/<base name>`, where the session is derived from
 * the date embedded in the file name.
 *
 * @param {string} zipPath - Path of the archive to read.
 * @param {string} originalRoot - Folder receiving one sub-folder per session.
 * @returns {Promise<number>} Number of XML files extracted.
 */
async function extractAndDistributeXmlBySession(zipPath, originalRoot) {
  const zip = new StreamZip.async({ file: zipPath });
  let count = 0;
  try {
    const entries = await zip.entries();
    for (const entryName of Object.keys(entries)) {
      if (!entryName.toLowerCase().endsWith(".xml")) {
        continue;
      }
      // ex: d20231005.xml
      const base = path.basename(entryName);
      const m = base.match(/^d(\d{8})\.xml$/i);
      if (!m) {
        continue;
      }
      const dt = parseYYYYMMDD(m[1]);
      if (!dt) {
        continue;
      }
      const session = sessionStartYearFromDate(dt);
      const destDir = path.join(originalRoot, String(session));
      await fs.ensureDir(destDir);
      await zip.extract(entryName, path.join(destDir, base));
      count++;
    }
  } finally {
    // Release the archive handle even when listing or extracting fails.
    await zip.close();
  }
  return count;
}
79
/**
 * Retrieves the comptes-rendus (CRI) XML dump and optionally converts it to JSON.
 *
 * Steps:
 *  1. prepare the "original" and "transformed" folders under
 *     `<dataDir>/<COMPTES_RENDUS_FOLDER>` (emptied unless `options.keepDir` is set);
 *  2. download the global archive to `<dataDir>/cri.zip`, remove the XML files
 *     of the selected sessions and extract the archive into per-session folders;
 *  3. when `options.parseDebats` is set, split each daily XML by agenda event,
 *     write one `CRSSN<yyyymmdd>E<eventId>.json` per event and link it into
 *     the matching agenda reunion file.
 *
 * If no archive is present after the download step (the download is skipped on
 * a 404), the existing XML files are left untouched instead of being deleted
 * before a failing extraction.
 *
 * @param {string} dataDir - Root data directory.
 * @param {object} [options] - Reads `keepDir`, `fromSession`, `parseDebats` and
 *   `only-recent` (a number of days).
 * @returns {Promise<void>}
 */
export async function retrieveCriXmlDump(dataDir, options = {}) {
  const root = path.join(dataDir, COMPTES_RENDUS_FOLDER);
  ensureDirSync(root);
  const originalRoot = path.join(root, DATA_ORIGINAL_FOLDER);
  const transformedRoot = path.join(root, DATA_TRANSFORMED_FOLDER);
  for (const dir of [originalRoot, transformedRoot]) {
    if (options["keepDir"]) {
      fs.ensureDirSync(dir);
    } else {
      // NOTE(review): ensureAndClearDir is not awaited here — confirm it is synchronous.
      ensureAndClearDir(dir);
    }
  }
  const sessions = getSessionsFromStart(options["fromSession"]);

  // 1) Download the global ZIP, then distribute its XML files by session.
  const zipPath = path.join(dataDir, "cri.zip");
  console.log("[CRI] Downloading global CRI zip…");
  await downloadCriZip(zipPath);
  if (await fs.pathExists(zipPath)) {
    console.log("[CRI] Extracting + distributing XMLs by session…");
    for (const session of sessions) {
      const dir = path.join(originalRoot, String(session));
      if (await fs.pathExists(dir)) {
        for (const f of await fs.readdir(dir)) {
          if (/\.xml$/i.test(f)) {
            await fs.remove(path.join(dir, f));
          }
        }
      }
    }
    const extractedCount = await extractAndDistributeXmlBySession(zipPath, originalRoot);
    if (extractedCount === 0) {
      console.warn("[CRI] No XML extracted. Archive empty or layout changed?");
    } else {
      console.log(`[CRI] Distributed ${extractedCount} XML file(s) into session folders.`);
    }
  } else {
    // Nothing to extract: keep whatever XML files are already on disk.
    console.warn(`[CRI] No archive found at ${zipPath} → keeping existing XML files.`);
  }

  if (!options["parseDebats"]) {
    console.log("[CRI] parseDebats not requested → done.");
    return;
  }

  // 2) Convert the daily XML files of each session.
  for (const session of sessions) {
    const originalSessionDir = path.join(originalRoot, String(session));
    if (!(await fs.pathExists(originalSessionDir))) {
      continue;
    }
    const xmlFiles = (await fs.readdir(originalSessionDir)).filter((f) => /^d\d{8}\.xml$/i.test(f)).sort();
    const transformedSessionDir = path.join(transformedRoot, String(session));
    await fs.ensureDir(transformedSessionDir);
    const now = Date.now();
    for (const f of xmlFiles) {
      const yyyymmdd = f.slice(1, 9);
      const recentDays = options["only-recent"];
      if (recentDays) {
        // Days older than the cutoff are not re-parsed when their JSON files
        // already exist; those files are only re-linked into the agenda.
        const cutoff = now - recentDays * 24 * 3600 * 1000;
        const seanceTs = Date.parse(`${yyyymmdd.slice(0, 4)}-${yyyymmdd.slice(4, 6)}-${yyyymmdd.slice(6, 8)}`);
        if (seanceTs < cutoff && (await relinkExistingCompteRendus(dataDir, transformedSessionDir, yyyymmdd, session))) {
          continue;
        }
      }
      await convertCriDay(dataDir, session, path.join(originalSessionDir, f), yyyymmdd, transformedSessionDir);
    }
  }
}

/**
 * Re-links the already converted comptes-rendus of one day into the agenda.
 *
 * @returns {Promise<boolean>} true when at least one JSON file existed for that day.
 */
async function relinkExistingCompteRendus(dataDir, transformedSessionDir, yyyymmdd, session) {
  const files = await fs.readdir(transformedSessionDir);
  const dayFiles = files.filter((fn) => fn.startsWith(`CRSSN${yyyymmdd}E`) && fn.endsWith(".json"));
  for (const fn of dayFiles) {
    const eventId = fn.match(/^CRSSN(\d{8})E(.+)\.json$/)?.[2];
    if (!eventId) {
      continue;
    }
    const crPath = path.join(transformedSessionDir, fn);
    try {
      const cr = await fs.readJSON(crPath);
      await linkCriEventIntoAgenda(dataDir, yyyymmdd, eventId, cr.uid, cr, session);
    } catch (e) {
      console.warn(`[CR] [${session}] Could not relink existing CR into a reunion for ${yyyymmdd} event=${eventId}:`, e);
    }
  }
  return dayFiles.length > 0;
}

/**
 * Splits one daily XML by agenda event, writes one JSON per event and links it.
 * Every "skip" situation is reported with a warning and ends the function early.
 */
async function convertCriDay(dataDir, session, xmlPath, yyyymmdd, transformedSessionDir) {
  // Load the day's "séance publique" events from the grouped agendas.
  const dayEvents = await loadAgendaSpEventsForDate(dataDir, yyyymmdd, session);
  if (dayEvents.length === 0) {
    console.warn(`[CRI] [${session}] No agenda SP events found for ${yyyymmdd} → skip split/link`);
    return;
  }

  // Read the XML and index every element of <body> by document order.
  let $;
  let order;
  let idx;
  try {
    const raw = await fs.readFile(xmlPath, "utf8");
    $ = cheerio.load(raw, { xml: false });
    order = $("body *").toArray();
    idx = new Map(order.map((el, i) => [el, i]));
  } catch (e) {
    console.warn(`[CRI] [${session}] Cannot read/parse ${path.basename(xmlPath)}:`, e);
    return;
  }

  // Extract the table of contents and match its blocks to agenda events.
  const blocks = extractSommaireBlocks($, idx);
  const intervals = buildIntervalsByAgendaEvents($, idx, order, blocks, dayEvents);
  if (!intervals.length) {
    console.warn(`[CRI] [${session}] No confident split intervals for ${yyyymmdd} → skip`);
    return;
  }

  // Parse, write and link each per-event segment.
  for (const iv of intervals) {
    const outPath = path.join(transformedSessionDir, `CRSSN${yyyymmdd}E${iv.agendaEventId}.json`);
    const cr = await parseCompteRenduIntervalFromFile(xmlPath, iv.startIndex, iv.endIndex, iv.agendaEventId);
    if (!cr) {
      console.warn(`[CRI] [${session}] Empty or no points for ${yyyymmdd} event=${iv.agendaEventId} → skip`);
      continue;
    }
    await fs.ensureDir(transformedSessionDir);
    await fs.writeJSON(outPath, cr, { spaces: 2 });
    try {
      await linkCriEventIntoAgenda(dataDir, yyyymmdd, iv.agendaEventId, cr.uid, cr, session);
    } catch (e) {
      console.warn(`[CR] [${session}] Could not link CR into agenda for ${yyyymmdd} event=${iv.agendaEventId}:`, e);
    }
  }
}
208
/**
 * Records the uid of a compte-rendu in the reunion file of a "SP" agenda event.
 *
 * The reunion file is looked up in
 * `<dataDir>/<AGENDA_FOLDER>/<DATA_TRANSFORMED_FOLDER>/<session>/<reunionUid>.json`
 * (the folder is created if needed). When the file is missing, unreadable or
 * holds a falsy value, a warning is printed and nothing is written.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string} yyyymmdd - Date of the séance, 8 digits.
 * @param {string} agendaEventId - Identifier of the agenda event.
 * @param {string} crUid - Uid stored in `compteRenduRefUid`.
 * @param {object} cr - Compte-rendu object (not read by this function).
 * @param {number|string} session - Session used as folder name.
 * @returns {Promise<void>}
 */
async function linkCriEventIntoAgenda(dataDir, yyyymmdd, agendaEventId, crUid, cr, session) {
  const reunionsDir = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, session.toString());
  fs.ensureDirSync(reunionsDir);

  const year = yyyymmdd.slice(0, 4);
  const month = yyyymmdd.slice(4, 6);
  const day = yyyymmdd.slice(6, 8);
  const reunionUid = makeReunionUid(`${year}-${month}-${day}`, "SP", agendaEventId, null);
  const reunionPath = path.join(reunionsDir, `${reunionUid}.json`);

  let reunion = null;
  const reunionFileExists = await fs.pathExists(reunionPath);
  if (reunionFileExists) {
    try {
      reunion = await fs.readJSON(reunionPath);
    } catch (e) {
      // An unreadable file is then handled like a missing one.
      console.warn(`[CR] unreadable reunion JSON → ${reunionPath} (${e})`);
      reunion = null;
    }
  }
  if (!reunion) {
    console.warn(`[CR] Missing reunion file for SP event=${agendaEventId}: ${reunionPath}`);
    return;
  }

  reunion.compteRenduRefUid = crUid;
  await fs.writeJSON(reunionPath, reunion, { spaces: 2 });
  console.log(`[CR] Linked CR ${crUid} → ${path.basename(reunionPath)} (event=${agendaEventId})`);
}
233
/**
 * Turns table-of-contents blocks into one index interval per agenda event.
 *
 * Each non-noise block is scored against every event of the day. A block
 * becomes a pivot only when its best score reaches MIN_SCORE and beats the
 * runner-up by at least MIN_GAP. A block whose target could not be resolved
 * and whose own position lies before the first `div.intervenant` is ignored.
 * Only the earliest pivot of each event is kept; an interval runs from its
 * pivot to just before the next pivot (or to the last element of `order`).
 *
 * @param {Function} $ - Query function over the parsed document.
 * @param {Map<object, number>} idx - Element → position in `order`.
 * @param {Array<object>} order - Elements in document order.
 * @param {Array<{text: string, targetId: *, startIndex: number}>} blocks - Table-of-contents blocks.
 * @param {Array<{id: *}>} dayEvents - Agenda events of the day.
 * @returns {Array<{agendaEventId: *, startIndex: number, endIndex: number, score: number}>}
 */
function buildIntervalsByAgendaEvents($, idx, order, blocks, dayEvents) {
  const MIN_SCORE = 0.65;
  const MIN_GAP = 0.08;
  const byStartIndex = (a, b) => a.startIndex - b.startIndex;

  const firstSpeech = $("div.intervenant").first()[0];
  const firstSpeechIdx = firstSpeech ? (idx.get(firstSpeech) ?? null) : null;

  // Best-scoring event for a block text, together with the runner-up score.
  const rankEvents = (text) => {
    let top = null;
    let runnerUp = 0;
    for (const ev of dayEvents) {
      const score = scoreSommaireBlockForEvent(text, ev);
      if (!top || score > top.score) {
        runnerUp = top?.score ?? runnerUp;
        top = { ev, score };
      } else if (score > runnerUp) {
        runnerUp = score;
      }
    }
    return { top, runnerUp };
  };

  const pivots = [];
  for (const block of blocks) {
    if (isNoiseBlock(block.text)) {
      continue;
    }
    const { top, runnerUp } = rankEvents(block.text);
    if (!top) {
      continue;
    }
    const resolved = resolveTargetIndex($, idx, block.targetId);
    const startIndex = resolved ?? block.startIndex;
    const beforeFirstSpeech = firstSpeechIdx != null && startIndex < firstSpeechIdx;
    if (beforeFirstSpeech && resolved == null) {
      continue;
    }
    if (top.score < MIN_SCORE || top.score - runnerUp < MIN_GAP) {
      continue;
    }
    pivots.push({ agendaEventId: top.ev.id, startIndex, score: top.score });
  }
  if (pivots.length === 0) {
    return [];
  }

  // De-duplicate per event, keeping the pivot with the smallest start index.
  const firstPivotByEvent = new Map();
  for (const pivot of pivots.sort(byStartIndex)) {
    if (!firstPivotByEvent.has(pivot.agendaEventId)) {
      firstPivotByEvent.set(pivot.agendaEventId, pivot);
    }
  }
  const ordered = [...firstPivotByEvent.values()].sort(byStartIndex);

  // Each interval stops right before the next pivot starts.
  return ordered.map((current, position) => {
    const following = ordered[position + 1];
    return {
      agendaEventId: current.agendaEventId,
      startIndex: current.startIndex,
      endIndex: following ? following.startIndex - 1 : order.length - 1,
      score: current.score,
    };
  });
}
305
/**
 * Loads the "Séance publique" events of one day from the transformed agendas.
 *
 * Reads every `RUSN<yyyymmdd>IDS*.json` file of
 * `<dataDir>/<AGENDA_FOLDER>/<DATA_TRANSFORMED_FOLDER>/<session>` and keeps the
 * first event of each file when its type is "Séance publique". A file that
 * cannot be read is reported with a warning and ignored, so one broken file
 * does not hide the other events of the day.
 *
 * @param {string} dataDir - Root data directory.
 * @param {string} yyyymmdd - Date of the séance, 8 digits.
 * @param {number|string} session - Session used as folder name.
 * @returns {Promise<Array<object>>} Matching events; empty when the folder does not exist.
 */
async function loadAgendaSpEventsForDate(dataDir, yyyymmdd, session) {
  const agendasDir = path.join(dataDir, AGENDA_FOLDER, DATA_TRANSFORMED_FOLDER, session.toString());
  if (!(await fs.pathExists(agendasDir))) {
    return [];
  }
  const files = (await fs.readdir(agendasDir)).filter((fn) => fn.startsWith(`RUSN${yyyymmdd}IDS`) && fn.endsWith(".json"));
  const events = [];
  for (const fn of files) {
    try {
      const reunion = await fs.readJSON(path.join(agendasDir, fn));
      const firstEvent = reunion?.events?.[0];
      if (firstEvent && firstEvent.type === "Séance publique") {
        events.push(firstEvent);
      }
    } catch (e) {
      // Still best-effort, but no longer silent: a dropped file explains a missing split.
      console.warn(`[CRI] [${session}] Unreadable agenda file ${fn} → ignored (${e})`);
    }
  }
  return events;
}
322
/**
 * Escapes a value for use inside a double-quoted CSS attribute selector:
 * every backslash and every double quote gets a backslash in front of it.
 *
 * @param {string} s - Raw value.
 * @returns {string} Escaped value.
 */
function cssEscapeIdent(s) {
  // Single pass: "$&" re-inserts the matched character after the added backslash.
  return s.replace(/[\\"]/g, "\\$&");
}
325
/**
 * Finds the position of the element referenced by an anchor target.
 *
 * The element is searched by `id` first, then by `name`; its position is
 * read from `idx`.
 *
 * @param {Function} $ - Query function over the parsed document.
 * @param {Map<object, number>} idx - Element → position.
 * @param {string|null|undefined} targetId - Anchor target to resolve.
 * @returns {number|null} The position, or null when the target is empty, no
 *   element matches, or the matched element is not present in `idx`.
 */
function resolveTargetIndex($, idx, targetId) {
  if (!targetId) {
    return null;
  }
  const escaped = cssEscapeIdent(targetId);
  // The `name` lookup only runs when the `id` lookup finds nothing.
  const node = $(`[id="${escaped}"]`)[0] || $(`[name="${escaped}"]`)[0];
  return node ? (idx.get(node) ?? null) : null;
}
335
/**
 * Script entry point: requires the `dataDir` command-line option, then runs
 * the CRI retrieval while timing it on the console.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const { dataDir } = options;
  assert(dataDir, "Missing argument: data directory");

  const timerLabel = "CRI processing time";
  console.time(timerLabel);
  await retrieveCriXmlDump(dataDir, options);
  console.timeEnd(timerLabel);
}

// Exit code 0 on success; on failure the error is printed and the exit code is 1.
const exitAfterFailure = (error) => {
  console.error(error);
  process.exit(1);
};
main()
  .then(() => process.exit(0))
  .catch(exitAfterFailure);
@@ -1,3 +0,0 @@
1
- import { DocumentMetadata } from "../types/texte";
2
/**
 * Processes one "texte" document described by `texteMetadata`.
 * NOTE(review): only the declaration is visible here. Judging by the parameter
 * names, it works with an "original" and a "transformed" textes directory —
 * confirm the exact behavior and the shape of `options` against the implementation.
 */
- export declare function processTexte(texteMetadata: DocumentMetadata, originalTextesDir: string, transformedTextesDir: string, options: any): Promise<void>;
3
/**
 * Processes one "rapport" document described by `rapportMetadata`.
 * NOTE(review): only the declaration is visible here; `rapportMetadata` and
 * `options` are typed `any` — confirm their expected shape against the implementation.
 */
- export declare function processRapport(rapportMetadata: any, originalRapportsDir: string, options: any): Promise<void>;
@@ -1,219 +0,0 @@
1
- import assert from "assert";
2
- import commandLineArgs from "command-line-args";
3
- import fs from "fs-extra";
4
- import { DateTime } from "luxon";
5
- import path from "path";
6
- import { DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER, iterLoadSenatRapportUrls, iterLoadSenatTexteUrls, RAPPORT_FOLDER, TEXTE_FOLDER, } from "../loaders";
7
- import { parseExposeDesMotifs, parseTexte, parseTexteFromFile } from "../parsers/texte";
8
- import { getSessionsFromStart, UNDEFINED_SESSION } from "../types/sessions";
9
- import { commonOptions } from "./shared/cli_helpers";
10
- import { ensureAndClearDir, fetchWithRetry, isOptionEmptyOrHasValue } from "./shared/util";
11
// Command-line options specific to document retrieval; they are appended to
// the shared `commonOptions`.
const formatsOption = {
  alias: "F",
  help: "formats of documents to retrieve (xml/html/pdf for textes, html/pdf for rapports); leave empty for all",
  multiple: true,
  name: "formats",
  type: String,
};
const typesOption = {
  help: "types of documents to retrieve (textes/rapports); leave empty for all",
  multiple: true,
  name: "types",
  type: String,
};
const forceOption = {
  help: "force retrieve all documents, even already retrieved ones",
  name: "force",
  type: Boolean,
};
const optionsDefinitions = [...commonOptions, formatsOption, typesOption, forceOption];

// Parsed command-line arguments, shared by the functions of this module.
const options = commandLineArgs(optionsDefinitions);

// Decoder used to turn downloaded buffers into text before parsing.
const textDecoder = new TextDecoder("utf8");

// Reference instant for "recent document" checks, captured once at module load.
const today = DateTime.now();
35
/**
 * Tells whether a document date lies within `daysThreshold` days of `today`.
 *
 * @param {string} [documentDate] - ISO date string of the document.
 * @param {number} daysThreshold - Maximum age, in days.
 * @returns {boolean} `false` when the date is missing or cannot be parsed.
 */
function isDocumentRecent(documentDate, daysThreshold) {
  if (!documentDate) {
    return false;
  }
  const parsedDate = DateTime.fromISO(documentDate);
  if (!parsedDate.isValid) {
    return false;
  }
  const ageInDays = today.diff(parsedDate, "days").days;
  return ageInDays <= daysThreshold;
}
41
/**
 * Decides whether a document has to be (re)downloaded.
 *
 * A download is needed when `options.force` is set or the target file does
 * not exist yet. For an existing file, it is needed only when
 * `options.onlyRecent` is defined and the document is recent enough.
 *
 * @param {string} filePath - Destination path of the document.
 * @param {string} [docDate] - ISO date of the document.
 * @param {{force?: boolean, onlyRecent?: number}} options - Download options.
 * @returns {boolean}
 */
function shouldDownload(filePath, docDate, options) {
  // `force` is tested first so the file system is not touched when it is set.
  if (options.force || !fs.existsSync(filePath)) {
    return true;
  }
  const { onlyRecent } = options;
  return onlyRecent === undefined ? false : isDocumentRecent(docDate, onlyRecent);
}
51
/**
 * Downloads a document and returns its raw content.
 *
 * Never rejects: any failure (network error, non-OK HTTP status, error while
 * reading the response body) results in `null`.
 *
 * @param {string} documentUrl - URL of the document.
 * @param {boolean} [verbose] - When truthy, logs progress and HTTP failures.
 * @returns {Promise<ArrayBuffer|null>} The body, or `null` on failure.
 */
async function downloadDocument(documentUrl, verbose) {
  if (verbose) {
    console.log(`Downloading document ${documentUrl}…`);
  }
  try {
    const response = await fetchWithRetry(documentUrl);
    if (response.ok) {
      // `await` is required here: without it, a failure while reading the
      // body would reject outside of this try/catch and reach the caller.
      return await response.arrayBuffer();
    }
    if (verbose) {
      if (response.status === 404) {
        console.warn(`Document ${documentUrl} not found`);
      } else {
        console.error(`An error occurred while retrieving document ${documentUrl}: ${response.status}`);
      }
    }
    return null;
  } catch (error) {
    console.error(error?.message ?? error);
    return null;
  }
}
77
/**
 * Downloads one document to `destPath` unless `shouldDownload` says it can be
 * skipped.
 *
 * @param {string} url - URL of the document.
 * @param {string} destPath - Where the document is written.
 * @param {string} [docDate] - ISO date of the document, used for the recency check.
 * @param {object} options - Download options (`force`, `onlyRecent`, `verbose`).
 * @returns {Promise<{success: boolean, skipped: boolean, buffer: Buffer|null}>}
 *   `skipped` is true when nothing was downloaded; `buffer` holds the content
 *   only when a download actually happened.
 */
async function processDocument(url, destPath, docDate, options) {
  const mustFetch = shouldDownload(destPath, docDate, options);
  if (!mustFetch) {
    if (options.verbose) {
      console.info(`Already downloaded ${destPath}…`);
    }
    return { success: true, skipped: true, buffer: null };
  }

  const payload = await downloadDocument(url, options.verbose);
  if (!payload) {
    return { success: false, skipped: false, buffer: null };
  }

  const buffer = Buffer.from(payload);
  await fs.outputFile(destPath, buffer);
  return { success: true, skipped: false, buffer };
}
91
/**
 * Retrieves every requested rendition of one texte and optionally parses it.
 *
 * Files go to `<originalTextesDir>/<session>/<name>/`. The exposé des motifs
 * is fetched first (when a URL exists); if its download was skipped and
 * parsing is requested, the copy already on disk is read instead. Then the
 * xml, html and pdf renditions are processed in that order, honouring
 * `options.formats`. Only the XML rendition is parsed, and only when
 * `options.parseDocuments` is set.
 *
 * @param {object} texteMetadata - Texte description (`name`, `session`, `date`, `url_*`).
 * @param {string} originalTextesDir - Root directory for downloaded files.
 * @param {string} transformedTextesDir - Root directory for parsed output.
 * @param {object} options - Download/parse options.
 * @returns {Promise<void>}
 */
export async function processTexte(texteMetadata, originalTextesDir, transformedTextesDir, options) {
  const { name, date, session } = texteMetadata;
  const texteDir = path.join(originalTextesDir, `${session ?? UNDEFINED_SESSION}`, name);

  let exposeDesMotifsContent = null;
  const exposeUrl = texteMetadata.url_expose_des_motifs;
  if (exposeUrl) {
    const exposePath = path.join(texteDir, `${name}-expose.html`);
    const { buffer, skipped } = await processDocument(exposeUrl.toString(), exposePath, date, options);
    if (buffer) {
      exposeDesMotifsContent = buffer;
    } else if (skipped && options.parseDocuments && (await fs.pathExists(exposePath))) {
      // Nothing was downloaded this time: reuse the copy already on disk.
      exposeDesMotifsContent = await fs.readFile(exposePath);
    }
  }

  // [format, URL, whether this rendition is the one that gets parsed]
  const renditions = [
    ["xml", texteMetadata.url_xml, true],
    ["html", texteMetadata.url_html, false],
    ["pdf", texteMetadata.url_pdf, false],
  ];
  for (const [type, url, isParseTarget] of renditions) {
    const isRequested = isOptionEmptyOrHasValue(options.formats, type);
    if (!isRequested || !url) {
      continue;
    }
    const destPath = path.join(texteDir, `${name}.${type}`);
    const { buffer } = await processDocument(url.toString(), destPath, date, options);
    if (isParseTarget && options.parseDocuments) {
      await parseDocument(session, transformedTextesDir, destPath, name, buffer, exposeDesMotifsContent, options);
    }
  }
}
124
/**
 * Retrieves the requested renditions (html, pdf) of one rapport.
 *
 * Files go to `<originalRapportsDir>/<session>/<name>/<name>.<format>`.
 * Formats excluded by `options.formats` and renditions without a URL are
 * skipped.
 *
 * @param {object} rapportMetadata - Rapport description (`name`, `session`, `date`, `url_html`, `url_pdf`).
 * @param {string} originalRapportsDir - Root directory for downloaded files.
 * @param {object} options - Download options.
 * @returns {Promise<void>}
 */
export async function processRapport(rapportMetadata, originalRapportsDir, options) {
  const rapportDir = path.join(
    originalRapportsDir,
    `${rapportMetadata.session ?? UNDEFINED_SESSION}`,
    rapportMetadata.name,
  );
  const formats = [
    { type: "html", url: rapportMetadata.url_html },
    { type: "pdf", url: rapportMetadata.url_pdf },
  ];
  for (const format of formats) {
    if (!isOptionEmptyOrHasValue(options.formats, format.type)) {
      continue;
    }
    // Same guard as in processTexte: a rendition without URL cannot be
    // fetched, and calling `.toString()` on it would throw.
    if (!format.url) {
      continue;
    }
    const destPath = path.join(rapportDir, `${rapportMetadata.name}.${format.type}`);
    await processDocument(format.url.toString(), destPath, rapportMetadata.date, options);
  }
}
137
/**
 * Retrieves the textes of every given session, one after the other.
 *
 * When the `parseDocuments` command-line option is set, the transformed
 * textes directory is passed to `ensureAndClearDir` before processing.
 *
 * @param {string} dataDir - Root data directory.
 * @param {Iterable} sessions - Sessions to process.
 * @returns {Promise<void>}
 */
async function retrieveTextes(dataDir, sessions) {
  const originalTextesDir = path.join(dataDir, TEXTE_FOLDER, DATA_ORIGINAL_FOLDER);
  const transformedTextesDir = path.join(dataDir, TEXTE_FOLDER, DATA_TRANSFORMED_FOLDER);
  const { force, silent, verbose, formats, parseDocuments } = options;

  if (parseDocuments) {
    ensureAndClearDir(transformedTextesDir);
  }

  // Options forwarded to the per-texte processing; the CLI flag
  // "only-recent" is exposed as `onlyRecent`.
  const dlOptions = {
    force,
    silent,
    verbose,
    onlyRecent: options["only-recent"],
    formats,
    parseDocuments,
  };

  for (const session of sessions) {
    for (const entry of iterLoadSenatTexteUrls(dataDir, session)) {
      await processTexte(entry.item, originalTextesDir, transformedTextesDir, dlOptions);
    }
  }
}
157
/**
 * Retrieves the rapports of every given session, one after the other.
 *
 * @param {string} dataDir - Root data directory.
 * @param {Iterable} sessions - Sessions to process.
 * @returns {Promise<void>}
 */
async function retrieveRapports(dataDir, sessions) {
  const originalRapportsDir = path.join(dataDir, RAPPORT_FOLDER, DATA_ORIGINAL_FOLDER);
  const { force, silent, verbose, formats } = options;

  // Options forwarded to the per-rapport processing; the CLI flag
  // "only-recent" is exposed as `onlyRecent`.
  const dlOptions = {
    force,
    silent,
    verbose,
    onlyRecent: options["only-recent"],
    formats,
  };

  for (const session of sessions) {
    for (const entry of iterLoadSenatRapportUrls(dataDir, session)) {
      await processRapport(entry.item, originalRapportsDir, dlOptions);
    }
  }
}
172
/**
 * Parses one texte and writes the result as
 * `<transformedTextesDir>/<session>/<texteName>/<texteName>.json`.
 *
 * The XML comes from `texteBuffer` when provided, otherwise from the file at
 * `textePath`. When an exposé des motifs is given, its parsed form is
 * attached to the result as `exposeDesMotifs`.
 *
 * @param {*} session - Session of the texte (a fallback folder name is used when nullish).
 * @param {string} transformedTextesDir - Root directory for parsed output.
 * @param {string} textePath - Path of the XML file on disk.
 * @param {string} texteName - Name of the texte.
 * @param {ArrayBuffer|Buffer|null} texteBuffer - XML content, if already in memory.
 * @param {ArrayBuffer|Buffer|null} [exposeDesMotifs] - HTML content of the exposé des motifs.
 * @param {object} [options] - Only `verbose` is read.
 * @returns {Promise<object|null>} The parsed texte, or `null` when parsing produced nothing.
 */
async function parseDocument(session, transformedTextesDir, textePath, texteName, texteBuffer, exposeDesMotifs = null, options = {}) {
  const isVerbose = options.verbose;
  if (isVerbose) {
    console.log(`Parsing texte ${textePath}…`);
  }

  const parsedTexte = texteBuffer
    ? parseTexte(textDecoder.decode(texteBuffer))
    : await parseTexteFromFile(textePath);
  if (!parsedTexte) {
    return null;
  }

  if (exposeDesMotifs) {
    if (isVerbose) {
      console.log("Parsing exposé des motifs…");
    }
    parsedTexte.exposeDesMotifs = parseExposeDesMotifs(textDecoder.decode(exposeDesMotifs));
  }

  const outputDir = path.join(transformedTextesDir, `${session ?? UNDEFINED_SESSION}`, texteName);
  const outputPath = path.join(outputDir, `${texteName}.json`);
  await fs.outputJSON(outputPath, parsedTexte, { spaces: 2 });
  return parsedTexte;
}
197
/**
 * Script entry point: retrieves textes and/or rapports, depending on the
 * `types` command-line option, for every session starting at `fromSession`.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const { dataDir, types } = options;
  assert(dataDir, "Missing argument: data directory");
  const sessions = getSessionsFromStart(options["fromSession"]);

  const timerLabel = "documents processing time";
  console.time(timerLabel);

  // Each document type is retrieved only when selected, textes first.
  const steps = [
    ["textes", retrieveTextes],
    ["rapports", retrieveRapports],
  ];
  for (const [type, retrieve] of steps) {
    if (isOptionEmptyOrHasValue(types, type)) {
      await retrieve(dataDir, sessions);
    }
  }

  if (!options["silent"]) {
    console.timeEnd(timerLabel);
  }
}
212
/**
 * Tells whether the given entry-script path designates this script, either
 * as TypeScript source (`retrieve_documents.ts`) or as compiled JavaScript
 * (`retrieve_documents.js`).
 *
 * @param {string} [scriptPath] - Usually `process.argv[1]`; may be undefined
 *   when Node is started without a script file.
 * @returns {boolean}
 */
function isRetrieveDocumentsEntryPoint(scriptPath) {
  return typeof scriptPath === "string" && /retrieve_documents\.(?:ts|js)$/.test(scriptPath);
}

// Run `main` only when this module is the script being executed, so that
// importing it for `processTexte` / `processRapport` has no side effect.
if (isRetrieveDocumentsEntryPoint(process.argv[1])) {
  main()
    .then(() => process.exit(0))
    .catch((error) => {
      // Failures go to stderr.
      console.error(error);
      process.exit(1);
    });
}