@tricoteuses/senat 3.1.0 → 3.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (306) hide show
  1. package/lib/src/loaders.d.ts +3 -3
  2. package/lib/src/loaders.js +1 -1
  3. package/lib/src/model/agenda.d.ts +1 -1
  4. package/lib/src/model/commission.d.ts +2 -2
  5. package/lib/src/model/seance.d.ts +1 -1
  6. package/lib/src/types/ameli.d.ts +4 -1761
  7. package/lib/src/types/ameli.js +1 -1074
  8. package/lib/src/types/debats.d.ts +2 -380
  9. package/lib/src/types/debats.js +1 -266
  10. package/lib/src/types/dosleg.d.ts +69 -2953
  11. package/lib/src/types/dosleg.js +1 -2005
  12. package/lib/src/types/questions.d.ts +2 -699
  13. package/lib/src/types/questions.js +1 -493
  14. package/lib/src/types/sens.d.ts +7 -7842
  15. package/lib/src/types/sens.js +1 -4691
  16. package/lib/src/utils/nvs-parsing.d.ts +1 -1
  17. package/lib/src/utils/nvs-parsing.js +9 -1
  18. package/lib/src/videos/pipeline.d.ts +3 -3
  19. package/lib/src/videos/pipeline.js +2 -2
  20. package/package.json +1 -1
  21. package/lib/add-js-extensions-v2.d.ts +0 -1
  22. package/lib/add-js-extensions-v2.js +0 -23
  23. package/lib/add-js-extensions.d.ts +0 -1
  24. package/lib/add-js-extensions.js +0 -17
  25. package/lib/aggregates.d.ts +0 -52
  26. package/lib/aggregates.js +0 -930
  27. package/lib/aggregates.mjs +0 -713
  28. package/lib/aggregates.ts +0 -833
  29. package/lib/config.d.ts +0 -10
  30. package/lib/config.js +0 -16
  31. package/lib/config.mjs +0 -16
  32. package/lib/config.ts +0 -26
  33. package/lib/databases.d.ts +0 -2
  34. package/lib/databases.js +0 -26
  35. package/lib/databases.mjs +0 -57
  36. package/lib/databases.ts +0 -71
  37. package/lib/datasets.d.ts +0 -34
  38. package/lib/datasets.js +0 -233
  39. package/lib/datasets.mjs +0 -78
  40. package/lib/datasets.ts +0 -118
  41. package/lib/fields.d.ts +0 -10
  42. package/lib/fields.js +0 -68
  43. package/lib/fields.mjs +0 -22
  44. package/lib/fields.ts +0 -29
  45. package/lib/git.d.ts +0 -26
  46. package/lib/git.js +0 -167
  47. package/lib/index.d.ts +0 -13
  48. package/lib/index.js +0 -1
  49. package/lib/index.mjs +0 -7
  50. package/lib/index.ts +0 -64
  51. package/lib/inserters.d.ts +0 -98
  52. package/lib/inserters.js +0 -500
  53. package/lib/inserters.mjs +0 -360
  54. package/lib/inserters.ts +0 -521
  55. package/lib/legislatures.json +0 -38
  56. package/lib/loaders.d.ts +0 -58
  57. package/lib/loaders.js +0 -286
  58. package/lib/loaders.mjs +0 -158
  59. package/lib/loaders.ts +0 -271
  60. package/lib/model/agenda.d.ts +0 -6
  61. package/lib/model/agenda.js +0 -148
  62. package/lib/model/ameli.d.ts +0 -51
  63. package/lib/model/ameli.js +0 -149
  64. package/lib/model/ameli.mjs +0 -84
  65. package/lib/model/ameli.ts +0 -100
  66. package/lib/model/commission.d.ts +0 -18
  67. package/lib/model/commission.js +0 -269
  68. package/lib/model/debats.d.ts +0 -67
  69. package/lib/model/debats.js +0 -95
  70. package/lib/model/debats.mjs +0 -43
  71. package/lib/model/debats.ts +0 -68
  72. package/lib/model/documents.d.ts +0 -12
  73. package/lib/model/documents.js +0 -151
  74. package/lib/model/dosleg.d.ts +0 -7
  75. package/lib/model/dosleg.js +0 -326
  76. package/lib/model/dosleg.mjs +0 -196
  77. package/lib/model/dosleg.ts +0 -240
  78. package/lib/model/index.d.ts +0 -7
  79. package/lib/model/index.js +0 -7
  80. package/lib/model/index.mjs +0 -5
  81. package/lib/model/index.ts +0 -15
  82. package/lib/model/questions.d.ts +0 -45
  83. package/lib/model/questions.js +0 -89
  84. package/lib/model/questions.mjs +0 -71
  85. package/lib/model/questions.ts +0 -93
  86. package/lib/model/scrutins.d.ts +0 -13
  87. package/lib/model/scrutins.js +0 -114
  88. package/lib/model/seance.d.ts +0 -3
  89. package/lib/model/seance.js +0 -267
  90. package/lib/model/sens.d.ts +0 -146
  91. package/lib/model/sens.js +0 -454
  92. package/lib/model/sens.mjs +0 -415
  93. package/lib/model/sens.ts +0 -516
  94. package/lib/model/texte.d.ts +0 -7
  95. package/lib/model/texte.js +0 -256
  96. package/lib/model/texte.mjs +0 -208
  97. package/lib/model/texte.ts +0 -229
  98. package/lib/model/util.d.ts +0 -9
  99. package/lib/model/util.js +0 -38
  100. package/lib/model/util.mjs +0 -19
  101. package/lib/model/util.ts +0 -32
  102. package/lib/parsers/texte.d.ts +0 -7
  103. package/lib/parsers/texte.js +0 -228
  104. package/lib/raw_types/ameli.d.ts +0 -914
  105. package/lib/raw_types/ameli.js +0 -5
  106. package/lib/raw_types/ameli.mjs +0 -163
  107. package/lib/raw_types/debats.d.ts +0 -207
  108. package/lib/raw_types/debats.js +0 -5
  109. package/lib/raw_types/debats.mjs +0 -58
  110. package/lib/raw_types/dosleg.d.ts +0 -1619
  111. package/lib/raw_types/dosleg.js +0 -5
  112. package/lib/raw_types/dosleg.mjs +0 -438
  113. package/lib/raw_types/questions.d.ts +0 -419
  114. package/lib/raw_types/questions.js +0 -5
  115. package/lib/raw_types/questions.mjs +0 -11
  116. package/lib/raw_types/senat.d.ts +0 -11368
  117. package/lib/raw_types/senat.js +0 -5
  118. package/lib/raw_types/sens.d.ts +0 -8248
  119. package/lib/raw_types/sens.js +0 -5
  120. package/lib/raw_types/sens.mjs +0 -508
  121. package/lib/raw_types_kysely/ameli.d.ts +0 -915
  122. package/lib/raw_types_kysely/ameli.js +0 -7
  123. package/lib/raw_types_kysely/ameli.mjs +0 -5
  124. package/lib/raw_types_kysely/ameli.ts +0 -951
  125. package/lib/raw_types_kysely/debats.d.ts +0 -207
  126. package/lib/raw_types_kysely/debats.js +0 -7
  127. package/lib/raw_types_kysely/debats.mjs +0 -5
  128. package/lib/raw_types_kysely/debats.ts +0 -222
  129. package/lib/raw_types_kysely/dosleg.d.ts +0 -3532
  130. package/lib/raw_types_kysely/dosleg.js +0 -7
  131. package/lib/raw_types_kysely/dosleg.mjs +0 -5
  132. package/lib/raw_types_kysely/dosleg.ts +0 -3621
  133. package/lib/raw_types_kysely/questions.d.ts +0 -414
  134. package/lib/raw_types_kysely/questions.js +0 -7
  135. package/lib/raw_types_kysely/questions.mjs +0 -5
  136. package/lib/raw_types_kysely/questions.ts +0 -426
  137. package/lib/raw_types_kysely/sens.d.ts +0 -4394
  138. package/lib/raw_types_kysely/sens.js +0 -7
  139. package/lib/raw_types_kysely/sens.mjs +0 -5
  140. package/lib/raw_types_kysely/sens.ts +0 -4499
  141. package/lib/raw_types_schemats/ameli.d.ts +0 -539
  142. package/lib/raw_types_schemats/ameli.js +0 -2
  143. package/lib/raw_types_schemats/ameli.mjs +0 -2
  144. package/lib/raw_types_schemats/ameli.ts +0 -601
  145. package/lib/raw_types_schemats/debats.d.ts +0 -127
  146. package/lib/raw_types_schemats/debats.js +0 -2
  147. package/lib/raw_types_schemats/debats.mjs +0 -2
  148. package/lib/raw_types_schemats/debats.ts +0 -145
  149. package/lib/raw_types_schemats/dosleg.d.ts +0 -977
  150. package/lib/raw_types_schemats/dosleg.js +0 -2
  151. package/lib/raw_types_schemats/dosleg.mjs +0 -2
  152. package/lib/raw_types_schemats/dosleg.ts +0 -2193
  153. package/lib/raw_types_schemats/questions.d.ts +0 -235
  154. package/lib/raw_types_schemats/questions.js +0 -2
  155. package/lib/raw_types_schemats/questions.mjs +0 -2
  156. package/lib/raw_types_schemats/questions.ts +0 -249
  157. package/lib/raw_types_schemats/sens.d.ts +0 -6915
  158. package/lib/raw_types_schemats/sens.js +0 -2
  159. package/lib/raw_types_schemats/sens.mjs +0 -2
  160. package/lib/raw_types_schemats/sens.ts +0 -2907
  161. package/lib/scripts/convert_data.d.ts +0 -1
  162. package/lib/scripts/convert_data.js +0 -354
  163. package/lib/scripts/convert_data.mjs +0 -181
  164. package/lib/scripts/convert_data.ts +0 -243
  165. package/lib/scripts/data-download.d.ts +0 -1
  166. package/lib/scripts/data-download.js +0 -12
  167. package/lib/scripts/datautil.d.ts +0 -8
  168. package/lib/scripts/datautil.js +0 -34
  169. package/lib/scripts/datautil.mjs +0 -16
  170. package/lib/scripts/datautil.ts +0 -19
  171. package/lib/scripts/images/transparent_150x192.jpg +0 -0
  172. package/lib/scripts/images/transparent_155x225.jpg +0 -0
  173. package/lib/scripts/parse_textes.d.ts +0 -1
  174. package/lib/scripts/parse_textes.js +0 -44
  175. package/lib/scripts/parse_textes.mjs +0 -46
  176. package/lib/scripts/parse_textes.ts +0 -65
  177. package/lib/scripts/retrieve_agenda.d.ts +0 -1
  178. package/lib/scripts/retrieve_agenda.js +0 -132
  179. package/lib/scripts/retrieve_cr_commission.d.ts +0 -1
  180. package/lib/scripts/retrieve_cr_commission.js +0 -364
  181. package/lib/scripts/retrieve_cr_seance.d.ts +0 -6
  182. package/lib/scripts/retrieve_cr_seance.js +0 -347
  183. package/lib/scripts/retrieve_documents.d.ts +0 -3
  184. package/lib/scripts/retrieve_documents.js +0 -219
  185. package/lib/scripts/retrieve_documents.mjs +0 -249
  186. package/lib/scripts/retrieve_documents.ts +0 -298
  187. package/lib/scripts/retrieve_open_data.d.ts +0 -1
  188. package/lib/scripts/retrieve_open_data.js +0 -315
  189. package/lib/scripts/retrieve_open_data.mjs +0 -217
  190. package/lib/scripts/retrieve_open_data.ts +0 -268
  191. package/lib/scripts/retrieve_senateurs_photos.d.ts +0 -1
  192. package/lib/scripts/retrieve_senateurs_photos.js +0 -147
  193. package/lib/scripts/retrieve_senateurs_photos.mjs +0 -147
  194. package/lib/scripts/retrieve_senateurs_photos.ts +0 -177
  195. package/lib/scripts/retrieve_videos.d.ts +0 -1
  196. package/lib/scripts/retrieve_videos.js +0 -461
  197. package/lib/scripts/shared/cli_helpers.d.ts +0 -95
  198. package/lib/scripts/shared/cli_helpers.js +0 -91
  199. package/lib/scripts/shared/cli_helpers.ts +0 -36
  200. package/lib/scripts/shared/util.d.ts +0 -4
  201. package/lib/scripts/shared/util.js +0 -35
  202. package/lib/scripts/shared/util.ts +0 -33
  203. package/lib/scripts/test_iter_load.d.ts +0 -1
  204. package/lib/scripts/test_iter_load.js +0 -12
  205. package/lib/src/ameli.d.ts +0 -66
  206. package/lib/src/ameli.js +0 -1
  207. package/lib/src/databases.d.ts +0 -3
  208. package/lib/src/databases.js +0 -26
  209. package/lib/src/db_types/ameli.d.ts +0 -1762
  210. package/lib/src/db_types/ameli.js +0 -1074
  211. package/lib/src/db_types/debats.d.ts +0 -380
  212. package/lib/src/db_types/debats.js +0 -266
  213. package/lib/src/db_types/dosleg.d.ts +0 -2954
  214. package/lib/src/db_types/dosleg.js +0 -2005
  215. package/lib/src/db_types/questions.d.ts +0 -699
  216. package/lib/src/db_types/questions.js +0 -493
  217. package/lib/src/db_types/sens.d.ts +0 -7843
  218. package/lib/src/db_types/sens.js +0 -4691
  219. package/lib/src/debats.d.ts +0 -38
  220. package/lib/src/debats.js +0 -1
  221. package/lib/src/dosleg.d.ts +0 -142
  222. package/lib/src/dosleg.js +0 -193
  223. package/lib/src/model/ameli_postgres.d.ts +0 -67
  224. package/lib/src/model/ameli_postgres.js +0 -150
  225. package/lib/src/other_types/questions.d.ts +0 -2
  226. package/lib/src/other_types/questions.js +0 -1
  227. package/lib/src/questions.d.ts +0 -53
  228. package/lib/src/questions.js +0 -1
  229. package/lib/src/raw_types/senat.d.ts +0 -11372
  230. package/lib/src/raw_types/senat.js +0 -5
  231. package/lib/src/rich_types/agenda.d.ts +0 -45
  232. package/lib/src/rich_types/agenda.js +0 -1
  233. package/lib/src/rich_types/compte_rendu.d.ts +0 -83
  234. package/lib/src/rich_types/compte_rendu.js +0 -1
  235. package/lib/src/rich_types/sessions.d.ts +0 -6
  236. package/lib/src/rich_types/sessions.js +0 -19
  237. package/lib/src/rich_types/texte.d.ts +0 -72
  238. package/lib/src/rich_types/texte.js +0 -15
  239. package/lib/src/scripts/test_iter_load.d.ts +0 -1
  240. package/lib/src/scripts/test_iter_load.js +0 -12
  241. package/lib/src/sens.d.ts +0 -104
  242. package/lib/src/sens.js +0 -1
  243. package/lib/strings.d.ts +0 -1
  244. package/lib/strings.js +0 -18
  245. package/lib/strings.mjs +0 -18
  246. package/lib/strings.ts +0 -26
  247. package/lib/tsconfig.tsbuildinfo +0 -1
  248. package/lib/types/agenda.d.ts +0 -44
  249. package/lib/types/agenda.js +0 -1
  250. package/lib/types/ameli.d.ts +0 -5
  251. package/lib/types/ameli.js +0 -1
  252. package/lib/types/ameli.mjs +0 -13
  253. package/lib/types/ameli.ts +0 -21
  254. package/lib/types/compte_rendu.d.ts +0 -83
  255. package/lib/types/compte_rendu.js +0 -1
  256. package/lib/types/debats.d.ts +0 -2
  257. package/lib/types/debats.js +0 -1
  258. package/lib/types/debats.mjs +0 -2
  259. package/lib/types/debats.ts +0 -6
  260. package/lib/types/dosleg.d.ts +0 -70
  261. package/lib/types/dosleg.js +0 -1
  262. package/lib/types/dosleg.mjs +0 -151
  263. package/lib/types/dosleg.ts +0 -284
  264. package/lib/types/questions.d.ts +0 -2
  265. package/lib/types/questions.js +0 -1
  266. package/lib/types/questions.mjs +0 -1
  267. package/lib/types/questions.ts +0 -3
  268. package/lib/types/sens.d.ts +0 -10
  269. package/lib/types/sens.js +0 -1
  270. package/lib/types/sens.mjs +0 -1
  271. package/lib/types/sens.ts +0 -12
  272. package/lib/types/sessions.d.ts +0 -5
  273. package/lib/types/sessions.js +0 -84
  274. package/lib/types/sessions.mjs +0 -43
  275. package/lib/types/sessions.ts +0 -42
  276. package/lib/types/texte.d.ts +0 -74
  277. package/lib/types/texte.js +0 -16
  278. package/lib/types/texte.mjs +0 -16
  279. package/lib/types/texte.ts +0 -76
  280. package/lib/typings/windows-1252.d.js +0 -2
  281. package/lib/typings/windows-1252.d.mjs +0 -2
  282. package/lib/typings/windows-1252.d.ts +0 -11
  283. package/lib/utils/cr_spliting.d.ts +0 -28
  284. package/lib/utils/cr_spliting.js +0 -265
  285. package/lib/utils/date.d.ts +0 -10
  286. package/lib/utils/date.js +0 -100
  287. package/lib/utils/nvs-timecode.d.ts +0 -7
  288. package/lib/utils/nvs-timecode.js +0 -79
  289. package/lib/utils/reunion_grouping.d.ts +0 -9
  290. package/lib/utils/reunion_grouping.js +0 -361
  291. package/lib/utils/reunion_odj_building.d.ts +0 -5
  292. package/lib/utils/reunion_odj_building.js +0 -154
  293. package/lib/utils/reunion_parsing.d.ts +0 -23
  294. package/lib/utils/reunion_parsing.js +0 -209
  295. package/lib/utils/scoring.d.ts +0 -14
  296. package/lib/utils/scoring.js +0 -147
  297. package/lib/utils/string_cleaning.d.ts +0 -7
  298. package/lib/utils/string_cleaning.js +0 -57
  299. package/lib/validators/config.d.ts +0 -9
  300. package/lib/validators/config.js +0 -10
  301. package/lib/validators/config.mjs +0 -54
  302. package/lib/validators/config.ts +0 -79
  303. package/lib/validators/senat.d.ts +0 -0
  304. package/lib/validators/senat.js +0 -28
  305. package/lib/validators/senat.mjs +0 -24
  306. package/lib/validators/senat.ts +0 -26
@@ -1,364 +0,0 @@
1
- import fs, { ensureDir } from "fs-extra";
2
- import assert from "assert";
3
- import path from "path";
4
- import * as cheerio from "cheerio";
5
- import { COMMISSION_FOLDER, DATA_ORIGINAL_FOLDER, DATA_TRANSFORMED_FOLDER } from "../loaders";
6
- import { loadAgendaForDate, parseCommissionMetadataFromHtml, linkCRtoCommissionGroup } from "../utils/cr_spliting";
7
- import { cleanTitle, extractDayH3Sections, parseCommissionCRSectionFromDom } from "../model/commission";
8
- import commandLineArgs from "command-line-args";
9
- import { commonOptions } from "./shared/cli_helpers";
10
- import { sessionStartYearFromDate } from "../model/seance";
11
- import { getSessionsFromStart } from "../types/sessions";
12
- import { ensureAndClearDir, fetchWithRetry } from "./shared/util";
13
- import { jaccard, jaccardTokenSim } from "../utils/scoring";
14
- class CommissionCRDownloadError extends Error {
15
- constructor(message, url) {
16
- super(`An error occurred while retrieving Commission CR ${url}: ${message}`);
17
- }
18
- }
19
- const optionsDefinitions = [
20
- ...commonOptions,
21
- { name: "concurrency", type: Number, defaultValue: 6, help: "Max parallel downloads" },
22
- { name: "politenessMs", type: Number, defaultValue: 150, help: "Delay per worker (ms)" },
23
- {
24
- help: "parse and convert comptes-rendus des débats into JSON",
25
- name: "parseDebats",
26
- type: Boolean,
27
- },
28
- ];
29
- const options = commandLineArgs(optionsDefinitions);
30
- const COMMISSION_HUBS = {
31
- "Commission des affaires étrangères": [
32
- "https://www.senat.fr/compte-rendu-commissions/affaires-etrangeres.html",
33
- "https://www.senat.fr/compte-rendu-commissions/affaires-etrangeres_archives.html",
34
- ],
35
- "Commission des affaires économiques": [
36
- "https://www.senat.fr/compte-rendu-commissions/economie.html",
37
- "https://www.senat.fr/compte-rendu-commissions/economie_archives.html",
38
- ],
39
- "Commission de l'amenagement du territoire et du développement durable": [
40
- "https://www.senat.fr/compte-rendu-commissions/developpement-durable.html",
41
- "https://www.senat.fr/compte-rendu-commissions/developpement-durable_archives.html",
42
- ],
43
- "Commission de la culture": [
44
- "https://www.senat.fr/compte-rendu-commissions/culture.html",
45
- "https://www.senat.fr/compte-rendu-commissions/culture_archives.html",
46
- ],
47
- "Commission des finances": [
48
- "https://www.senat.fr/compte-rendu-commissions/finances.html",
49
- "https://www.senat.fr/compte-rendu-commissions/finances_archives.html",
50
- ],
51
- "Commission des lois": [
52
- "https://www.senat.fr/compte-rendu-commissions/lois.html",
53
- "https://www.senat.fr/compte-rendu-commissions/lois_archives.html",
54
- ],
55
- "Commission des affaires sociales": [
56
- "https://www.senat.fr/compte-rendu-commissions/affaires-sociales.html",
57
- "https://www.senat.fr/compte-rendu-commissions/affaires-sociales_archives.html",
58
- ],
59
- "Commission des affaires européennes": [
60
- "https://www.senat.fr/compte-rendu-commissions/affaires-europeennes.html",
61
- "https://www.senat.fr/compte-rendu-commissions/affaires-europeennes_archives.html",
62
- ],
63
- };
64
- async function harvestWeeklyLinksFromHub(hubUrl) {
65
- const res = await fetchWithRetry(hubUrl);
66
- if (!res.ok)
67
- return [];
68
- const html = await res.text();
69
- const $ = cheerio.load(html);
70
- const out = [];
71
- $("a[href]").each((_, a) => {
72
- const href = ($(a).attr("href") || "").trim();
73
- const m = href.match(/\/compte-rendu-commissions\/(\d{8})\/([a-z0-9\-]+)\.html$/i);
74
- if (m) {
75
- const url = href.startsWith("http") ? href : new URL(href, hubUrl).toString();
76
- out.push(url);
77
- }
78
- });
79
- return Array.from(new Set(out));
80
- }
81
- async function discoverCommissionWeeklyPages(fromSession) {
82
- const results = [];
83
- for (const [commissionKey, hubs] of Object.entries(COMMISSION_HUBS)) {
84
- for (const hubUrl of hubs) {
85
- try {
86
- const links = await harvestWeeklyLinksFromHub(hubUrl);
87
- for (const url of links) {
88
- const m = url.match(/\/compte-rendu-commissions\/(\d{8})\/([a-z0-9\-]+)\.html$/i);
89
- if (!m)
90
- continue;
91
- const yyyymmdd = m[1];
92
- const year = Number(yyyymmdd.slice(0, 4));
93
- const month = Number(yyyymmdd.slice(4, 6));
94
- const session = month >= 10 ? year : year - 1;
95
- if (session < fromSession)
96
- continue;
97
- results.push({ url, yyyymmdd, commissionKey });
98
- }
99
- }
100
- catch (e) {
101
- console.warn(`[COM-CR][hub-fail] ${hubUrl} → ${e?.message ?? e}`);
102
- }
103
- }
104
- }
105
- return results.sort((a, b) => a.yyyymmdd.localeCompare(b.yyyymmdd));
106
- }
107
- function toHourShort(hhmm) {
108
- if (!hhmm)
109
- return null;
110
- const m = hhmm.match(/^(\d{2}):(\d{2})$/);
111
- return m ? `${m[1]}${m[2]}` : null;
112
- }
113
- function timeToMinutes(hhmm) {
114
- const [h, m] = hhmm.split(":").map((n) => parseInt(n, 10));
115
- return (h || 0) * 60 + (m || 0);
116
- }
117
- async function tryDownload(url) {
118
- const res = await fetch(url, { redirect: "follow" });
119
- if (res.status === 404)
120
- return null;
121
- if (!res.ok)
122
- throw new CommissionCRDownloadError(String(res.status), url);
123
- const ab = await res.arrayBuffer();
124
- return Buffer.from(ab);
125
- }
126
- function normOrgane(s) {
127
- return s
128
- .toLowerCase()
129
- .normalize("NFD")
130
- .replace(/[\u0300-\u036f]/g, "")
131
- .replace(/&/g, " et ")
132
- .replace(/[^a-z0-9\s-]/g, " ")
133
- .replace(/\s+/g, " ")
134
- .trim();
135
- }
136
- function toTokens(s) {
137
- return new Set(normOrgane(s)
138
- .split(/\s+/)
139
- .filter((t) => t.length >= 3 && !["commission", "des", "de", "du", "d", "la", "le", "les", "et"].includes(t)));
140
- }
141
- function reunionOrganeCandidates(h) {
142
- const any = h;
143
- const out = [any.organeSlug, any.organeKey, any.organe, h.titre].filter(Boolean);
144
- return Array.from(new Set(out.map(normOrgane)));
145
- }
146
- function organeSimilarity(h, commissionKey) {
147
- const keyTokens = toTokens(commissionKey.replace(/-/g, " "));
148
- const cand = reunionOrganeCandidates(h).map(toTokens);
149
- let best = 0;
150
- for (const B of cand)
151
- best = Math.max(best, jaccard(keyTokens, B));
152
- return best; // 0..1
153
- }
154
- function timeProximityScore(h, openHHMM, maxDeltaMin) {
155
- if (!openHHMM)
156
- return 0;
157
- const hhmm = (h.startTime ?? null);
158
- if (!hhmm)
159
- return 0;
160
- const d = Math.abs(timeToMinutes(hhmm) - timeToMinutes(openHHMM));
161
- if (d > maxDeltaMin)
162
- return 0;
163
- return 1 - d / maxDeltaMin; // 0..1 (1 = même heure)
164
- }
165
- function titleSimilarity(reunion, sectionTitle) {
166
- const t = reunion.titre ?? "";
167
- const o = reunion.objet ?? "";
168
- if (!sectionTitle.trim())
169
- return 0;
170
- const sTit = jaccardTokenSim(t, sectionTitle);
171
- const sObj = jaccardTokenSim(o, sectionTitle);
172
- return Math.max(sTit, sObj);
173
- }
174
- async function retrieveCommissionCRs(options = {}) {
175
- const dataDir = options["dataDir"];
176
- const fromSession = Number(options["fromSession"]);
177
- const concurrency = Number(options["concurrency"] ?? 6);
178
- const politenessMs = Number(options["politenessMs"] ?? 150);
179
- const commissionsRootDir = path.join(dataDir, COMMISSION_FOLDER);
180
- const originalRoot = path.join(commissionsRootDir, DATA_ORIGINAL_FOLDER);
181
- if (!options["keepDir"]) {
182
- ensureAndClearDir(originalRoot);
183
- }
184
- else {
185
- ensureDir(originalRoot);
186
- }
187
- const discovered = await discoverCommissionWeeklyPages(fromSession);
188
- console.log(`[COM-CR][discover] ${discovered.length} links (>= session ${fromSession})`);
189
- const jobs = discovered.map(({ url, yyyymmdd, commissionKey }) => {
190
- const d = new Date(Number(yyyymmdd.slice(0, 4)), Number(yyyymmdd.slice(4, 6)) - 1, Number(yyyymmdd.slice(6, 8)));
191
- const session = sessionStartYearFromDate(d);
192
- const dir = path.join(originalRoot, String(session), commissionKey);
193
- fs.ensureDirSync(dir);
194
- const slug = url.replace(/^.*\/(\d{8})\/([^\/]+)\.html$/i, "$2");
195
- const outPath = path.join(dir, `${yyyymmdd}.${slug}.html`);
196
- return { url, outPath, yyyymmdd, commissionKey };
197
- });
198
- console.log(`[COM-CR] Downloading ${jobs.length} links → ${path.relative(process.cwd(), originalRoot)}`);
199
- let completed = 0, saved = 0, skipped = 0, notFound = 0;
200
- const workers = Array.from({ length: Math.max(1, concurrency) }, async () => {
201
- while (true) {
202
- const job = jobs.shift();
203
- if (!job)
204
- break;
205
- const { url, outPath, yyyymmdd } = job;
206
- try {
207
- if (await fs.pathExists(outPath)) {
208
- skipped++;
209
- }
210
- else {
211
- const buf = await tryDownload(url);
212
- if (!buf) {
213
- notFound++;
214
- console.warn(`[COM-CR][404] ${url} → week=${yyyymmdd}`);
215
- }
216
- else {
217
- await fs.writeFile(outPath, buf);
218
- saved++;
219
- }
220
- }
221
- }
222
- catch (e) {
223
- console.error(`[COM-CR][err] ${url} → ${e?.message || e}`);
224
- }
225
- finally {
226
- completed++;
227
- if (politenessMs > 0)
228
- await new Promise((r) => setTimeout(r, politenessMs));
229
- }
230
- }
231
- });
232
- await Promise.all(workers);
233
- console.log(`[COM-CR] done: saved=${saved} | skipped=${skipped} | 404=${notFound} | total=${completed}`);
234
- const sessions = getSessionsFromStart(options["fromSession"]);
235
- const comRoot = path.join(dataDir, COMMISSION_FOLDER);
236
- const transformedRoot = path.join(comRoot, DATA_TRANSFORMED_FOLDER);
237
- if (options["keepDir"])
238
- ensureDir(transformedRoot);
239
- else
240
- ensureAndClearDir(transformedRoot);
241
- for (const session of sessions) {
242
- const originalSessionDir = path.join(originalRoot, String(session));
243
- if (!(await fs.pathExists(originalSessionDir)))
244
- continue;
245
- const commissionDirs = (await fs.readdir(originalSessionDir, { withFileTypes: true }))
246
- .filter((d) => d.isDirectory())
247
- .map((d) => d.name); // ex: "affaires-etrangeres", "finances", etc.
248
- for (const commissionKey of commissionDirs) {
249
- const commissionDir = path.join(originalSessionDir, commissionKey);
250
- const htmlFiles = (await fs.readdir(commissionDir)).filter((f) => /\.html?$/i.test(f)).sort();
251
- let totalFiles = 0;
252
- let linkedFiles = 0;
253
- for (const f of htmlFiles) {
254
- const htmlPath = path.join(commissionDir, f);
255
- let meta;
256
- let raw = "";
257
- try {
258
- raw = await fs.readFile(htmlPath, "utf8");
259
- meta = parseCommissionMetadataFromHtml(raw, f);
260
- }
261
- catch (e) {
262
- console.warn(`[COM-CR][PRE][${session}] Cannot read/parse ${f}:`, e);
263
- continue;
264
- }
265
- if (!meta?.days?.length)
266
- continue;
267
- const $ = cheerio.load(raw, { xmlMode: false });
268
- for (const day of meta.days) {
269
- const yyyymmdd = day.date.replace(/-/g, "");
270
- const dt = new Date(Number(day.date.slice(0, 4)), Number(day.date.slice(5, 7)) - 1, Number(day.date.slice(8, 10)));
271
- const daySession = sessionStartYearFromDate(dt);
272
- let hits = await loadAgendaForDate(dataDir, yyyymmdd, daySession);
273
- console.log(`[COM-CR][TRANSFORM] ${f} → ${hits.length} agenda events on ${day.date} :`);
274
- const sections = extractDayH3Sections($, day.date);
275
- if (sections.length === 0) {
276
- console.warn(`[COM-CR][TRANSFORM] no sections found for ${f} on ${day.date}, skipping.`);
277
- continue;
278
- }
279
- const MAX_TIME_DELTA_MIN = 120;
280
- const ORGANE_GATE = 0.55;
281
- const TITLE_GATE = 0.2;
282
- const W_ORG = 0.4;
283
- const W_TIM = 0.4;
284
- const W_TIT = 0.2;
285
- for (let sIdx = 0; sIdx < sections.length; sIdx++) {
286
- const sec = sections[sIdx];
287
- let best = null;
288
- let reason = "fallback-none";
289
- if (hits.length) {
290
- const scored = hits
291
- .map((h) => {
292
- const sOrg = organeSimilarity(h, commissionKey); // 0..1
293
- const sTim = timeProximityScore(h, sec.time ?? day.openTime ?? null, MAX_TIME_DELTA_MIN); // 0..1
294
- const sTit = titleSimilarity(h, sec.title); // 0..1
295
- const total = W_ORG * sOrg + W_TIM * sTim + W_TIT * sTit;
296
- return { h, sOrg, sTim, sTit, total };
297
- })
298
- .filter((x) => x.sOrg >= ORGANE_GATE && x.sTit >= TITLE_GATE)
299
- .sort((a, b) => b.total - a.total);
300
- if (scored[0]) {
301
- best = scored[0].h;
302
- reason =
303
- scored[0].sTit >= Math.max(scored[0].sOrg, scored[0].sTim)
304
- ? "title"
305
- : scored[0].sOrg >= scored[0].sTim
306
- ? "organe"
307
- : "time";
308
- }
309
- }
310
- const hourShort = toHourShort(day.openTime) ?? "NA";
311
- const cr = parseCommissionCRSectionFromDom($, htmlPath, {
312
- dateISO: day.date,
313
- hourShort,
314
- organe: commissionKey,
315
- section: sec,
316
- matched: best ?? undefined,
317
- });
318
- if (!cr) {
319
- console.warn(`[COM-CR][TRANSFORM] parse failed for section#${sIdx} ${path.basename(htmlPath)} → ${best ? best.uid : "NO-GROUP"} (${commissionKey})`);
320
- continue;
321
- }
322
- const fileUid = cr.uid;
323
- const transformedSessionDir = path.join(transformedRoot, String(daySession));
324
- fs.ensureDirSync(transformedSessionDir);
325
- const outPath = path.join(transformedSessionDir, `${fileUid}.json`);
326
- await fs.writeJSON(outPath, cr, { spaces: 2 });
327
- const titreGuess = cleanTitle(sections[sIdx].title) || "Commission du " + day.date;
328
- const up = await linkCRtoCommissionGroup({
329
- dataDir,
330
- dateISO: day.date,
331
- organeDetected: commissionKey,
332
- hourShort,
333
- crUid: fileUid,
334
- titreGuess,
335
- groupUid: best ? best.uid : undefined,
336
- });
337
- totalFiles++;
338
- if (up.created || up.updated)
339
- linkedFiles++;
340
- else {
341
- console.warn(`[COM-CR][AGENDA][WARN] CR ${fileUid} (section#${sIdx}) not linked (reason=${reason})`);
342
- }
343
- }
344
- }
345
- }
346
- if (!options["silent"]) {
347
- console.log(`[COM-CR][SESSION ${session}][${commissionKey}] Processed ${totalFiles} CR files, linked to agenda: ${linkedFiles}`);
348
- }
349
- }
350
- }
351
- }
352
- async function main() {
353
- const dataDir = options["dataDir"];
354
- assert(dataDir, "Missing argument: data directory");
355
- console.time("CRI processing time");
356
- await retrieveCommissionCRs(options);
357
- console.timeEnd("CRI processing time");
358
- }
359
- main()
360
- .then(() => process.exit(0))
361
- .catch((error) => {
362
- console.error(error);
363
- process.exit(1);
364
- });
@@ -1,6 +0,0 @@
1
- /**
2
- * Needs to be ran after retrieve_agenda.ts script !
3
- * - downloads the ZIP of comptes-rendus des débats (CRI) from data.senat.fr
4
- * - extracts XML files, distributes them by session/year
5
- */
6
- export declare function retrieveCriXmlDump(dataDir: string, options?: Record<string, any>): Promise<void>;