@tricoteuses/senat 3.1.2 → 3.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. package/lib/src/rich_types/dosleg.js +13 -3
  2. package/lib/src/rich_types/sens.d.ts +2 -0
  3. package/lib/src/scripts/data-download.js +16 -9
  4. package/lib/src/scripts/retrieve_open_data.js +4 -2
  5. package/lib/src/scripts/shared/make_generate_zod_schemas.js +6 -2
  6. package/lib/src/server/databases_postgres.js +2 -1
  7. package/lib/src/server/documents.js +2 -2
  8. package/lib/src/server/dosleg.js +2 -2
  9. package/lib/src/server/sens.js +70 -0
  10. package/lib/src/utils/reunion_parsing.js +1 -1
  11. package/package.json +3 -1
  12. package/lib/src/config.d.ts +0 -43
  13. package/lib/src/config.js +0 -37
  14. package/lib/src/conversion_textes.d.ts +0 -11
  15. package/lib/src/conversion_textes.js +0 -320
  16. package/lib/src/databases_postgres.d.ts +0 -4
  17. package/lib/src/databases_postgres.js +0 -23
  18. package/lib/src/datasets.d.ts +0 -38
  19. package/lib/src/datasets.js +0 -247
  20. package/lib/src/git.d.ts +0 -27
  21. package/lib/src/git.js +0 -251
  22. package/lib/src/loaders.d.ts +0 -52
  23. package/lib/src/loaders.js +0 -260
  24. package/lib/src/model/agenda.d.ts +0 -6
  25. package/lib/src/model/agenda.js +0 -148
  26. package/lib/src/model/ameli.d.ts +0 -67
  27. package/lib/src/model/ameli.js +0 -150
  28. package/lib/src/model/commission.d.ts +0 -19
  29. package/lib/src/model/commission.js +0 -269
  30. package/lib/src/model/debats.d.ts +0 -39
  31. package/lib/src/model/debats.js +0 -112
  32. package/lib/src/model/documents.d.ts +0 -32
  33. package/lib/src/model/documents.js +0 -182
  34. package/lib/src/model/dosleg.d.ts +0 -144
  35. package/lib/src/model/dosleg.js +0 -468
  36. package/lib/src/model/index.d.ts +0 -7
  37. package/lib/src/model/index.js +0 -7
  38. package/lib/src/model/questions.d.ts +0 -54
  39. package/lib/src/model/questions.js +0 -91
  40. package/lib/src/model/scrutins.d.ts +0 -48
  41. package/lib/src/model/scrutins.js +0 -121
  42. package/lib/src/model/seance.d.ts +0 -3
  43. package/lib/src/model/seance.js +0 -267
  44. package/lib/src/model/sens.d.ts +0 -112
  45. package/lib/src/model/sens.js +0 -385
  46. package/lib/src/model/util.d.ts +0 -1
  47. package/lib/src/model/util.js +0 -15
  48. package/lib/src/raw_types/ameli.d.ts +0 -1762
  49. package/lib/src/raw_types/ameli.js +0 -1074
  50. package/lib/src/raw_types/debats.d.ts +0 -380
  51. package/lib/src/raw_types/debats.js +0 -266
  52. package/lib/src/raw_types/dosleg.d.ts +0 -2954
  53. package/lib/src/raw_types/dosleg.js +0 -2005
  54. package/lib/src/raw_types/questions.d.ts +0 -699
  55. package/lib/src/raw_types/questions.js +0 -493
  56. package/lib/src/raw_types/sens.d.ts +0 -7843
  57. package/lib/src/raw_types/sens.js +0 -4691
  58. package/lib/src/raw_types_schemats/ameli.d.ts +0 -541
  59. package/lib/src/raw_types_schemats/ameli.js +0 -2
  60. package/lib/src/raw_types_schemats/debats.d.ts +0 -127
  61. package/lib/src/raw_types_schemats/debats.js +0 -2
  62. package/lib/src/raw_types_schemats/dosleg.d.ts +0 -977
  63. package/lib/src/raw_types_schemats/dosleg.js +0 -2
  64. package/lib/src/raw_types_schemats/questions.d.ts +0 -237
  65. package/lib/src/raw_types_schemats/questions.js +0 -2
  66. package/lib/src/raw_types_schemats/sens.d.ts +0 -2709
  67. package/lib/src/raw_types_schemats/sens.js +0 -2
  68. package/lib/src/types/agenda.d.ts +0 -45
  69. package/lib/src/types/agenda.js +0 -1
  70. package/lib/src/types/ameli.d.ts +0 -5
  71. package/lib/src/types/ameli.js +0 -1
  72. package/lib/src/types/compte_rendu.d.ts +0 -83
  73. package/lib/src/types/compte_rendu.js +0 -1
  74. package/lib/src/types/debats.d.ts +0 -2
  75. package/lib/src/types/debats.js +0 -1
  76. package/lib/src/types/dosleg.d.ts +0 -70
  77. package/lib/src/types/dosleg.js +0 -1
  78. package/lib/src/types/questions.d.ts +0 -2
  79. package/lib/src/types/questions.js +0 -1
  80. package/lib/src/types/sens.d.ts +0 -8
  81. package/lib/src/types/sens.js +0 -1
  82. package/lib/src/types/sessions.d.ts +0 -6
  83. package/lib/src/types/sessions.js +0 -19
  84. package/lib/src/types/texte.d.ts +0 -72
  85. package/lib/src/types/texte.js +0 -15
  86. package/lib/src/validators/config.d.ts +0 -9
  87. package/lib/src/validators/config.js +0 -10
@@ -1,148 +0,0 @@
1
- import { JSDOM } from "jsdom";
2
- import { DateTime } from "luxon";
3
- import path from "path";
4
- import { ID_DATE_FORMAT, STANDARD_DATE_FORMAT } from "../scripts/datautil.js";
5
- const FR_TZ = "Europe/Paris";
6
/**
 * Tell whether an agenda event element is a public sitting ("séance").
 *
 * @param {Element} eventElement - Event node; only its `classList` is read.
 * @returns {boolean} `true` when the element carries the `evt-seance` class.
 */
function eventIsSeance(eventElement) {
    const { classList } = eventElement;
    return classList.contains("evt-seance");
}
9
/**
 * Map the CSS classes of an agenda event to a human-readable event type.
 *
 * Only the first class starting with `evt-` is considered; if that class is
 * not one of the known ones, the result is `null` even when a later class
 * would have matched.
 *
 * @param {Iterable<string>} eventClasses - Class names of the event element.
 * @returns {string | null} The French label of the event type, or `null`.
 */
function getEventType(eventClasses) {
    const labelsByClass = new Map([
        ["evt-seance", "Séance publique"],
        ["evt-instanz", "Commissions"],
        ["evt-cemi", "Mission de contrôle"],
        ["evt-deleg", "Offices et délégations"],
        ["evt-bureau", "Instances décisionnelles"],
    ]);
    for (const className of eventClasses) {
        if (className.startsWith("evt-")) {
            return labelsByClass.get(className) ?? null;
        }
    }
    return null;
}
25
/**
 * Find the link to the legislative file among the links of an event.
 *
 * @param {Iterable<Element>} lienElements - Anchor elements of the event.
 * @returns {string | null} The `href` of the first anchor whose text contains
 *   "dossier législatif", or `null` when there is none.
 */
function getUrlDossierSenat(lienElements) {
    for (const anchor of lienElements) {
        if (anchor.textContent?.includes("dossier législatif")) {
            return anchor.getAttribute("href");
        }
    }
    return null;
}
29
/**
 * Give the ordinal label of a sitting among the sittings of the same day.
 *
 * @param {*} eventElement - The sitting to locate.
 * @param {Array<*>} seancesElements - All sittings of the day, in order.
 * @returns {string} "Unique" when it is the only sitting, "Première" to
 *   "Cinquième" for positions 0 to 4, "Non défini" otherwise (including when
 *   the element is not in the list).
 */
function getQuantieme(eventElement, seancesElements) {
    const position = seancesElements.indexOf(eventElement);
    if (position === 0 && seancesElements.length === 1) {
        return "Unique";
    }
    const ordinals = ["Première", "Deuxième", "Troisième", "Quatrième", "Cinquième"];
    return ordinals[position] ?? "Non défini";
}
50
/**
 * Normalize a free-text time so that it becomes either a simple start time
 * ("H'h'mm") or a range ("'de 'H'h'mm' à 'H'h'mm").
 *
 * The rewrites are applied in order; the "À l'issue de..." prefix must be
 * stripped first. Vague periods are mapped to conventional hours:
 * "matin" → 10h00, "après-midi" → 16h00, "soir" → 20h00, "nuit" → 22h00.
 *
 * @param {string | null | undefined} timeStr - Raw time text.
 * @returns {string | undefined} Normalized text, or `undefined` when the
 *   input is `null`/`undefined`.
 */
function normalizeTime(timeStr) {
    if (timeStr === null || timeStr === undefined) {
        return undefined;
    }
    const rewrites = [
        [/^À l'issue de l'espace réservé .* et au plus tard\s/i, ""],
        [/^(?:le )?matin/i, "10h00"],
        [/^(?:l')?après-midi/i, "16h00"],
        [/^(?:le )?soir/i, "20h00"],
        [/^(?:la )?nuit/i, "22h00"],
        [/^à\s/gi, ""],
        [/heures/gi, "h00"],
        [/\set.*/i, ""],
        [/,.*/, ""],
        [/\s\(hors hémicycle\)/i, ""],
        [/\s*h\s*/gi, "h"],
    ];
    return rewrites.reduce((current, [pattern, replacement]) => current.replace(pattern, replacement), timeStr);
}
67
/**
 * Derive UTC start and end times from a free-text agenda time.
 *
 * The text is first normalized with `normalizeTime`. A "de HhMM à HhMM" range
 * yields both times; anything else is treated as a single start time. Each
 * time is interpreted in the `FR_TZ` zone on the day `dateISO`, then converted
 * to UTC and formatted as "HH:mm:ss.SSS'Z'".
 *
 * @param {string | null | undefined} timeStr - Raw time text.
 * @param {string} dateISO - Day of the event, parsed with `DateTime.fromISO`.
 * @returns {{ startTime: string | null, endTime: string | null }} Times that
 *   could not be parsed are `null`.
 */
export function getStartAndEndTimes(timeStr, dateISO) {
    const cleaned = normalizeTime(timeStr);
    if (!cleaned) {
        return { startTime: null, endTime: null };
    }

    // Convert a local "H'h'mm" clock value into a UTC time-of-day string.
    const asUtcClock = (clock) => {
        if (!clock) {
            return null;
        }
        const parsed = DateTime.fromFormat(clock, "H'h'mm", { zone: FR_TZ });
        if (!parsed.isValid) {
            return null;
        }
        const { hour, minute } = parsed;
        const onDay = DateTime.fromISO(dateISO, { zone: FR_TZ }).set({ hour, minute, second: 0, millisecond: 0 });
        return onDay.isValid ? onDay.toUTC().toFormat("HH:mm:ss.SSS'Z'") : null;
    };

    const range = cleaned.match(/^de (?<start>\d{1,2}h\d{2}) à (?<end>\d{1,2}h\d{2})$/i)?.groups;
    const [startClock, endClock] = range ? [range.start, range.end] : [cleaned, undefined];
    return {
        startTime: asUtcClock(startClock),
        endTime: asUtcClock(endClock),
    };
}
101
/**
 * Extract the agenda events from a parsed agenda page.
 *
 * Every `.evt` element whose previous sibling has a `name` attribute becomes
 * one event; elements without such an anchor are skipped.
 *
 * @param {Document} document - DOM of the agenda page.
 * @param {string} fileName - File name without extension; parsed with
 *   `ID_DATE_FORMAT` to obtain the day shared by all events of the page.
 * @returns {Array<object>} The events, in document order.
 */
function transformAgenda(document, fileName) {
    // The date depends only on the file name, so compute it once rather than per event.
    const date = DateTime.fromFormat(fileName, ID_DATE_FORMAT).toFormat(STANDARD_DATE_FORMAT);
    const eventElements = document.querySelectorAll(".evt");
    // Sittings only, in document order: used to number them ("Première", "Deuxième", ...).
    const seanceElements = Array.from(eventElements).filter((eventElement) => eventIsSeance(eventElement));
    const agendaEvents = [];
    for (const eventElement of eventElements) {
        // The event id is carried by the named anchor right before the event.
        const id = eventElement.previousElementSibling?.getAttribute("name") || null;
        if (!id) {
            continue;
        }
        const textOf = (selector) => eventElement.querySelector(selector)?.textContent;
        const timeOriginal = textOf(".time") || null;
        const { startTime, endTime } = getStartAndEndTimes(timeOriginal, date);
        agendaEvents.push({
            id,
            type: getEventType(eventElement.classList),
            date,
            startTime,
            endTime,
            timeOriginal,
            titre: textOf(".titre")?.trim() || "",
            organe: textOf(".organe")?.trim() || null,
            // Drop the leading "- " bullet of the object text.
            objet: textOf(".objet")?.trim()?.replace(/^- /, "") || null,
            lieu: textOf(".lieu") || null,
            captationVideo: eventElement.querySelector(".video") !== null,
            urlDossierSenat: getUrlDossierSenat(eventElement.querySelectorAll(".lien a")),
            quantieme: eventIsSeance(eventElement) ? getQuantieme(eventElement, seanceElements) : null,
        });
    }
    return agendaEvents;
}
138
/**
 * Parse an agenda HTML file into a list of agenda events.
 *
 * The file name (without extension) is handed to `transformAgenda`, which
 * derives the day of the events from it.
 *
 * @param {string} htmlFilePath - Path of the HTML file to parse.
 * @returns {Promise<Array<object> | null>} The events, or `null` when the file
 *   could not be read or parsed (the failure is logged, not rethrown).
 */
export async function parseAgendaFromFile(htmlFilePath) {
    try {
        const dom = await JSDOM.fromFile(htmlFilePath, { contentType: "text/html" });
        const fileName = path.parse(htmlFilePath).name;
        return transformAgenda(dom.window.document, fileName);
    }
    catch (error) {
        // Name the right artefact (agenda, not texte) and the file, and do not
        // assume the thrown value is an Error instance.
        const reason = error instanceof Error ? error.message : String(error);
        console.error(`Could not parse agenda file ${htmlFilePath} with error ${reason}`);
        return null;
    }
}
@@ -1,67 +0,0 @@
1
/**
 * One author entry of an amendment, as listed in `AmendementResult.auteurs`.
 * Every field is nullable.
 *
 * NOTE(review): `group_politique_code` is spelled `group_` while the sibling
 * fields use `groupe_`; this looks like a typo, but renaming it would change
 * the public shape, so confirm with consumers first.
 */
- export interface AmendementAuteurRow {
2
- group_politique_code: string | null;
3
- groupe_politique_id: number | null;
4
- groupe_politique_libelle: string | null;
5
- groupe_politique_libelle_court: string | null;
6
- homonyme: string | null;
7
- matricule: string | null;
8
- nom: string | null;
9
- prenom: string | null;
10
- qualite: string | null;
11
- rang: string | null;
12
- }
13
/**
 * One amendment as yielded by `findAllAmendements`.
 *
 * Fields are listed alphabetically (with minor exceptions). Non-nullable
 * fields are `auteur_est_gouvernement`, `auteurs`, `etat`, `id`, `nature`,
 * `numero`, `session`, `session_libelle` and `url`; every other field may be
 * `null`.
 */
- export interface AmendementResult {
14
- accepte_gouvernement: string | null;
15
- alinea: number | null;
16
- auteur_est_gouvernement: boolean;
17
- au_nom_de_commission: string | null;
18
- au_nom_de_groupe_politique: string | null;
19
- auteurs: AmendementAuteurRow[];
20
- avis_commission: string | null;
21
- avis_gouvernement: string | null;
22
- code_commission: string | null;
23
- date_depot: string | null;
24
- discussion_commune_id: number | null;
25
- dispositif: string | null;
26
- etat: string;
27
- etat_texte: string | null;
28
- etat_texte_libelle: string | null;
29
- id: number;
30
- identique_id: number | null;
31
- intitule_texte: string | null;
32
- lecture: string | null;
33
- motion_libelle: string | null;
34
- nature: string;
35
- nature_texte: string | null;
36
- nature_texte_libelle: string | null;
37
- numero: string;
38
- numero_absolu: string | null;
39
- numero_adoption_texte: string | null;
40
- numero_texte: string | null;
41
- objet: string | null;
42
- observations: string | null;
43
- observations_additionnelles: string | null;
44
- ordre: number | null;
45
- parent_id: number | null;
46
- revision: string | null;
47
- session: string;
48
- session_libelle: string;
49
- signet_dossier_legislatif: string | null;
50
- sort: string | null;
51
- scrutin_num: string | null;
52
- subdivision_commission_id: number | null;
53
- subdivision_dupliquee: string | null;
54
- subdivision_libelle: string | null;
55
- subdivision_libelle_court: string | null;
56
- subdivision_mere_id: number | null;
57
- subdivision_position_discussion: number | null;
58
- subdivision_position_texte: number | null;
59
- subdivision_signet: string | null;
60
- subdivision_type: string | null;
61
- texte_id: number | null;
62
- type_rectification: string | null;
63
- type_session: string | null;
64
- type_texte: string | null;
65
- url: string;
66
- }
67
/**
 * Asynchronously iterate over amendments, one `AmendementResult` at a time.
 *
 * @param fromSession Optional session filter; when omitted, no session filter
 *   is applied.
 */
- export declare function findAllAmendements(fromSession?: number): AsyncGenerator<AmendementResult, void, unknown>;
@@ -1,150 +0,0 @@
1
- import { streamUnsafeQuery } from "../databases_postgres.js";
2
/**
 * Build the SQL statement that lists amendments with their text, subdivision,
 * opinions, outcome, public URL, related vote number and authors.
 *
 * @param {number} [fromSession] - When provided, only amendments whose session
 *   year (`ses.ann`) is greater than or equal to this value are selected; the
 *   value is passed as the `$1` bind parameter, never interpolated.
 * @returns {{ params: Array<number>, query: string }} Bind parameters and SQL text.
 */
function buildFindAllAmendementsQuery(fromSession) {
    const hasSessionFilter = fromSession !== undefined;
    const params = hasSessionFilter ? [fromSession] : [];
    const whereSession = hasSessionFilter ? "where ses.ann >= $1" : "";
    // NOTE(review): authors are aggregated `order by author_rows.rang`, and
    // `rang` is `amdsen.rng::text`; if `rng` is numeric this sorts
    // lexicographically ("10" before "2") — confirm against the schema.
    const query = `
select
ses.ann::text as session,
ses.lil as session_libelle,
typses.lib as type_session,
txt_ameli.doslegsignet as signet_dossier_legislatif,
nat.libcourt as nature_texte,
nat.lib as nature_texte_libelle,
txt_ameli.numabs as numero_texte,
txt_ameli.numado as numero_adoption_texte,
txt_ameli.int as intitule_texte,
etatxt.lic as etat_texte,
etatxt.lib as etat_texte_libelle,
etatxt.txttyp as type_texte,
lec_ameli.lib as lecture,
case amd.typ
when 'A' then 'Amendement'
when 'M' then 'Motion'
when 'S' then 'Sous-amendement'
else ''
end as nature,
amd.id,
amd.amdperid as parent_id,
amd.ideid as identique_id,
amd.discomid as discussion_commune_id,
amd.num as numero,
amd.numabs as numero_absolu,
amd.ord as ordre,
amd.accgou as accepte_gouvernement,
amd.txtid as texte_id,
sub.lib as subdivision_libelle,
sub.lic as subdivision_libelle_court,
sub.pos as subdivision_position_texte,
sub.posder as subdivision_position_discussion,
sub.merid as subdivision_mere_id,
sub.sig as subdivision_signet,
sub.comdelid as subdivision_commission_id,
sub.dupl as subdivision_dupliquee,
typsub.lib as subdivision_type,
amd.alinea,
amd.obs as observations,
amd.mot as observations_additionnelles,
to_char(amd.datdep, 'YYYY-MM-DD') as date_depot,
amd.dis as dispositif,
amd.obj as objet,
typrect.lib as type_rectification,
mot.lib as motion_libelle,
case amd.etaid
when 7 then 'Diffusé'
when 8 then 'Retiré avant réunion ou séance'
when 9 then 'Examiné en commission ou séance'
when 10 then 'Irrecevable'
when 11 then 'Irrecevable'
else ''
end as etat,
avicom.lib as avis_commission,
avigvt.lib as avis_gouvernement,
coalesce(sor.lib, irr.libirr) as sort,
amd.rev as revision,
(
case
when amd.num like '%COM%' then 'https://www.senat.fr/amendements/commissions/'
else 'https://www.senat.fr/amendements/'
end ||
ses.ann::text || '-' ||
(ses.ann + 1)::text || '/' ||
txt_ameli.numabs || '/Amdt_' ||
amd.num || '.html'
) as url,
grppol_ameli.lilcou as au_nom_de_groupe_politique,
rtrim(com_ameli.lil) as au_nom_de_commission,
rtrim(com_ameli.cod) as code_commission,
(cab.entid is not null) as auteur_est_gouvernement,
(
select amescr.scrnum::text
from senat.dosleg_amescr as amescr
left join senat.dosleg_scr as scr
on amescr.scrnum = scr.scrnum
and amescr.sesann = scr.sesann
left join senat.dosleg_date_seance as date_seance on scr.code = date_seance.code
where amescr.amescrnum = amd.num
and amescr.sesann = ses.ann
and date_seance.lecidt = texte.lecassidt
limit 1
) as scrutin_num,
(
select coalesce(json_agg(author_rows order by author_rows.rang nulls last), '[]'::json)
from (
select
amdsen.prenomuse as prenom,
amdsen.hom as homonyme,
amdsen.nomuse as nom,
amdsen.qua as qualite,
amdsen.rng::text as rang,
sen_ameli.mat as matricule,
amdsen.grpid as groupe_politique_id,
grppol_ameli.cod as group_politique_code,
grppol_ameli.libcou as groupe_politique_libelle_court,
grppol_ameli.lilcou as groupe_politique_libelle
from senat.ameli_amdsen as amdsen
left join senat.ameli_sen_ameli as sen_ameli on amdsen.senid = sen_ameli.entid
left join senat.ameli_grppol_ameli as grppol_ameli on amdsen.grpid = grppol_ameli.entid
where amdsen.amdid = amd.id
order by amdsen.rng asc
) as author_rows
) as auteurs
from senat.ameli_amd as amd
left join senat.ameli_sub as sub on amd.subid = sub.id
left join senat.ameli_typsub as typsub on sub.typid = typsub.id
left join senat.ameli_typrect as typrect on amd.typrectid = typrect.id
left join senat.ameli_txt_ameli as txt_ameli on amd.txtid = txt_ameli.id
left join senat.ameli_etatxt as etatxt on txt_ameli.txtetaid = etatxt.id
left join senat.ameli_ses as ses on txt_ameli.sesdepid = ses.id
left join senat.ameli_typses as typses on typses.id = ses.typid
left join senat.ameli_nat as nat on txt_ameli.natid = nat.id
left join senat.ameli_lec_ameli as lec_ameli on txt_ameli.lecid = lec_ameli.id
left join senat.dosleg_texte as texte
on ses.ann = texte.sesann
and txt_ameli.numabs = texte.texnum
left join senat.dosleg_lecass as lecass on texte.lecassidt = lecass.lecassidt
left join senat.ameli_mot as mot on amd.motid = mot.id
left join senat.ameli_avicom as avicom on amd.avcid = avicom.id
left join senat.ameli_avigvt as avigvt on amd.avgid = avigvt.id
left join senat.ameli_sor as sor on amd.sorid = sor.id
left join senat.ameli_irr as irr on amd.irrid = irr.id
left join senat.ameli_grppol_ameli as grppol_ameli on amd.nomentid = grppol_ameli.entid
left join senat.ameli_com_ameli as com_ameli on amd.nomentid = com_ameli.entid
left join senat.ameli_cab as cab on amd.nomentid = cab.entid
${whereSession}
`;
    return { params, query };
}
142
/**
 * Stream amendments from the database, one row at a time.
 *
 * Each row is yielded as returned by the query, except that a missing
 * (`null`/`undefined`) `auteurs` value is replaced with an empty array.
 *
 * @param {number} [fromSession] - Optional session filter forwarded to
 *   `buildFindAllAmendementsQuery`.
 * @yields {object} One amendment row.
 */
export async function* findAllAmendements(fromSession) {
    const statement = buildFindAllAmendementsQuery(fromSession);
    const rows = streamUnsafeQuery(statement.query, statement.params);
    for await (const row of rows) {
        const auteurs = row.auteurs ?? [];
        yield { ...row, auteurs };
    }
}
@@ -1,19 +0,0 @@
1
- import * as cheerio from "cheerio";
2
- import type { AnyNode } from "domhandler";
3
- import { CompteRendu } from "../types/compte_rendu.js";
4
- import { Reunion } from "../types/agenda.js";
5
/**
 * Return the text of paragraph `$p` once the speaker header has been taken out.
 *
 * @param $ Cheerio API bound to the document containing `$p`.
 * @param $p Paragraph selection to read.
 */
- export declare function getRemainingTextAfterSpeakerHeader($: cheerio.CheerioAPI, $p: cheerio.Cheerio<AnyNode>): string;
6
/**
 * One section of a day, as returned by `extractDayH3Sections`:
 * a `title`, the `$start` selection where the section begins, and an
 * optional `time` string.
 */
- export type DaySection = {
7
- title: string;
8
- $start: cheerio.Cheerio<AnyNode>;
9
- time?: string;
10
- };
11
/** Return a cleaned-up version of the title string `t`. */
- export declare function cleanTitle(t: string): string;
12
/**
 * List the sections found in the document for the day `dateISO`.
 *
 * @param $ Cheerio API bound to the document.
 * @param dateISO Day to look for.
 */
- export declare function extractDayH3Sections($: cheerio.CheerioAPI, dateISO: string): DaySection[];
13
/**
 * Build a `CompteRendu` from one `DaySection` of a commission report
 * document, or return `null` when none can be produced.
 *
 * @param $ Cheerio API bound to the document.
 * @param htmlFilePath Path of the HTML file the document was loaded from.
 * @param opts Day (`dateISO`), short hour (`hourShort`), optional `organe`,
 *   the `section` to parse and, optionally, the `matched` agenda `Reunion`.
 */
- export declare function parseCommissionCRSectionFromDom($: cheerio.CheerioAPI, htmlFilePath: string, opts: {
14
- dateISO: string;
15
- hourShort: string | null;
16
- organe?: string | null;
17
- section: DaySection;
18
- matched?: Reunion;
19
- }): CompteRendu | null;
@@ -1,269 +0,0 @@
1
- import * as cheerio from "cheerio";
2
- import path from "path";
3
- import { makeReunionUid } from "../utils/reunion_parsing.js";
4
- import { norm } from "../utils/string_cleaning.js";
5
- import { frDateToISO, hourShortToStartTime } from "../utils/date.js";
6
- import { toCRDate } from "./util.js";
7
- const PARA_h3_SEL = "p.sh_justify, p.sh_center, p.sh_marge, p[align], li, h3";
8
/**
 * Locate the `<h2>` heading that opens the day `targetISO`.
 *
 * A heading is recognised by a French weekday name followed by a date; the
 * date part is converted with `frDateToISO` and compared to `targetISO`.
 *
 * @param {Function} $ - Cheerio API bound to the document.
 * @param {string} targetISO - Day to look for.
 * @returns {object} Selection of the first matching heading, or an empty
 *   selection when no heading matches.
 */
function findDayRoot($, targetISO) {
    const weekdayThenDate = /(?:Lundi|Mardi|Mercredi|Jeudi|Vendredi|Samedi|Dimanche)\s+(.+)$/i;
    let $dayRoot = $();
    for (const heading of $("h2").toArray()) {
        const found = norm($(heading).text()).match(weekdayThenDate);
        const iso = found ? frDateToISO(found[1]) : undefined;
        // Keep the first match only; later headings are still scanned.
        if ($dayRoot.length === 0 && iso === targetISO) {
            $dayRoot = $(heading);
        }
    }
    return $dayRoot;
}
19
/**
 * Replace no-break space (U+00A0), narrow no-break space (U+202F) and thin
 * space (U+2009) with a regular space.
 *
 * @param {string} s - Text to normalize.
 * @returns {string} The text with those characters turned into " ".
 */
function normalizeSpaces(s) {
    const pieces = s.split(/[\u00A0\u202F\u2009]/);
    return pieces.join(" ");
}
22
/**
 * Strip the introductory punctuation left at the start of a speech: optional
 * whitespace, an optional ".", ":" or ";", then one or more dashes (hyphen,
 * en dash or em dash) each possibly followed by whitespace.
 *
 * Text that does not start with such a dash sequence is returned unchanged.
 *
 * @param {string} s - Text to clean.
 * @returns {string} The text without the leading punctuation.
 */
function stripIntroPunct(s) {
    const leadingDashIntro = /^[\s]*[.:;]?\s*(?:[–—-]\s*)+/u;
    return s.replace(leadingDashIntro, "");
}
25
/**
 * Collect the bold elements that open a paragraph (the speaker header).
 *
 * Child nodes are scanned in order. Blank text nodes are skipped; `<strong>`
 * and `<b>` elements are collected; an `<a>` with `<strong>`/`<b>` children
 * contributes those children. The scan stops at the first non-blank text node
 * or at any other element. Nodes that are neither text nor tag are ignored.
 *
 * @param {Function} $ - Cheerio API bound to the document.
 * @param {object} $clone - Paragraph selection whose children are scanned.
 * @returns {Array<object>} The collected element nodes, in document order.
 */
function collectLeadingHeaderStrongEls($, $clone) {
    const headerEls = [];
    for (const child of $clone.contents().toArray()) {
        if (child.type === "text") {
            if (norm(child.data || "")) {
                break;
            }
        }
        else if (child.type === "tag") {
            const $child = $(child);
            if ($child.is("strong, b")) {
                headerEls.push(child);
            }
            else if ($child.is("a") && $child.children("strong, b").length) {
                $child.children("strong, b").each((_, el) => {
                    headerEls.push(el);
                });
            }
            else {
                break;
            }
        }
    }
    return headerEls;
}
51
/**
 * Return the text of a paragraph without the speaker's name and without the
 * introductory punctuation that follows it.
 *
 * Works on a clone of `$p`, so the document itself is not modified.
 *
 * @param {Function} $ - Cheerio API bound to the document.
 * @param {object} $p - Paragraph selection.
 * @returns {string} The remaining text, cleaned with `norm`.
 */
export function getRemainingTextAfterSpeakerHeader($, $p) {
    const $copy = $p.clone();
    // Drop the leading bold elements (speaker header).
    collectLeadingHeaderStrongEls($, $copy).forEach((el) => {
        $(el).remove();
    });
    // Reduce what is left to plain text, then normalize spaces and intro punctuation.
    const plainText = cheerio.load($copy.html() || "").text();
    const withoutIntro = stripIntroPunct(normalizeSpaces(plainText));
    return norm(withoutIntro || "");
}
65
/**
 * Turn the paragraphs of a report section into an ordered list of "points".
 *
 * Three kinds of points are produced, numbered by `ordre_absolu_seance`:
 *  - a title point for each `<h3>`;
 *  - an italic information point for purely italic paragraphs, and for
 *    paragraphs without bold text met while no speaker is active;
 *  - a speech point per speaker turn: a paragraph opening with a bold
 *    "M." / "Mme" name starts a new turn, and following paragraphs are
 *    appended to it, separated by "<br/><br/>".
 *
 * Paragraphs inside tables and paragraphs of three characters or fewer are
 * ignored. A buffered turn is only emitted (and reset) when it has a speaker
 * and non-blank text.
 *
 * @param {Function} $ - Cheerio API bound to the document.
 * @param {Array<object>} paras - Paragraph selections, in document order.
 * @returns {Array<object>} The points.
 */
function buildPointsFromParagraphs($, paras) {
    const points = [];
    let ordreAbsoluSeance = 0;
    // Turn of the speaker currently talking, buffered until the next flush.
    let speaker = null;
    let speakerQualite = "";
    let speakerTexte = "";

    const tidySpeakerName = (raw) => raw
        .normalize("NFKC")
        .replace(/\s+/g, " ")
        .replace(/[.:]\s*$/, "")
        .trim();
    const tidyQualite = (raw) => raw
        .normalize("NFKC")
        .replace(/\s+/g, " ")
        .replace(/^\s*,\s*|\s+$/g, "")
        .replace(/[\s\u00A0]*[.,;:–—-]+$/u, "")
        .trim();
    const nextOrdre = () => {
        ordreAbsoluSeance += 1;
        return String(ordreAbsoluSeance);
    };
    // Emit the buffered turn, if any; an incomplete turn is left buffered.
    const flushSpeaker = () => {
        const texte = speakerTexte.trim();
        if (!speaker || !texte) {
            return;
        }
        points.push({
            code_grammaire: "PAROLE_GENERIQUE",
            roledebat: /\bprésident(e)?\b/i.test(speakerQualite) ? "président" : "",
            ordre_absolu_seance: nextOrdre(),
            orateurs: { orateur: { nom: speaker, id: "", qualite: speakerQualite || "" } },
            texte: { _: texte },
        });
        speaker = null;
        speakerQualite = "";
        speakerTexte = "";
    };
    const pushStandalone = (payload) => {
        points.push({ ...payload, ordre_absolu_seance: nextOrdre() });
    };

    for (const $p of paras) {
        if ($p.closest("table").length) {
            continue;
        }
        const text = norm(($p.text() || "").replace(/\u00a0/g, " ").trim());
        if (!text || text.length <= 3) {
            continue;
        }

        const tagName = ($p.prop("tagName") || "").toString().toLowerCase();
        if (tagName === "h3") {
            flushSpeaker();
            pushStandalone({
                code_style: "Titre",
                code_grammaire: "TITRE_TEXTE_DISCUSSION",
                texte: { _: text },
            });
            continue;
        }

        // Purely italic: every span/i/em is italic and nothing precedes the first one.
        const html = ($p.html() || "").trim();
        const $italics = $p.find("i, em, span[style*='italic']");
        const firstItalicHtml = $italics.length ? $($italics[0]).prop("outerHTML") || "" : "";
        const htmlBeforeItalic = firstItalicHtml ? html.split(firstItalicHtml)[0].trim() : "";
        const isPureItalic = $italics.length > 0 &&
            $italics.length === $p.find("span,i,em").length &&
            htmlBeforeItalic === "";

        // The bold text is expected to read "<name>, <qualité>".
        const boldText = norm($p
            .find("strong, b")
            .map((_, el) => $(el).text() || "")
            .get()
            .join(""));
        const [rawName, rawQualite] = boldText.split(/\s*,\s+/, 2);
        const name = rawName ? tidySpeakerName(rawName) : "";
        const qualite = rawQualite ? tidyQualite(rawQualite) : "";
        const looksLikeName = name.length > 3 && /^(M\.|Mme)[\s\u00A0\u202F]+/i.test(name);
        const opensParagraph = name && text.startsWith(name);

        if (looksLikeName && opensParagraph && name !== speaker) {
            flushSpeaker();
            speaker = name;
            speakerQualite = qualite;
            speakerTexte = getRemainingTextAfterSpeakerHeader($, $p);
            continue;
        }
        if (isPureItalic || (!boldText && !speaker && text)) {
            flushSpeaker();
            pushStandalone({
                code_style: "Info Italiques",
                code_grammaire: "PAROLE_GENERIQUE",
                texte: { _: `<i>${text}</i>` },
            });
            continue;
        }
        // Same speaker keeps talking: append to the buffered turn.
        if (speaker) {
            const continuation = getRemainingTextAfterSpeakerHeader($, $p);
            speakerTexte += (speakerTexte ? "<br/><br/>" : "") + continuation;
        }
    }
    flushSpeaker();
    return points;
}
165
- const TIME_RE = /(?:\b[àa]\s*)?(\d{1,2})\s*(?:h|heures?)\s*(?:([0-5]\d))?/i;
166
/**
 * Collapse every run of whitespace into a single space and trim the result.
 *
 * @param {string} t - Raw title; a falsy value is treated as "".
 * @returns {string} The cleaned title.
 */
export function cleanTitle(t) {
    const raw = t || "";
    const singleSpaced = raw.replace(/\s+/g, " ");
    return singleSpaced.trim();
}
169
/**
 * Extract the first time mentioned in a text as "HH:mm".
 *
 * The text is matched against `TIME_RE` after space normalization. Minutes
 * default to "00" when absent.
 *
 * @param {string} text - Text to scan.
 * @returns {string | undefined} The time, or `undefined` when nothing matches
 *   or the hour is outside 0-23.
 */
function parseTimeToHHmm(text) {
    const found = normalizeSpaces(text).match(TIME_RE);
    if (!found) {
        return undefined;
    }
    const [, hourDigits, minuteDigits] = found;
    const hh = hourDigits?.padStart(2, "0");
    const mm = (minuteDigits ?? "00").padStart(2, "0");
    const hour = Number(hh);
    return hour >= 0 && hour <= 23 ? `${hh}:${mm}` : undefined;
}
180
/**
 * Look for a time in the elements that immediately precede a section heading.
 *
 * Up to three previous siblings are inspected, nearest first. For each one,
 * its whole text is tried, then the text of its first `<i>`/`<em>` descendant.
 *
 * @param {Function} $ - Cheerio API (unused here, kept for signature parity).
 * @param {object} $h3 - Heading selection.
 * @returns {string | undefined} The first time found ("HH:mm"), if any.
 */
function findNearbyTime($, $h3) {
    let $sibling = $h3.prev();
    let remaining = 3;
    while (remaining > 0 && $sibling.length) {
        const found = parseTimeToHHmm($sibling.text()) ||
            parseTimeToHHmm($sibling.find("i, em").first().text());
        if (found) {
            return found;
        }
        remaining -= 1;
        $sibling = $sibling.prev();
    }
    return undefined;
}
192
/**
 * List the `<h3>` sections of a given day.
 *
 * The day starts at the heading found by `findDayRoot` and runs until the
 * next `<h2>`. Both `<h3>` siblings and `<h3>` nested inside siblings are
 * taken into account; headings with an empty title are skipped.
 *
 * @param {Function} $ - Cheerio API bound to the document.
 * @param {string} dateISO - Day to extract.
 * @returns {Array<{ title: string, $start: object, time: (string|undefined) }>}
 *   The sections, or an empty array when the day is not found.
 */
export function extractDayH3Sections($, dateISO) {
    const $dayRoot = findDayRoot($, dateISO);
    if ($dayRoot.length === 0) {
        return [];
    }
    const $range = $dayRoot.nextUntil("h2");
    const headings = $range.filter("h3").add($range.find("h3")).toArray();
    const sections = [];
    for (const heading of headings) {
        const $h3 = $(heading);
        const title = cleanTitle($h3.text());
        if (title) {
            sections.push({ title, $start: $h3, time: findNearbyTime($, $h3) });
        }
    }
    return sections;
}
209
/**
 * Build the report ("compte rendu") of one section of a commission page.
 *
 * The section content is every paragraph-like element between the section
 * heading and the next `<h2>`/`<h3>`. It is turned into points with
 * `buildPointsFromParagraphs`. A section is rejected when it yields fewer
 * than four points or contains no speech with a speaker.
 *
 * The session is the year of `dateISO` for months October to December, and
 * the previous year otherwise.
 *
 * @param {Function} $ - Cheerio API bound to the document.
 * @param {string} htmlFilePath - Path of the source file (used for its base name).
 * @param {object} opts
 * @param {string} opts.dateISO - Day of the meeting.
 * @param {string | null} opts.hourShort - Short hour, used when no agenda meeting matched.
 * @param {string | null} [opts.organe] - Body holding the meeting.
 * @param {object} opts.section - Section to parse; `section.$start` is its heading.
 * @param {object} [opts.matched] - Agenda meeting matched to this section, if any.
 * @returns {object | null} The report, or `null` when the day is not found,
 *   the section is too poor, or an error occurred (errors are logged).
 */
export function parseCommissionCRSectionFromDom($, htmlFilePath, opts) {
    try {
        const { dateISO, hourShort, organe, section, matched } = opts;
        // A matched meeting may lack a uid and have no events: fall back to the
        // hour instead of throwing on `events[0].id` and losing the section.
        const seanceRef = matched?.uid ??
            makeReunionUid(dateISO, "COM", matched?.events?.[0]?.id ?? hourShort ?? "", organe ?? undefined);
        const uid = seanceRef.replace(/^RU/, "CRC");
        const dateSeance = toCRDate(dateISO, matched?.startTime ?? hourShortToStartTime(hourShort));
        const $dayRoot = findDayRoot($, dateISO);
        if ($dayRoot.length === 0) {
            console.warn(`[COM-CR][parse] day root not found for ${dateISO} in ${path.basename(htmlFilePath)}`);
            return null;
        }
        const paras = [];
        // Start right after the heading so the title itself is not a paragraph.
        let $cursor = section.$start.next();
        while ($cursor.length && !$cursor.is("h2") && !$cursor.is("h3")) {
            if ($cursor.is(PARA_h3_SEL)) {
                paras.push($cursor);
            }
            else {
                // Container element: take the paragraph-like elements it holds.
                $cursor.find(PARA_h3_SEL).each((_, p) => {
                    paras.push($(p));
                });
            }
            $cursor = $cursor.next();
        }
        const points = buildPointsFromParagraphs($, paras);
        if (points.length < 4 || !points.some((pt) => pt.code_grammaire === "PAROLE_GENERIQUE" && pt.orateurs)) {
            console.warn(`[COM-CR][parse] Insufficient points or no interventions found for a section in ${path.basename(htmlFilePath)}`);
            return null;
        }
        const session = dateISO.slice(5, 7) >= "10" ? `${dateISO.slice(0, 4)}` : `${Number(dateISO.slice(0, 4)) - 1}`;
        const contenu = {
            quantiemes: { journee: dateISO, session },
            point: points,
        };
        const metadonnees = {
            dateSeance,
            dateSeanceJour: dateISO,
            numSeanceJour: "",
            numSeance: "",
            typeAssemblee: "SN",
            legislature: "",
            session,
            nomFichierJo: path.basename(htmlFilePath),
            validite: "non-certifie",
            etat: "definitif",
            diffusion: "publique",
            version: "1",
            environnement: "prod",
            heureGeneration: new Date(),
        };
        return { uid, seanceRef, sessionRef: session, metadonnees, contenu };
    }
    catch (e) {
        console.error(`[COM-CR][parse] error section file=${path.basename(htmlFilePath)}:`, e);
        return null;
    }
}
@@ -1,39 +0,0 @@
1
/**
 * Author of an intervention, as referenced by `DebatInterventionRow.auteur`.
 * Every field is nullable.
 */
- export interface DebatAuteurRow {
2
- code: string | null;
3
- matricule: string | null;
4
- nom: string | null;
5
- prenom: string | null;
6
- }
7
/**
 * One intervention of a debate section (element of
 * `DebatSectionRow.interventions`). `auteur_code` is always present, while
 * the resolved `auteur` may be `null`.
 */
- export interface DebatInterventionRow {
8
- analyse: string | null;
9
- auteur: DebatAuteurRow | null;
10
- auteur_code: string;
11
- fonction_intervenant: string | null;
12
- id: string | null;
13
- url: string | null;
14
- }
15
/**
 * One section of a debate, used for both `DebatResult.sections` and
 * `DebatResult.sections_divers`. `id`, `lecture_id`, `libelle`, `numero` and
 * `url` are optional; `interventions` is always an array.
 */
- export interface DebatSectionRow {
16
- categorie: string | null;
17
- id?: string | null;
18
- interventions: DebatInterventionRow[];
19
- lecture_id?: string | null;
20
- libelle?: string | null;
21
- numero?: string | null;
22
- objet: string | null;
23
- type: string | null;
24
- url?: string | null;
25
- }
26
/** Reference to a reading, by identifier (element of `DebatResult.lectures`). */
- export interface DebatLectureRow {
27
- id: string;
28
- }
29
/**
 * One debate as yielded by `findAll`: scalar metadata (all nullable) plus the
 * related readings and two lists of sections.
 */
- export interface DebatResult {
30
- date_seance: string | null;
31
- etat_synchronisation: string | null;
32
- id: string | null;
33
- lectures: DebatLectureRow[];
34
- numero: string | null;
35
- sections: DebatSectionRow[];
36
- sections_divers: DebatSectionRow[];
37
- url: string | null;
38
- }
39
/** Asynchronously iterate over debates, one `DebatResult` at a time. */
- export declare function findAll(): AsyncGenerator<DebatResult, void, unknown>;